 """
-this is code for forecasting
-but I modified it and used it for safety checker of data
-for ex: you have an online shop and for some reason some data are
-missing (the amount of data that u expected are not supposed to be)
-    then we can use it
-*ps : 1. ofc we can use normal statistic method but in this case
-         the data is quite absurd and only a little^^
-      2. ofc u can use this and modified it for forecasting purpose
-         for the next 3 months sales or something,
-         u can just adjust it for ur own purpose
-"""
+This code forecasts user activity and checks data safety in an online-shop
+context. It predicts total users from historical data, checks whether the
+current data falls within a safe range, and evaluates the performance of
+several machine-learning models.

-from warnings import simplefilter
+Usage:
+- Load your data from a CSV file via a command-line argument.
+- Ensure the CSV has columns for total users, events, and dates.
+"""
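+# A minimal sketch of the expected CSV layout (hypothetical column names;
+# the code assumes total users, events held, and a parseable date per row):
+#
+#     total_user,total_events,date
+#     180,5,2020-01-01
+#     192,7,2020-01-02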

+import logging
 import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
 from sklearn.preprocessing import Normalizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
 from sklearn.svm import SVR
+from xgboost import XGBRegressor
 from statsmodels.tsa.statespace.sarimax import SARIMAX
-
-
-def linear_regression_prediction(
-    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
-) -> float:
-    """
-    First method: linear regression
-    input : training data (date, total_user, total_event) in list of float
-    output : list of total user prediction in float
-    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
-    >>> bool(abs(n - 5.0) < 1e-6)  # Checking precision because of floating point errors
-    True
-    """
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+from warnings import simplefilter
+import joblib
+import argparse
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# Hyperparameters
+CONFIG = {
+    'svr': {'kernel': 'rbf', 'C': 1, 'gamma': 0.1, 'epsilon': 0.1},
+    'random_forest': {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2},
+    'xgboost': {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3},
+    'sarimax_order': (1, 2, 1),
+    'sarimax_seasonal_order': (1, 1, 1, 7),  # Weekly seasonality
+}
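+# The values above are illustrative defaults and can be tuned per dataset, e.g.
+#     CONFIG['svr']['C'] = 10  # trade more fit for less regularisation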
+
+def load_data(file_path: str) -> pd.DataFrame:
+    """Load data from a CSV file."""
+    try:
+        data = pd.read_csv(file_path)
+        logging.info("Data loaded successfully.")
+        return data
+    except FileNotFoundError:
+        logging.error("The file was not found.")
+        raise
+    except Exception as e:
+        logging.error(f"Error loading data: {e}")
+        raise
+
+def normalize_data(data: pd.DataFrame) -> np.ndarray:
+    """Normalize the numeric input columns (the raw date string is excluded)."""
+    numeric = data.select_dtypes(include=[np.number])
+    return Normalizer().fit_transform(numeric.values)
+
+def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
+    """Create new features from the existing data."""
+    data['day_of_week'] = pd.to_datetime(data['date']).dt.dayofweek
+    # isocalendar().week is a nullable UInt32; cast so it stays plainly numeric
+    data['week_of_year'] = pd.to_datetime(data['date']).dt.isocalendar().week.astype(int)
+    return data
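+# e.g. pd.to_datetime(pd.Series(['2020-01-06'])).dt.dayofweek yields 0 (Monday)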
+
+def train_test_split_data(normalize_df: np.ndarray) -> tuple:
+    """Split the normalized data into training and test sets."""
+    # Assumed column order after normalization: total users, events, day feature
+    total_user = normalize_df[:, 0].tolist()
+    total_match = normalize_df[:, 1].tolist()
+    total_date = normalize_df[:, 2].tolist()
+
+    x = normalize_df[:, [1, 2]].tolist()
+    # Keep chronological order (shuffle=False) so the positional slices of
+    # total_user/total_match below stay aligned with x_train/x_test
+    x_train, x_test = train_test_split(x, test_size=0.2, shuffle=False)
+
+    train_user = total_user[:len(x_train)]
+    test_user = total_user[len(x_train):]
+
+    return (
+        x_train,
+        x_test,
+        train_user,
+        test_user,
+        total_match[:len(x_train)],
+        total_match[len(x_train):],
+        total_date,
+    )
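+# With 10 rows and test_size=0.2, len(x_train) == 8 and len(x_test) == 2;
+# the user and match slices follow the same 8/2 split.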
+
+def linear_regression_prediction(train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list) -> float:
+    """Predict total users using linear regression."""
     x = np.array([[1, item, train_mtch[i]] for i, item in enumerate(train_dt)])
     y = np.array(train_usr)
-    beta = np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x)), x.transpose()), y)
-    return abs(beta[0] + test_dt[0] * beta[1] + test_mtch[0] + beta[2])
-
+
+    # Compute coefficients with the normal equation: beta = (X^T X)^-1 X^T y
+    beta = np.linalg.inv(x.T @ x) @ x.T @ y
+    return float(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
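+# Worked check of the normal equation above: for rows
+#     [1, 2, 3], [1, 3, 1], [1, 4, 2], [1, 5, 4] and y = [5, 3, 4, 6]
+# the coefficients solve to beta = [2, 0, 1], so
+#     linear_regression_prediction([2, 3, 4, 5], [5, 3, 4, 6], [3, 1, 2, 4], [2], [2])
+# returns 2 + 2 * 0 + 2 * 1 = 4.0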

 def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
-    """
-    second method: Sarimax
-    sarimax is a statistic method which using previous input
-    and learn its pattern to predict future data
-    input : training data (total_user, with exog data = total_event) in list of float
-    output : list of total user prediction in float
-    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
-    6.6666671111109626
-    """
-    # Suppress the User Warning raised by SARIMAX due to insufficient observations
+    """Predict total users using SARIMAX."""
+    # Suppress the UserWarning raised by SARIMAX on short series
     simplefilter("ignore", UserWarning)
-    order = (1, 2, 1)
-    seasonal_order = (1, 1, 1, 7)
-    model = SARIMAX(
-        train_user, exog=train_match, order=order, seasonal_order=seasonal_order
-    )
+
+    model = SARIMAX(
+        train_user,
+        exog=train_match,
+        order=CONFIG['sarimax_order'],
+        seasonal_order=CONFIG['sarimax_seasonal_order'],
+    )
     model_fit = model.fit(disp=False, maxiter=600, method="nm")
-    result = model_fit.predict(1, len(test_match), exog=[test_match])
+
+    # Out-of-sample forecast; exog must be shaped (n_periods, n_exog)
+    result = model_fit.predict(
+        start=len(train_user),
+        end=len(train_user) + len(test_match) - 1,
+        exog=np.array(test_match).reshape(-1, 1),
+    )
     return float(result[0])

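+# Illustrative call (the fitted value depends on the optimiser run):
+#     sarimax_predictor([4, 2, 6, 8], [3, 1, 2, 4], [2])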
-
 def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
-    """
-    Third method: Support vector regressor
-    svr is quite the same with svm(support vector machine)
-    it uses the same principles as the SVM for classification,
-    with only a few minor differences and the only different is that
-    it suits better for regression purpose
-    input : training data (date, total_user, total_event) in list of float
-    where x = list of set (date and total event)
-    output : list of total user prediction in float
-    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
-    1.634932078116079
-    """
-    regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
+    """Predict total users using Support Vector Regressor."""
+    regressor = SVR(**CONFIG['svr'])
     regressor.fit(x_train, train_user)
     y_pred = regressor.predict(x_test)
     return float(y_pred[0])

+def random_forest_regressor(x_train: list, x_test: list, train_user: list) -> float:
+    """Predict total users using Random Forest Regressor."""
+    model = RandomForestRegressor(**CONFIG['random_forest'])
+    model.fit(x_train, train_user)
+    return float(model.predict(x_test)[0])

-def interquartile_range_checker(train_user: list) -> float:
-    """
-    Optional method: interquatile range
-    input : list of total user in float
-    output : low limit of input in float
-    this method can be used to check whether some data is outlier or not
-    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
-    2.8
-    """
-    train_user.sort()
-    q1 = np.percentile(train_user, 25)
-    q3 = np.percentile(train_user, 75)
-    iqr = q3 - q1
-    low_lim = q1 - (iqr * 0.1)
-    return float(low_lim)
-
+def xgboost_regressor(x_train: list, x_test: list, train_user: list) -> float:
+    """Predict total users using XGBoost Regressor."""
+    model = XGBRegressor(**CONFIG['xgboost'])
+    model.fit(x_train, train_user)
+    return float(model.predict(x_test)[0])
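+# The three sklearn-style regressors share one call shape, e.g. (toy data):
+#     support_vector_regressor([[5, 2], [1, 5], [6, 2]], [[3, 2]], [2, 1, 4])
+#     random_forest_regressor([[5, 2], [1, 5], [6, 2]], [[3, 2]], [2, 1, 4])
+#     xgboost_regressor([[5, 2], [1, 5], [6, 2]], [[3, 2]], [2, 1, 4])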

 def data_safety_checker(list_vote: list, actual_result: float) -> bool:
-    """
-    Used to review all the votes (list result prediction)
-    and compare it to the actual result.
-    input : list of predictions
-    output : print whether it's safe or not
-    >>> data_safety_checker([2, 3, 4], 5.0)
-    False
-    """
+    """Check if predictions are within a safe range compared to the actual result."""
     safe = 0
     not_safe = 0

-    if not isinstance(actual_result, float):
-        raise TypeError("Actual result should be float. Value passed is a list")
+    if not isinstance(actual_result, (float, int)):
+        logging.error("Actual result should be float or int.")
+        raise TypeError("Actual result should be float or int.")

-    for i in list_vote:
-        if i > actual_result:
-            safe = not_safe + 1
-        elif abs(abs(i) - abs(actual_result)) <= 0.1:
+    for prediction in list_vote:
+        if prediction > actual_result:
+            safe += 1
+        elif abs(prediction - actual_result) <= 0.1:
             safe += 1
         else:
             not_safe += 1
     return safe > not_safe

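+# e.g. data_safety_checker([2, 3, 4], 5.0) -> False: no vote exceeds or comes
+# within 0.1 of the actual value, so not_safe outnumbers safe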
+def evaluate_predictions(actual: list, predictions: list):
+    """Evaluate model predictions using various metrics."""
+    mse = mean_squared_error(actual, predictions)
+    mae = mean_absolute_error(actual, predictions)
+    # R² is undefined when the actual values have zero variance
+    r2 = r2_score(actual, predictions) if len(set(actual)) > 1 else float('nan')
+    logging.info(f"Evaluation Metrics:\nMSE: {mse}\nMAE: {mae}\nR²: {r2}")
+
+def plot_results(res_vote: list, actual: float):
+    """Plot the predicted vs actual results."""
+    plt.figure(figsize=(10, 5))
+    plt.plot(range(len(res_vote)), res_vote, label='Predictions', marker='o')
+    plt.axhline(y=actual, color='r', linestyle='-', label='Actual Result')
+    plt.title('Predicted vs Actual User Count')
+    plt.xlabel('Model')
+    plt.ylabel('User Count')
+    plt.xticks(
+        range(len(res_vote)),
+        ['Linear Regression', 'SARIMAX', 'SVR', 'Random Forest', 'XGBoost'],
+    )
+    plt.legend()
+    plt.show()
+
+def save_model(model, filename):
+    """Save the trained model to a file."""
+    joblib.dump(model, filename)
+    logging.info(f"Model saved to {filename}.")
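+# A saved model can later be restored with joblib.load, e.g.
+#     restored = joblib.load("svr_model.joblib")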

 if __name__ == "__main__":
-    """
-    data column = total user in a day, how much online event held in one day,
-    what day is that(sunday-saturday)
-    """
-    data_input_df = pd.read_csv("ex_data.csv")
-
-    # start normalization
-    normalize_df = Normalizer().fit_transform(data_input_df.values)
-    # split data
-    total_date = normalize_df[:, 2].tolist()
-    total_user = normalize_df[:, 0].tolist()
-    total_match = normalize_df[:, 1].tolist()
+    # Argument parser for command-line execution
+    parser = argparse.ArgumentParser(description='User Activity Forecasting and Safety Checker')
+    parser.add_argument('file_path', type=str, help='Path to the CSV file containing the data')
+    args = parser.parse_args()

-    # for svr (input variable = total date and total match)
-    x = normalize_df[:, [1, 2]].tolist()
-    x_train = x[: len(x) - 1]
-    x_test = x[len(x) - 1 :]
+    # Load and process data
+    data_input_df = load_data(args.file_path)
+
+    # Feature engineering
+    data_input_df = feature_engineering(data_input_df)

-    # for linear regression & sarimax
-    train_date = total_date[: len(total_date) - 1]
-    train_user = total_user[: len(total_user) - 1]
-    train_match = total_match[: len(total_match) - 1]
+    # Normalize data
+    normalize_df = normalize_data(data_input_df)

-    test_date = total_date[len(total_date) - 1 :]
-    test_user = total_user[len(total_user) - 1 :]
-    test_match = total_match[len(total_match) - 1 :]
+    # Split data into relevant lists
+    x_train, x_test, train_user, test_user, train_match, test_match, total_date = (
+        train_test_split_data(normalize_df)
+    )

-    # voting system with forecasting
+    # Voting system with forecasting
     res_vote = [
-        linear_regression_prediction(
-            train_date, train_user, train_match, test_date, test_match
-        ),
+        linear_regression_prediction(
+            total_date[:len(train_user)],
+            train_user,
+            train_match,
+            total_date[len(train_user):len(train_user) + len(test_user)],
+            test_match,
+        ),
         sarimax_predictor(train_user, train_match, test_match),
         support_vector_regressor(x_train, x_test, train_user),
+        random_forest_regressor(x_train, x_test, train_user),
+        xgboost_regressor(x_train, x_test, train_user),
     ]

-    # check the safety of today's data
-    not_str = "" if data_safety_checker(res_vote, test_user[0]) else "not "
-    print(f"Today's data is {not_str}safe.")
+    # Evaluate predictions: every model votes on the same next point, so each
+    # vote is compared against the first held-out value
+    evaluate_predictions([test_user[0]] * len(res_vote), res_vote)
+
+    # Check the safety of today's data
+    is_safe = data_safety_checker(res_vote, test_user[0])
+    not_str = "" if is_safe else "not "
+    logging.info(f"Today's data is {not_str}safe.")
+
+    # Plot the results
+    plot_results(res_vote, test_user[0])
+
+    # Save fitted models for future use (fit fresh estimators here; the
+    # prediction helpers above do not return their trained models)
+    save_model(SVR(**CONFIG['svr']).fit(x_train, train_user), "svr_model.joblib")
+    save_model(RandomForestRegressor(**CONFIG['random_forest']).fit(x_train, train_user), "rf_model.joblib")
+    save_model(XGBRegressor(**CONFIG['xgboost']).fit(x_train, train_user), "xgb_model.joblib")
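+
+    # Example invocation (assuming this file is saved as forecast_safety.py):
+    #     python forecast_safety.py ex_data.csv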