import numpy as np
from matplotlib import pyplot as plt
from sklearn import datasets

# Ridge Regression function
# reference : https://en.wikipedia.org/wiki/Ridge_regression
def ridge_cost_function(x: np.ndarray, y: np.ndarray, theta: np.ndarray, alpha: float) -> float:
    """
    Compute the Ridge regression cost function with L2 regularization.

    J(θ) = (1/2m) * Σ (y_i - hθ(x))^2 + (a/2) * Σ θ_j^2 (for j=1 to n)

    Where:
       - J(θ) is the cost function we aim to minimize
       - m is the number of training examples
       - hθ(x) = X * θ (prediction)
       - y_i is the actual target value for example i
       - a is the regularization parameter

    @param X: The feature matrix (m x n)
    @param y: The target vector (m,)
    @param theta: The parameters (weights) of the model (n,)
    @param alpha: The regularization parameter

    @returns: The computed cost value
    """
    m = len(y)
    predictions = np.dot(x, theta)
    cost = (1 / (2 * m)) * np.sum((predictions - y) ** 2) + \
        (alpha / 2) * np.sum(theta[1:] ** 2)

    return cost

def ridge_gradient_descent(x: np.ndarray, y: np.ndarray, theta: np.ndarray, alpha: float, learning_rate: float, max_iterations: int) -> np.ndarray:
    """
    Perform gradient descent to minimize the
    cost function and fit the Ridge regression model.

    @param X: The feature matrix (m x n)
    @param y: The target vector (m,)
    @param theta: The initial parameters (weights) of the model (n,)
    @param alpha: The regularization parameter
    @param learning_rate: The learning rate for gradient descent
    @param max_iterations: The number of iterations for gradient descent

    @returns: The optimized parameters (weights) of the model (n,)
    """
    m = len(y)

    for iteration in range(max_iterations):
        predictions = np.dot(x, theta)
        error = predictions - y

        # calculate the gradient
        gradient = (1 / m) * np.dot(x.T, error)
        gradient[1:] += (alpha / m) * theta[1:]
        theta -= learning_rate * gradient

        if iteration % 100 == 0:
            cost = ridge_cost_function(x, y, theta, alpha)
            print(f"Iteration {iteration}, Cost: {cost}")

    return theta

if __name__ == "__main__":
    import doctest
    doctest.testmod()

    # Load California Housing dataset
    california_housing = datasets.fetch_california_housing()
    x = california_housing.data[:, :2]  # 2 features for simplicity
    y = california_housing.target
    x = (x - np.mean(x, axis=0)) / np.std(x, axis=0)

    # Add a bias column (intercept) to X
    x = np.c_[np.ones(x.shape[0]), x]

    # Initialize parameters (theta)
    theta_initial = np.zeros(x.shape[1])

    # Set hyperparameters
    alpha = 0.1
    learning_rate = 0.01
    max_iterations = 1000

    optimized_theta = ridge_gradient_descent(x, y, theta_initial, alpha, learning_rate, max_iterations)
    print(f"Optimized theta: {optimized_theta}")

    # Prediction
    def predict(x, theta):
        return np.dot(x, theta)

    y_pred = predict(x, optimized_theta)

    # Plotting the results (here we visualize predicted vs actual values)
    plt.figure(figsize=(10, 6))
    plt.scatter(y, y_pred, color='b', label='Predictions vs Actual')
    plt.plot([min(y), max(y)], [min(y), max(y)], color='r', label='Perfect Fit')
    plt.xlabel("Actual values")
    plt.ylabel("Predicted values")
    plt.title("Ridge Regression: Actual vs Predicted Values")
    plt.legend()
    plt.show()