Skip to content

Commit 900fc0e

Browse files
committed
stashing changes
1 parent 6e2a80e commit 900fc0e

File tree

4 files changed

+198
-16
lines changed

4 files changed

+198
-16
lines changed
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
3+
https://en.wikipedia.org/wiki/Binomial_options_pricing_model
4+
"""
5+
6+
from math import exp, sqrt
7+
import numpy as np
8+
9+
class BinomialOptionsPrice:
    """
    Price a European call or put option with the binomial (CRR) tree model.

    https://en.wikipedia.org/wiki/Binomial_options_pricing_model

    Input:
        expiration_time: int This is the time until the option expires
        stock_price: float This is the stock price today
        strike_price: float This is the strike price at the time the option expires
        interest_rate: float The interest rate for the option
        sigma: float The volatility of the stock
        tree_height: int The number of levels of the tree
        option_type: str Either 'call' or 'put'. Any other value raises an error
    Output:
        generate_other_rows() returns the full tree of option values; the
        discounted option price is at index [-1, 0].
    """
    def __init__(self,
                 expiration_time: int,
                 stock_price: float,
                 strike_price: float,
                 interest_rate: float,
                 sigma: float,
                 tree_height: int,
                 option_type: str) -> None:
        self.tree_height = tree_height
        self.sigma = sigma
        self.interest_rate = interest_rate
        self.strike_price = strike_price
        self.stock_price = stock_price
        self.expiration_time = expiration_time
        self.option_type = option_type

        # Stability guard: a time step that is too large pushes the
        # risk-neutral probability outside [0, 1].
        if self.expiration_time / self.tree_height >= self.sigma **2 / self.interest_rate**2:
            raise ValueError("Time step too big. This will cause the probability to be outside [0, 1]")

    def calculate_up(self) -> float:
        """Up factor u = exp(sigma * sqrt(dt)); always >= 1."""
        return exp(self.sigma * sqrt(self.expiration_time / self.tree_height))

    def calculate_down(self) -> float:
        """Down factor d = 1 / u (CRR recombining tree)."""
        return 1 / self.calculate_up()

    def calculate_rate_delta_t(self) -> float:
        """Per-step discount factor exp(-r * dt)."""
        return exp(-self.interest_rate * self.expiration_time / self.tree_height)

    def calculate_probability(self) -> float:
        """Risk-neutral probability p = (exp(r * dt) - d) / (u - d)."""
        up = self.calculate_up()
        down = self.calculate_down()
        growth = exp(self.interest_rate * self.expiration_time / self.tree_height)
        return (growth - down) / (up - down)

    def calculate_s_n(self, placement: int) -> float:
        """Stock price at leaf `placement`, where `placement` counts down-moves.

        With d = 1/u the leaf price is S0 * u**(N - 2 * placement); placement 0
        is the highest leaf, placement N the lowest.
        """
        return self.stock_price * self.calculate_up() ** (self.tree_height - 2 * placement)

    def calculate_leaf_values(self) -> np.ndarray:
        """Option payoff at every leaf of the tree.

        Raises ValueError when option_type is neither 'call' nor 'put'.
        """
        if self.option_type not in ('call', 'put'):
            raise ValueError("Option type must be either 'call' or 'put'")
        _leaf_list = []
        for i in range(self.tree_height + 1):
            if self.option_type == 'call':
                _leaf_list.append(max(self.calculate_s_n(i) - self.strike_price, 0))
            else:
                _leaf_list.append(max(self.strike_price - self.calculate_s_n(i), 0))
        return np.array(_leaf_list)

    def calculate_node_values(self, previous_nodes: np.ndarray,
                              current_tree_level: int) -> np.ndarray:
        """One backward-induction step.

        Computes the discounted expected option values at `current_tree_level`
        (which has current_tree_level + 1 nodes) from the values one level
        below in `previous_nodes`.
        """
        p = self.calculate_probability()
        discount = self.calculate_rate_delta_t()
        return np.array([
            discount * (p * previous_nodes[j] + (1 - p) * previous_nodes[j + 1])
            for j in range(current_tree_level + 1)
        ])

    def generate_other_rows(self) -> np.ndarray:
        """Build the whole tree of option values by backward induction.

        Row 0 holds the leaf payoffs; each subsequent row is one level closer
        to the root (shorter levels are zero-padded on the right). The option
        price is the single root value at [-1, 0].
        """
        rows = np.zeros((self.tree_height + 1, self.tree_height + 1))
        values = self.calculate_leaf_values()
        rows[0, :] = values
        for level in range(self.tree_height - 1, -1, -1):
            values = self.calculate_node_values(values, level)
            rows[self.tree_height - level, : level + 1] = values
        return rows
86+
87+
88+
89+
if __name__ == "__main__":
    import doctest

    doctest.testmod()
    # Demo: price an option on a 5-level tree and print the value tree.
    example = BinomialOptionsPrice(1, 4.5, 5.6, 0.23, 1, 5, 'call')
    print(example.generate_other_rows())
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
""" Often called "simple linear regression", univariate linear regression is a
2+
method for determining the straight line that best fits the data
3+
4+
Since simple linear regression only uses a single predictor variable,
5+
it is of the form:
6+
7+
y_pred = beta * x + alpha
8+
9+
The values for alpha and beta are calculated as follows.
10+
11+
beta = correlation_x_y * (sample_standard_deviation_y / sample_standard_deviation_x)
12+
alpha = mean(y) - beta * mean(x).
13+
14+
15+
"""
16+
def get_data():
    """Return a small example (x, y) dataset for demonstrating the regression."""
    x = [1.0, 2.0, 3.0, 4.0, 5.0]
    y = [2.1, 3.9, 6.2, 8.0, 9.8]
    return x, y


def sample_standard_deviation(x_or_y_values):
    """Sample standard deviation (n - 1 in the denominator).

    Raises ValueError for fewer than two values, where the statistic is
    undefined.
    """
    n = len(x_or_y_values)
    if n < 2:
        raise ValueError("At least two values are required")
    mean = sum(x_or_y_values) / n
    return (sum((value - mean) ** 2 for value in x_or_y_values) / (n - 1)) ** 0.5


def sample_correlation_coefficient(x_val, y_val):
    """Pearson sample correlation coefficient of the paired samples.

    Raises ValueError when the samples differ in length or hold fewer than
    two points.
    """
    n = len(x_val)
    if n != len(y_val) or n < 2:
        raise ValueError("x and y must be the same length, with at least two points")
    mean_x = sum(x_val) / n
    mean_y = sum(y_val) / n
    # Sample covariance, matching the (n - 1) convention used above.
    covariance = sum((a - mean_x) * (b - mean_y) for a, b in zip(x_val, y_val)) / (n - 1)
    return covariance / (sample_standard_deviation(x_val) * sample_standard_deviation(y_val))


def simple_linear_regression(x, y):
    """Fit y_pred = beta * x + alpha and return (alpha, beta).

    beta = correlation_x_y * (std_y / std_x); alpha = mean(y) - beta * mean(x).
    """
    beta = sample_correlation_coefficient(x, y) * (
        sample_standard_deviation(y) / sample_standard_deviation(x)
    )
    alpha = sum(y) / len(y) - beta * (sum(x) / len(x))
    return alpha, beta

maths/cooks_distance.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Cook's Distance is used to estimate the influence of a data point in
33
least squares regression.
44
5-
Cook's Distance removes each data point and measures the effect of removing the
6-
data point.
5+
Cook's Distance removes each data point one at a time, and measures the effect. Large
6+
Cook's Distance values for an individual data point indicates that data point should
7+
be further investigated. A cutoff for what is large needs to be decided upon, and 1 is
8+
often used.
79
810
The algorithm works as follows:
911
For each data point in the regression, remove the point from the set
@@ -14,10 +16,13 @@
1416
1517
https://en.wikipedia.org/wiki/Cook's_distance
1618
"""
17-
from machine_learning.loss_functions.mean_squared_error import mean_squared_error
19+
1820
import numpy as np
21+
from machine_learning.loss_functions import mean_squared_error
22+
from sklearn import datasets
23+
from sklearn.linear_model import LinearRegression
1924

20-
def calculate_cooks_distance(y_observed: array, y_fitted: array, rank: int) -> array:
25+
def calculate_cooks_distance(y_observed: np.ndarray, y_fitted: np.ndarray, rank: int) -> np.ndarray:
    """Calculate Cook's Distance for every observation.

    Input:
        y_observed: numpy array of observed y values
        y_fitted: numpy array of fitted y values from linear regression model
        rank: int representing the number of coefficients
    Output:
        cooks_distance: numpy array of Cook's distance for each y value.
    Raises:
        TypeError: when rank is not an int.
        ValueError: when the arrays differ in length or are empty.
    """
    # Validate inputs before doing any arithmetic on them.
    if not isinstance(rank, int):
        msg = f"Rank is an integer representing the number of predictors. Input: {rank}"
        raise TypeError(msg)

    if len(y_observed) != len(y_fitted) or len(y_observed) == 0:
        msg = (
            "The arrays of observed and fitted values must be equal length and "
            f"non-empty. Currently observed = {len(y_observed)} and fitted = {len(y_fitted)}"
        )
        raise ValueError(msg)

    # Mean squared error computed inline (standard definition), so the
    # function has no project-local dependency.
    mse = np.mean((y_observed - y_fitted) ** 2)
    y_difference_squared = (y_observed - y_fitted) ** 2

    # This is leave one out, so summing over all and then individually subtracting.
    summed_difference = np.sum(y_difference_squared)
    return (summed_difference - y_difference_squared) / (rank * mse)
4951

52+
if __name__ == "__main__":
    import doctest

    doctest.testmod(verbose=True)
    # Self-contained demonstration: the previous draft referenced an
    # undefined `df` and `main()`. A large distance flags an influential
    # observation worth investigating.
    y_observed = np.array([1.0, 2.0, 3.0, 4.0])
    y_fitted = np.array([1.1, 1.9, 3.2, 3.8])
    print(calculate_cooks_distance(y_observed, y_fitted, 1))
60+

maths/cooks_distance_BAK.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""
2+
Cook's Distance is used to estimate the influence of a data point in
3+
least squares regression.
4+
5+
Cook's Distance removes each data point and measures the effect of removing the
6+
data point.
7+
8+
The algorithm works as follows:
9+
For each data point in the regression, remove the point from the set
10+
and calculate the effect of removing that point.
11+
12+
D_i = (sum over all other points(y_actual - y_observed)^2) / (rank * MSE^2)
13+
14+
15+
https://en.wikipedia.org/wiki/Cook's_distance
16+
"""
17+
from machine_learning.loss_functions.mean_squared_error import mean_squared_error
18+
import numpy as np
19+
20+
def calculate_cooks_distance(y_observed: np.ndarray, y_fitted: np.ndarray, rank: int) -> np.ndarray:
    """Calculate Cook's Distance for every observation.

    Input:
        y_observed: numpy array of observed y values
        y_fitted: numpy array of fitted y values from linear regression model
        rank: int representing the number of coefficients
    Output:
        cooks_distance: numpy array of Cook's distance for each y value.
    Raises:
        TypeError: when rank is not an int.
        ValueError: when the arrays differ in length or are empty.
    """
    # Validate inputs before doing any arithmetic on them.
    if not isinstance(rank, int):
        msg = f"Rank is an integer representing the number of predictors. Input: {rank}"
        raise TypeError(msg)

    if len(y_observed) != len(y_fitted) or len(y_observed) == 0:
        msg = (
            "The arrays of observed and fitted values must be equal length and "
            f"non-empty. Currently observed = {len(y_observed)} and fitted = {len(y_fitted)}"
        )
        raise ValueError(msg)

    # Mean squared error computed inline (standard definition), so the
    # function has no project-local dependency.
    _mse = np.mean((y_observed - y_fitted) ** 2)
    _y_difference_squared = (y_observed - y_fitted) ** 2

    # Leave-one-out: sum over all points, then subtract each point's own term.
    _summed_difference = np.sum(_y_difference_squared)
    return (_summed_difference - _y_difference_squared) / (rank * _mse)
49+

0 commit comments

Comments
 (0)