From db13bdeeb54e18ec07e96e65c282839cc0f36132 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Wed, 10 May 2023 14:57:10 +0100
Subject: [PATCH 01/11] splitting create_sample_input, adapting tests

---
 pymc_experimental/model_builder.py            | 150 +++++++++++-------
 pymc_experimental/tests/test_model_builder.py |  61 +++----
 2 files changed, 115 insertions(+), 96 deletions(-)

diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 48f168d8..4914ea15 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -38,7 +38,7 @@ class ModelBuilder:
 
     def __init__(
         self,
-        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]],
+        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
         model_config: Dict = None,
         sampler_config: Dict = None,
     ):
@@ -47,12 +47,12 @@ def __init__(
 
         Parameters
         ----------
-        model_config : Dictionary, optional
-            dictionary of parameters that initialise model configuration. Generated by the user defined create_sample_input method.
-        data : Dictionary, required
+        data : Dictionary, optional
             It is the data we need to train the model on.
+        model_config : Dictionary, optional
+            dictionary of parameters that initialise model configuration. Class-default defined by the user default_model_config method.
         sampler_config : Dictionary, optional
-            dictionary of parameters that initialise sampler configuration. Generated by the user defined create_sample_input method.
+            dictionary of parameters that initialise sampler configuration. Class-default defined by the user default_sampler_config method.
         Examples
         --------
         >>> class LinearModel(ModelBuilder):
@@ -61,15 +61,16 @@ def __init__(
         """
 
         if sampler_config is None:
-            sampler_config = {}
+            sampler_config = self.default_sampler_config
+        self.sampler_config = sampler_config
         if model_config is None:
-            model_config = {}
+            model_config = self.default_model_config
         self.model_config = model_config  # parameters for priors etc.
-        self.sampler_config = sampler_config  # parameters for sampling
-        self.data = data
-        self.idata = (
-            None  # inference data object placeholder, idata is generated during build execution
-        )
+        self.data = self.generate_model_data(data=data)
+        self.model = None  # Set by build_model
+        self.output_var = None  # Set by build_model
+        self.idata = None  # idata is generated during fitting
+        self.is_fitted_ = False
 
     @abstractmethod
     def _data_setter(
@@ -98,57 +99,87 @@ def _data_setter(
 
         raise NotImplementedError
 
-    @staticmethod
+    @property
     @abstractmethod
-    def create_sample_input():
+    def default_model_config(self) -> Dict:
         """
-        Needs to be implemented by the user in the child class.
-        Returns examples for data, model_config, sampler_config.
-        This is useful for understanding the required
-        data structures for the user model.
-
+        Returns a class default config dict for model builder if no model_config is provided on class initialization
+        Useful for understanding structure of required model_config to allow its customization by users
         Examples
         --------
-        >>> @classmethod
-        >>> def create_sample_input(cls):
-        >>>    x = np.linspace(start=1, stop=50, num=100)
-        >>>    y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 +  np.random.rand(100)*6.4
-        >>>    data = pd.DataFrame({'input': x, 'output': y})
-
-        >>>    model_config = {
-        >>>          'a' : {
-        >>>              'loc': 7,
-        >>>              'scale' : 3
-        >>>           },
-        >>>          'b' : {
-        >>>              'loc': 3,
-        >>>              'scale': 5
-        >>>          }
-        >>>          'obs_error': 2
-
-        >>>    sampler_config = {
-        >>>       'draws': 1_000,
-        >>>       'tune': 1_000,
-        >>>       'chains': 1,
-        >>>       'target_accept': 0.95,
-        >>>    }
-        >>>    return data, model_config, sampler_config
+        >>>     @classmethod
+        >>>     def default_model_config(self):
+        >>>         Return {
+        >>>             'a' : {
+        >>>                 'loc': 7,
+        >>>                 'scale' : 3
+        >>>             },
+        >>>             'b' : {
+        >>>                 'loc': 3,
+        >>>                 'scale': 5
+        >>>             }
+        >>>              'obs_error': 2
+        >>>         }
 
         Returns
         -------
-        data : dict
-            The data we want to train the model on
         model_config : dict
-            A set of parameters for predictor distributions that allow to save and recreate the model
+            A set of default parameters for predictor distributions that allow to save and recreate the model.
+        """
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def default_sampler_config(self) -> Dict:
+        """
+        Returns a class default sampler dict for model builder if no sampler_config is provided on class initialization
+        Useful for understanding structure of required sampler_config to allow its customization by users
+        Examples
+        --------
+        >>>     @classmethod
+        >>>     def default_model_config(self):
+        >>>         Return {
+        >>>             'draws': 1_000,
+        >>>             'tune': 1_000,
+        >>>             'chains': 1,
+        >>>             'target_accept': 0.95,
+        >>>         }
+
+        Returns
+        -------
         sampler_config : dict
-            A set of default settings for sampler config, customization of contents of sampler_config allows introducing new settings to the sampler
+            A set of default settings for used by model in fit process.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def generate_model_data(
+        cls, data: Union[np.ndarray, pd.DataFrame, pd.Series] = None
+    ) -> pd.DataFrame:
+        """
+        Returns a default dataset for a class, can be used as a hint to data formatting required for the class
+
+        Examples
+        --------
+        >>>     @classmethod
+        >>>     def generate_model_data(self):
+        >>>         x = np.linspace(start=1, stop=50, num=100)
+        >>>         y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 +  np.random.rand(100)*6.4
+        >>>         data = pd.DataFrame({'input': x, 'output': y})
+
+        Returns
+        -------
+        data : pd.DataFrame
+            The data we want to train the model on.
+
         """
         raise NotImplementedError
 
     @abstractmethod
     def build_model(
-        model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]],
-        model_config: Dict[str, Union[int, float, Dict]],
+        model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        model_config: Dict[str, Union[int, float, Dict]] = None,
     ) -> None:
         """
         Needs to be implemented by the user in the child class.
@@ -160,14 +191,14 @@ def build_model(
         model_data : dict
             Preformated data that is going to be used in the model. For efficiency reasons it should contain only the necesary data columns,
             not entire available dataset since it's going to be encoded into data used to recreate the model.
+            If not provided uses data from self.data
         model_config : dict
             Dictionary where keys are strings representing names of parameters of the model, values are dictionaries of parameters
-            needed for creating model parameters
+            needed for creating model parameters. If not provided uses data from self.model_config
 
         See Also
         --------
-        create_model_input : Creates all required input for the model builder based on the data given. Shows the examples of data structures on which the specific
-        inherited version of model builder operates on.
+        default_model_config : returns default model config
 
         Returns:
         ----------
@@ -311,17 +342,18 @@ def fit(
         # If a new data was provided, assign it to the model
         if data is not None:
             self.data = data
-        self.model_data, model_config, sampler_config = self.create_sample_input(data=self.data)
+        self.model_data = self.generate_model_data(data=self.data)
         if self.model_config is None:
-            self.model_config = model_config
+            self.model_config = self.default_model_config
         if self.sampler_config is None:
-            self.sampler_config = sampler_config
-        self.build_model(self.model_data, self.model_config)
+            self.sampler_config = self.default_sampler_config
+        if self.model is None:
+            self.build_model(self.model_data, self.model_config)
 
-        sampler_config["progressbar"] = progressbar
-        sampler_config["random_seed"] = random_seed
+        self.sampler_config["progressbar"] = progressbar
+        self.sampler_config["random_seed"] = random_seed
 
-        self.idata = self.sample_model(**sampler_config)
+        self.idata = self.sample_model(**self.sampler_config)
         self.idata.add_groups(fit_data=self.data.to_xarray())
         return self.idata
 
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
index 37eb0dab..7dfdbba4 100644
--- a/pymc_experimental/tests/test_model_builder.py
+++ b/pymc_experimental/tests/test_model_builder.py
@@ -15,6 +15,7 @@
 import hashlib
 import sys
 import tempfile
+from typing import Dict
 
 import numpy as np
 import pandas as pd
@@ -28,11 +29,12 @@ class test_ModelBuilder(ModelBuilder):
     _model_type = "LinearModel"
     version = "0.1"
 
-    def build_model(self, model_data, model_config):
+    def build_model(self, model_data=None, model_config=None):
+
         with pm.Model() as self.model:
-            if model_data is not None:
-                x = pm.MutableData("x", model_data["input"].values)
-                y_data = pm.MutableData("y_data", model_data["output"].values)
+
+            x = pm.MutableData("x", model_data["input"].values)
+            y_data = pm.MutableData("y_data", model_data["output"].values)
 
             # prior parameters
             a_loc = model_config["a"]["loc"]
@@ -47,8 +49,7 @@ def build_model(self, model_data, model_config):
             obs_error = pm.HalfNormal("σ_model_fmc", obs_error)
 
             # observed data
-            if model_data is not None:
-                y_model = pm.Normal("y_model", a + b * x, obs_error, shape=x.shape, observed=y_data)
+            y_model = pm.Normal("y_model", a + b * x, obs_error, shape=x.shape, observed=y_data)
 
     def _data_setter(self, data: pd.DataFrame):
         with self.model:
@@ -61,33 +62,34 @@ def _serializable_model_config(self):
         return self.model_config
 
     @classmethod
-    def create_sample_input(self, data=None):
+    def generate_model_data(cls, data=None):
         x = np.linspace(start=0, stop=1, num=100)
         y = 5 * x + 3
         y = y + np.random.normal(0, 1, len(x))
         data = pd.DataFrame({"input": x, "output": y})
+        return data
 
-        model_config = {
+    @property
+    def default_model_config(self) -> Dict:
+        return {
             "a": {"loc": 0, "scale": 10},
             "b": {"loc": 0, "scale": 10},
             "obs_error": 2,
         }
 
-        sampler_config = {
+    @property
+    def default_sampler_config(self) -> Dict:
+        return {
             "draws": 1_000,
             "tune": 1_000,
             "chains": 3,
             "target_accept": 0.95,
         }
 
-        return data, model_config, sampler_config
-
     @staticmethod
     def initial_build_and_fit(check_idata=True) -> ModelBuilder:
-        data, model_config, sampler_config = test_ModelBuilder.create_sample_input()
-        model_builder = test_ModelBuilder(
-            model_config=model_config, sampler_config=sampler_config, data=data
-        )
+        data = test_ModelBuilder.generate_model_data()
+        model_builder = test_ModelBuilder()
         model_builder.idata = model_builder.fit(data=data)
         if check_idata:
             assert model_builder.idata is not None
@@ -96,34 +98,19 @@ def initial_build_and_fit(check_idata=True) -> ModelBuilder:
 
 
 def test_save_without_fit_raises_runtime_error():
-    data, model_config, sampler_config = test_ModelBuilder.create_sample_input()
-    model_builder = test_ModelBuilder(
-        model_config=model_config, sampler_config=sampler_config, data=data
-    )
+    model_builder = test_ModelBuilder()
     with pytest.raises(RuntimeError):
         model_builder.save("saved_model")
 
 
 def test_empty_sampler_config_fit():
-    data, model_config, sampler_config = test_ModelBuilder.create_sample_input()
     sampler_config = {}
-    model_builder = test_ModelBuilder(
-        model_config=model_config, sampler_config=sampler_config, data=data
-    )
-    model_builder.idata = model_builder.fit(data=data)
+    model_builder = test_ModelBuilder(sampler_config=sampler_config)
+    model_builder.idata = model_builder.fit()
     assert model_builder.idata is not None
     assert "posterior" in model_builder.idata.groups()
 
 
-def test_empty_model_config_fit():
-    data, model_config, sampler_config = test_ModelBuilder.create_sample_input()
-    model_config = {}
-    model_builder = test_ModelBuilder(
-        model_config=model_config, sampler_config=sampler_config, data=data
-    )
-    assert model_builder.model_config == {}
-
-
 def test_fit():
     model = test_ModelBuilder.initial_build_and_fit()
     x_pred = np.random.uniform(low=0, high=1, size=100)
@@ -177,11 +164,11 @@ def test_predict_posterior(combined):
 
 
 def test_id():
-    data, model_config, sampler_config = test_ModelBuilder.create_sample_input()
-    model = test_ModelBuilder(model_config=model_config, sampler_config=sampler_config, data=data)
-
+    model = test_ModelBuilder()
     expected_id = hashlib.sha256(
-        str(model_config.values()).encode() + model.version.encode() + model._model_type.encode()
+        str(model.model_config.values()).encode()
+        + model.version.encode()
+        + model._model_type.encode()
     ).hexdigest()[:16]
 
     assert model.id == expected_id

From c2720526248fc1c70595a532236ce6ac3d22b647 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Wed, 10 May 2023 17:01:54 +0100
Subject: [PATCH 02/11] step 2: removing duplications and adapting
 BayesianEstimator

---
 .../bayesian_estimator_linearmodel.py         | 25 ++++++-------------
 pymc_experimental/model_builder.py            |  3 +--
 .../test_bayesian_estimator_linearmodel.py    |  4 +--
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
index 4feaf244..49a1a07c 100644
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/bayesian_estimator_linearmodel.py
@@ -60,6 +60,7 @@ class BayesianEstimator(ModelBuilder):
 
     def __init__(
         self,
+        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
         model_config: Dict = None,
         sampler_config: Dict = None,
     ):
@@ -75,16 +76,9 @@ def __init__(
         """
         if model_config is None:
             model_config = self.default_model_config
-        self.model_config = model_config
-
         if sampler_config is None:
             sampler_config = self.default_sampler_config
-        self.sampler_config = sampler_config
-
-        self.model = None  # Set by build_model
-        self.output_var = None  # Set by build_model
-        self.idata = None  # idata is generated during fitting
-        self.is_fitted_ = False
+        super().__init__(data=data, model_config=model_config, sampler_config=sampler_config)
 
     @property
     @abstractmethod
@@ -103,16 +97,11 @@ def _validate_data(self, X, y=None):
             return check_array(X, accept_sparse=False)
 
     @abstractmethod
-    def build_model(self) -> None:
-        """
-        Build the PYMC model. The model is built with placeholder data.
-        Actual data will be set by _data_setter when fitting or evaluating the model.
-        Data array size can change but number of dimensions must stay the same.
+    def build_model(
+        model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        model_config: Dict[str, Union[int, float, Dict]] = None,
+    ) -> None:
 
-        Returns:
-        ----------
-        None
-        """
         raise NotImplementedError
 
     @abstractmethod
@@ -462,7 +451,7 @@ def _data_setter(self, X, y=None):
                 pm.set_data({"y_data": y.squeeze()})
 
     @classmethod
-    def create_sample_input(cls, nsamples=100):
+    def generate_model_data(cls, nsamples=100, data=None):
         x = np.linspace(start=0, stop=1, num=nsamples)
         y = 5 * x + 3
         y = y + np.random.normal(0, 1, len(x))
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 4914ea15..12fe9f65 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -182,11 +182,10 @@ def build_model(
         model_config: Dict[str, Union[int, float, Dict]] = None,
     ) -> None:
         """
-        Needs to be implemented by the user in the child class.
         Creates an instance of pm.Model based on provided model_data and model_config, and
         attaches it to self.
 
-        Required Parameters
+        Parameters
         ----------
         model_data : dict
             Preformated data that is going to be used in the model. For efficiency reasons it should contain only the necesary data columns,
diff --git a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py
index 9fc9b575..38000d46 100644
--- a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py
@@ -34,7 +34,7 @@
 
 @pytest.fixture(scope="module")
 def sample_input():
-    x, y = LinearModel.create_sample_input()
+    x, y = LinearModel.generate_model_data()
     return x, y
 
 
@@ -53,7 +53,7 @@ def fitted_linear_model_instance(sample_input):
 
 
 def test_save_without_fit_raises_runtime_error():
-    x, y = LinearModel.create_sample_input()
+    x, y = LinearModel.generate_model_data()
     test_model = LinearModel()
     with pytest.raises(RuntimeError):
         test_model.save("saved_model")

From 539659d4a23f6b0da104904781fc5449cdd34218 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Thu, 11 May 2023 13:15:16 +0100
Subject: [PATCH 03/11] adding and updating doctests

fixing indentation issue, adding np.bool8 DeprecationWarning filter to pytest.ini

adding forgotten decorator to generate_model_data

making doctests read more like a user manual, renaming example model for consistency

changing YourClass to MyClass for consistency
---
 .../bayesian_estimator_linearmodel.py         |  43 +++++
 pymc_experimental/model_builder.py            | 159 +++++++++++++-----
 pytest.ini                                    |   2 +-
 3 files changed, 162 insertions(+), 42 deletions(-)

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
index 49a1a07c..06d7f624 100644
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/bayesian_estimator_linearmodel.py
@@ -414,6 +414,25 @@ def default_sampler_config(self):
         }
 
     def build_model(self):
+        """
+        Build the PyMC model.
+
+        Returns
+        -------
+        None
+
+        Examples
+        --------
+        >>> self.build_model()
+        >>> assert self.model is not None
+        >>> assert isinstance(self.model, pm.Model)
+        >>> assert "intercept" in self.model.named_vars
+        >>> assert "slope" in self.model.named_vars
+        >>> assert "σ_model_fmc" in self.model.named_vars
+        >>> assert "y_model" in self.model.named_vars
+        >>> assert "y_hat" in self.model.named_vars
+        >>> assert self.output_var == "y_hat"
+        """
         cfg = self.model_config
 
         # The model is built with placeholder data.
@@ -452,6 +471,30 @@ def _data_setter(self, X, y=None):
 
     @classmethod
     def generate_model_data(cls, nsamples=100, data=None):
+        """
+        Generate model data for linear regression.
+
+        Parameters
+        ----------
+        nsamples : int, optional
+            The number of samples to generate. Default is 100.
+        data : np.ndarray, optional
+            An optional data array to add noise to.
+
+        Returns
+        -------
+        tuple
+            A tuple of two np.ndarrays representing the feature matrix and target vector, respectively.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> x, y = cls.generate_model_data()
+        >>> assert isinstance(x, np.ndarray)
+        >>> assert isinstance(y, np.ndarray)
+        >>> assert x.shape == (100, 1)
+        >>> assert y.shape == (100,)
+        """
         x = np.linspace(start=0, stop=1, num=nsamples)
         y = 5 * x + 3
         y = y + np.random.normal(0, 1, len(x))
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 12fe9f65..28dc2dbe 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -55,9 +55,9 @@ def __init__(
             dictionary of parameters that initialise sampler configuration. Class-default defined by the user default_sampler_config method.
         Examples
         --------
-        >>> class LinearModel(ModelBuilder):
+        >>> class MyModel(ModelBuilder):
         >>>     ...
-        >>> model = LinearModel(model_config, sampler_config)
+        >>> model = MyModel(model_config, sampler_config)
         """
 
         if sampler_config is None:
@@ -159,6 +159,12 @@ def generate_model_data(
     ) -> pd.DataFrame:
         """
         Returns a default dataset for a class, can be used as a hint to data formatting required for the class
+        If data is not None, dataset will be created from it's content.
+
+        Parameters:
+        data : Union[np.ndarray, pd.DataFrame, pd.Series], optional
+            dataset that will replace the default sample data
+
 
         Examples
         --------
@@ -178,16 +184,16 @@ def generate_model_data(
 
     @abstractmethod
     def build_model(
-        model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
         model_config: Dict[str, Union[int, float, Dict]] = None,
     ) -> None:
         """
-        Creates an instance of pm.Model based on provided model_data and model_config, and
+        Creates an instance of pm.Model based on provided data and model_config, and
         attaches it to self.
 
         Parameters
         ----------
-        model_data : dict
+        data : dict
             Preformated data that is going to be used in the model. For efficiency reasons it should contain only the necesary data columns,
             not entire available dataset since it's going to be encoded into data used to recreate the model.
             If not provided uses data from self.data
@@ -207,7 +213,34 @@ def build_model(
         raise NotImplementedError
 
     def sample_model(self, **kwargs):
+        """
+        Sample from the PyMC model.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments to pass to the PyMC sampler.
+
+        Returns
+        -------
+        xarray.Dataset
+            The PyMC3 samples dataset.
+
+        Raises
+        ------
+        RuntimeError
+            If the PyMC model hasn't been built yet.
 
+        Examples
+        --------
+        >>> self.build_model()
+        >>> idata = self.sample_model(draws=100, tune=10)
+        >>> assert isinstance(idata, xr.Dataset)
+        >>> assert "posterior" in idata
+        >>> assert "prior" in idata
+        >>> assert "observed_data" in idata
+        >>> assert "log_likelihood" in idata
+        """
         if self.model is None:
             raise RuntimeError(
                 "The model hasn't been built yet, call .build_model() first or call .fit() instead."
@@ -223,6 +256,34 @@ def sample_model(self, **kwargs):
         return idata
 
     def set_idata_attrs(self, idata=None):
+        """
+        Set attributes on an InferenceData object.
+
+        Parameters
+        ----------
+        idata : arviz.InferenceData, optional
+            The InferenceData object to set attributes on.
+
+        Raises
+        ------
+        RuntimeError
+            If no InferenceData object is provided.
+
+        Returns
+        -------
+        None
+
+        Examples
+        --------
+        >>> model = MyModel(ModelBuilder)
+        >>> idata = az.InferenceData(your_dataset)
+        >>> model.set_idata_attrs(idata=idata)
+        >>> assert "id" in idata.attrs #this and the following lines are part of doctest, not user manual
+        >>> assert "model_type" in idata.attrs
+        >>> assert "version" in idata.attrs
+        >>> assert "sampler_config" in idata.attrs
+        >>> assert "model_config" in idata.attrs
+        """
         if idata is None:
             idata = self.idata
         if idata is None:
@@ -235,22 +296,33 @@ def set_idata_attrs(self, idata=None):
 
     def save(self, fname: str) -> None:
         """
-        Saves inference data of the model.
+        Save the model's inference data to a file.
 
         Parameters
         ----------
-        fname : string
-            This denotes the name with path from where idata should be saved.
+        fname : str
+            The name and path of the file to save the inference data with model parameters.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        RuntimeError
+            If the model hasn't been fit yet (no inference data available).
 
         Examples
         --------
-        >>> class LinearModel(ModelBuilder):
-        >>>     ...
-        >>> data, model_config, sampler_config = LinearModel.create_sample_input()
-        >>> model = LinearModel(model_config, sampler_config)
-        >>> idata = model.fit(data)
-        >>> name = './mymodel.nc'
-        >>> model.save(name)
+        This method is meant to be overridden and implemented by subclasses.
+        It should not be called directly on the base abstract class or its instances.
+
+        >>> class MyModel(ModelBuilder):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>> model = MyModel()
+        >>> model.fit(data)
+        >>> model.save('model_results.nc')  # This will call the overridden method in MyModel
         """
         if self.idata is not None and "posterior" in self.idata:
             file = Path(str(fname))
@@ -259,33 +331,32 @@ def save(self, fname: str) -> None:
             raise RuntimeError("The model hasn't been fit yet, call .fit() first")
 
     @classmethod
-    def load(cls, fname: str):
+    def load(cls, fname: str) -> "ModelBuilder":
         """
-        Creates a ModelBuilder instance from a file,
-        Loads inference data for the model.
+        Create a ModelBuilder instance from a file and load inference data for the model.
 
         Parameters
         ----------
-        fname : string
-            This denotes the name with path from where idata should be loaded from.
+        fname : str
+            The name and path from which the inference data should be loaded.
 
         Returns
         -------
-        Returns an instance of ModelBuilder.
+        ModelBuilder
+            An instance of the ModelBuilder class.
 
         Raises
         ------
         ValueError
-            If the inference data that is loaded doesn't match with the model.
+            If the loaded inference data does not match the model.
 
         Examples
         --------
-        >>> class LinearModel(ModelBuilder):
+        >>> class MyModel(ModelBuilder):
         >>>     ...
         >>> name = './mymodel.nc'
-        >>> imported_model = LinearModel.load(name)
+        >>> imported_model = MyModel.load(name)
         """
-
         filepath = Path(str(fname))
         idata = az.from_netcdf(filepath)
         model_builder = cls(
@@ -297,7 +368,7 @@ def load(cls, fname: str):
         model_builder.build_model(model_builder.data, model_builder.model_config)
         if model_builder.id != idata.attrs["id"]:
             raise ValueError(
-                f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'"
+                f"The file '{fname}' does not contain inference data of the same model or configuration as '{cls._model_type}'"
             )
 
         return model_builder
@@ -331,8 +402,7 @@ def fit(
 
         Examples
         --------
-        >>> data, model_config, sampler_config = LinearModel.create_sample_input()
-        >>> model = LinearModel(model_config, sampler_config)
+        >>> model = MyModel()
         >>> idata = model.fit(data)
         Auto-assigning NUTS sampler...
         Initializing NUTS using jitter+adapt_diag...
@@ -340,19 +410,19 @@ def fit(
 
         # If a new data was provided, assign it to the model
         if data is not None:
-            self.data = data
-        self.model_data = self.generate_model_data(data=self.data)
+            self.data = self.generate_model_data(data=self.data)
+
         if self.model_config is None:
             self.model_config = self.default_model_config
         if self.sampler_config is None:
             self.sampler_config = self.default_sampler_config
         if self.model is None:
-            self.build_model(self.model_data, self.model_config)
+            self.build_model(self.data, self.model_config)
 
         self.sampler_config["progressbar"] = progressbar
         self.sampler_config["random_seed"] = random_seed
-
-        self.idata = self.sample_model(**self.sampler_config)
+        temp_sampler_config = {**self.sampler_config, **kwargs}
+        self.idata = self.sample_model(**temp_sampler_config)
         self.idata.add_groups(fit_data=self.data.to_xarray())
         return self.idata
 
@@ -377,8 +447,7 @@ def predict(
 
         Examples
         --------
-        >>> data, model_config, sampler_config = LinearModel.create_sample_input()
-        >>> model = LinearModel(model_config, sampler_config)
+        >>> model = MyModel()
         >>> idata = model.fit(data)
         >>> x_pred = []
         >>> prediction_data = pd.DataFrame({'input':x_pred})
@@ -398,7 +467,7 @@ def predict_posterior(
         Generate posterior predictive samples on unseen data.
 
         Parameters
-        ---------
+        ----------
         data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
             It is the data we need to make prediction on using the model.
         extend_idata : Boolean determining whether the predictions should be added to inference data object.
@@ -412,8 +481,7 @@ def predict_posterior(
 
         Examples
         --------
-        >>> data, model_config, sampler_config = LinearModel.create_sample_input()
-        >>> model = LinearModel(model_config, sampler_config)
+        >>> model = MyModel()
         >>> idata = model.fit(data)
         >>> x_pred = []
         >>> prediction_data = pd.DataFrame({'input': x_pred})
@@ -450,13 +518,22 @@ def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]:
     @property
     def id(self) -> str:
         """
-        It creates a hash value to match the model version using last 16 characters of hash encoding.
+        Generate a unique hash value for the model.
+
+        The hash value is created using the last 16 characters of the SHA256 hash encoding, based on the model configuration,
+        version, and model type.
 
         Returns
         -------
-        Returns string of length 16 characters contains unique hash of the model
-        """
+        str
+            A string of length 16 characters containing a unique hash of the model.
 
+        Examples
+        --------
+        >>> model = MyModel()
+        >>> model.id
+        '0123456789abcdef'
+        """
         hasher = hashlib.sha256()
         hasher.update(str(self.model_config.values()).encode())
         hasher.update(self.version.encode())
diff --git a/pytest.ini b/pytest.ini
index ba9c51ad..1347a375 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,6 +1,6 @@
 [pytest]
 filterwarnings =
     error
-    ignore:.*?(\b(pkg_resources\.declare_namespace)\b).*:DeprecationWarning
+    ignore:.*(\b(pkg_resources\.declare_namespace|np\.bool8)\b).*:DeprecationWarning
     ignore::UserWarning:arviz.data.inference_data
     ignore::DeprecationWarning:pkg_resources

From a17539cbd2555748ccdf75eab1512ffe9d1c1730 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Mon, 15 May 2023 13:31:23 +0100
Subject: [PATCH 04/11] moving new load to ModelBuilder

---
 .../bayesian_estimator_linearmodel.py         | 41 -------------------
 pymc_experimental/model_builder.py            | 29 ++++++-------
 pymc_experimental/tests/test_model_builder.py |  5 ++-
 3 files changed, 19 insertions(+), 56 deletions(-)

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
index 06d7f624..197fa86f 100644
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/bayesian_estimator_linearmodel.py
@@ -1,6 +1,4 @@
-import json
 from abc import abstractmethod
-from pathlib import Path
 from typing import Any, Dict, Union
 
 import arviz as az
@@ -60,7 +58,6 @@ class BayesianEstimator(ModelBuilder):
 
     def __init__(
         self,
-        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
         model_config: Dict = None,
         sampler_config: Dict = None,
     ):
@@ -331,44 +328,6 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined):
 
         return posterior_predictive_samples
 
-    @classmethod
-    def load(cls, fname: str):
-        """
-        Creates a BayesianEstimator instance from a file,
-        Loads inference data for the model.
-
-        Parameters
-        ----------
-        fname : string
-            This denotes the name with path from where idata should be loaded from.
-
-        Returns
-        -------
-        Returns an instance of BayesianEstimator.
-
-        Raises
-        ------
-        ValueError
-            If the inference data that is loaded doesn't match with the model.
-        """
-
-        filepath = Path(str(fname))
-        idata = az.from_netcdf(filepath)
-        model = cls(
-            model_config=json.loads(idata.attrs["model_config"]),
-            sampler_config=json.loads(idata.attrs["sampler_config"]),
-        )
-        model.idata = idata
-        model.build_model()
-        # All previously used data is in idata.
-
-        if model.id != idata.attrs["id"]:
-            raise ValueError(
-                f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'"
-            )
-
-        return model
-
     @property
     def _serializable_model_config(self):
         return self.model_config
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 28dc2dbe..bf4ff997 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -331,25 +331,24 @@ def save(self, fname: str) -> None:
             raise RuntimeError("The model hasn't been fit yet, call .fit() first")
 
     @classmethod
-    def load(cls, fname: str) -> "ModelBuilder":
+    def load(cls, fname: str):
         """
-        Create a ModelBuilder instance from a file and load inference data for the model.
+        Creates a ModelBuilder instance from a file,
+        Loads inference data for the model.
 
         Parameters
         ----------
-        fname : str
-            The name and path from which the inference data should be loaded.
+        fname : string
+            This denotes the name with path from where idata should be loaded from.
 
         Returns
         -------
-        ModelBuilder
-            An instance of the ModelBuilder class.
+        Returns an instance of ModelBuilder.
 
         Raises
         ------
         ValueError
-            If the loaded inference data does not match the model.
-
+            If the inference data that is loaded doesn't match with the model.
         Examples
         --------
         >>> class MyModel(ModelBuilder):
@@ -359,19 +358,21 @@ def load(cls, fname: str) -> "ModelBuilder":
         """
         filepath = Path(str(fname))
         idata = az.from_netcdf(filepath)
-        model_builder = cls(
+        model = cls(
             data=idata.fit_data.to_dataframe(),
             model_config=json.loads(idata.attrs["model_config"]),
             sampler_config=json.loads(idata.attrs["sampler_config"]),
         )
-        model_builder.idata = idata
-        model_builder.build_model(model_builder.data, model_builder.model_config)
-        if model_builder.id != idata.attrs["id"]:
+        model.idata = idata
+        model.build_model()
+        # All previously used data is in idata.
+
+        if model.id != idata.attrs["id"]:
             raise ValueError(
-                f"The file '{fname}' does not contain inference data of the same model or configuration as '{cls._model_type}'"
+                f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'"
             )
 
-        return model_builder
+        return model
 
     def fit(
         self,
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
index 7dfdbba4..c9ba5bb6 100644
--- a/pymc_experimental/tests/test_model_builder.py
+++ b/pymc_experimental/tests/test_model_builder.py
@@ -32,7 +32,10 @@ class test_ModelBuilder(ModelBuilder):
     def build_model(self, model_data=None, model_config=None):
 
         with pm.Model() as self.model:
-
+            if model_data is None:
+                model_data = test_ModelBuilder.generate_model_data()
+            if model_config is None:
+                model_config = self.default_model_config
             x = pm.MutableData("x", model_data["input"].values)
             y_data = pm.MutableData("y_data", model_data["output"].values)
 

From 02154043e2cec6fd22287d6bf1a69fd7ef8f5e07 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Mon, 15 May 2023 15:45:37 +0100
Subject: [PATCH 05/11] merging fit from BayesianEstimator to ModelBuilder,
 adapting tests

---
 .../bayesian_estimator_linearmodel.py         | 48 +------------------
 pymc_experimental/model_builder.py            | 43 ++++++++---------
 pymc_experimental/tests/test_model_builder.py | 25 +++++-----
 3 files changed, 35 insertions(+), 81 deletions(-)

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
index 197fa86f..b98be828 100644
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/bayesian_estimator_linearmodel.py
@@ -1,12 +1,11 @@
 from abc import abstractmethod
-from typing import Any, Dict, Union
+from typing import Dict, Union
 
 import arviz as az
 import numpy as np
 import pandas as pd
 import pymc as pm
 import xarray as xr
-from pymc.util import RandomState
 
 from pymc_experimental.model_builder import ModelBuilder
 
@@ -121,51 +120,6 @@ def _data_setter(self, X, y=None):
 
         raise NotImplementedError
 
-    def fit(
-        self,
-        X: Union[np.ndarray, pd.DataFrame, pd.Series],
-        y: Union[np.ndarray, pd.DataFrame, pd.Series],
-        progressbar: bool = True,
-        random_seed: RandomState = None,
-        **kwargs: Any,
-    ) -> "BayesianEstimator":
-        """
-        Fit a model using the data passed as a parameter.
-        Sets attrs to inference data of the model.
-
-
-        Parameters
-        ----------
-        X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features)
-            The training input samples.
-        y : array-like if sklearn is available, otherwise array, shape (n_obs,)
-            The target values (real numbers).
-        progressbar : bool
-            Specifies whether the fit progressbar should be displayed
-        random_seed : RandomState
-            Provides sampler with initial random seed for obtaining reproducible samples
-        **kwargs : Any
-            Custom sampler settings can be provided in form of keyword arguments.
-
-        Returns
-        -------
-        self : object
-            Returns self.
-        """
-
-        X, y = self._validate_data(X, y)
-
-        self.build_model()
-        self._data_setter(X, y)
-
-        sampler_config = self.sampler_config.copy()
-        sampler_config["progressbar"] = progressbar
-        sampler_config["random_seed"] = random_seed
-        sampler_config.update(**kwargs)
-
-        self.idata = self.sample_model(**sampler_config)
-        return self
-
     def predict(
         self,
         X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index bf4ff997..5a61724b 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -376,7 +376,8 @@ def load(cls, fname: str):
 
     def fit(
         self,
-        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        X: Union[np.ndarray, pd.DataFrame, pd.Series],
+        y: Union[np.ndarray, pd.DataFrame, pd.Series],
         progressbar: bool = True,
         random_seed: RandomState = None,
         **kwargs: Any,
@@ -385,22 +386,24 @@ def fit(
         Fit a model using the data passed as a parameter.
         Sets attrs to inference data of the model.
 
-        Parameter
-        ---------
-        data : dict
-            Dictionary of string and either of numpy array, pandas dataframe or pandas Series. It is the data we need to train the model on.
+
+        Parameters
+        ----------
+        X : array-like if sklearn is available, otherwise array, shape (n_obs, n_features)
+            The training input samples.
+        y : array-like if sklearn is available, otherwise array, shape (n_obs,)
+            The target values (real numbers).
         progressbar : bool
             Specifies whether the fit progressbar should be displayed
         random_seed : RandomState
             Provides sampler with initial random seed for obtaining reproducible samples
         **kwargs : Any
-            Custom sampler settings can be provided in form of keyword arguments. The recommended way is to add custom settings to sampler_config provided by
-            create_sample_input, because arguments provided in the form of kwargs will not be saved into the model, therefore will not be available after loading the model
+            Custom sampler settings can be provided in form of keyword arguments.
 
         Returns
         -------
-        returns inference data of the fitted model.
-
+        self : az.InferenceData
+            returns inference data of the fitted model.
         Examples
         --------
         >>> model = MyModel()
@@ -409,21 +412,17 @@ def fit(
         Initializing NUTS using jitter+adapt_diag...
         """
 
-        # If a new data was provided, assign it to the model
-        if data is not None:
-            self.data = self.generate_model_data(data=self.data)
+        X, y = X, y
 
-        if self.model_config is None:
-            self.model_config = self.default_model_config
-        if self.sampler_config is None:
-            self.sampler_config = self.default_sampler_config
-        if self.model is None:
-            self.build_model(self.data, self.model_config)
+        self.build_model()
+        self._data_setter(X, y)
+
+        sampler_config = self.sampler_config.copy()
+        sampler_config["progressbar"] = progressbar
+        sampler_config["random_seed"] = random_seed
+        sampler_config.update(**kwargs)
 
-        self.sampler_config["progressbar"] = progressbar
-        self.sampler_config["random_seed"] = random_seed
-        temp_sampler_config = {**self.sampler_config, **kwargs}
-        self.idata = self.sample_model(**temp_sampler_config)
+        self.idata = self.sample_model(**sampler_config)
         self.idata.add_groups(fit_data=self.data.to_xarray())
         return self.idata
 
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
index c9ba5bb6..f0ea4b81 100644
--- a/pymc_experimental/tests/test_model_builder.py
+++ b/pymc_experimental/tests/test_model_builder.py
@@ -54,11 +54,11 @@ def build_model(self, model_data=None, model_config=None):
             # observed data
             y_model = pm.Normal("y_model", a + b * x, obs_error, shape=x.shape, observed=y_data)
 
-    def _data_setter(self, data: pd.DataFrame):
+    def _data_setter(self, x: pd.Series, y: pd.Series = None):
         with self.model:
-            pm.set_data({"x": data["input"].values})
-            if "output" in data.columns:
-                pm.set_data({"y_data": data["output"].values})
+            pm.set_data({"x": x.values})
+            if y is not None:
+                pm.set_data({"y_data": y.values})
 
     @property
     def _serializable_model_config(self):
@@ -93,7 +93,7 @@ def default_sampler_config(self) -> Dict:
     def initial_build_and_fit(check_idata=True) -> ModelBuilder:
         data = test_ModelBuilder.generate_model_data()
         model_builder = test_ModelBuilder()
-        model_builder.idata = model_builder.fit(data=data)
+        model_builder.idata = model_builder.fit(X=data["input"], y=data["output"])
         if check_idata:
             assert model_builder.idata is not None
             assert "posterior" in model_builder.idata.groups()
@@ -109,7 +109,8 @@ def test_save_without_fit_raises_runtime_error():
 def test_empty_sampler_config_fit():
     sampler_config = {}
     model_builder = test_ModelBuilder(sampler_config=sampler_config)
-    model_builder.idata = model_builder.fit()
+    data = test_ModelBuilder.generate_model_data()
+    model_builder.idata = model_builder.fit(X=data["input"], y=data["output"])
     assert model_builder.idata is not None
     assert "posterior" in model_builder.idata.groups()
 
@@ -118,9 +119,9 @@ def test_fit():
     model = test_ModelBuilder.initial_build_and_fit()
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
-    pred = model.predict(prediction_data)
+    pred = model.predict(prediction_data["input"])
     assert "y_model" in pred.keys()
-    post_pred = model.predict_posterior(prediction_data)
+    post_pred = model.predict_posterior(prediction_data["input"])
     assert "y_model" in post_pred.keys()
 
 
@@ -136,8 +137,8 @@ def test_save_load():
 
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
-    pred1 = test_builder.predict(prediction_data)
-    pred2 = test_builder2.predict(prediction_data)
+    pred1 = test_builder.predict(prediction_data["input"])
+    pred2 = test_builder2.predict(prediction_data["input"])
     assert pred1["y_model"].shape == pred2["y_model"].shape
     temp.close()
 
@@ -146,7 +147,7 @@ def test_predict():
     model = test_ModelBuilder.initial_build_and_fit()
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
-    pred = model.predict(prediction_data)
+    pred = model.predict(prediction_data["input"])
     assert "y_model" in pred
     assert len(prediction_data.input.values) == len(pred["y_model"])
     assert np.issubdtype(pred["y_model"].dtype, np.floating)
@@ -158,7 +159,7 @@ def test_predict_posterior(combined):
     n_pred = 100
     x_pred = np.random.uniform(low=0, high=1, size=n_pred)
     prediction_data = pd.DataFrame({"input": x_pred})
-    pred = model.predict_posterior(prediction_data, combined=combined)
+    pred = model.predict_posterior(prediction_data["input"], combined=combined)
     chains = model.idata.sample_stats.dims["chain"]
     draws = model.idata.sample_stats.dims["draw"]
     expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred)

From 4e7879b06a96320427283c52e6d118434fc01ae7 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Mon, 15 May 2023 17:10:35 +0100
Subject: [PATCH 06/11] moving predict and sample_posterior_predictive,
 adapting tests

---
 .../bayesian_estimator_linearmodel.py         | 42 +-----------
 pymc_experimental/model_builder.py            | 68 +++++++++----------
 pymc_experimental/tests/test_model_builder.py | 22 +++---
 3 files changed, 48 insertions(+), 84 deletions(-)

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
index b98be828..d15bd3e1 100644
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/bayesian_estimator_linearmodel.py
@@ -74,7 +74,7 @@ def __init__(
             model_config = self.default_model_config
         if sampler_config is None:
             sampler_config = self.default_sampler_config
-        super().__init__(data=data, model_config=model_config, sampler_config=sampler_config)
+        super().__init__(model_config=model_config, sampler_config=sampler_config)
 
     @property
     @abstractmethod
@@ -120,46 +120,6 @@ def _data_setter(self, X, y=None):
 
         raise NotImplementedError
 
-    def predict(
-        self,
-        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
-        extend_idata: bool = True,
-    ) -> np.ndarray:
-        """
-        Uses model to predict on unseen data and return point prediction of all the samples. The point prediction
-        for each input row is the expected output value, computed as the mean of MCMC samples.
-
-        Parameters
-        ---------
-        X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features)
-            The input data used for prediction.
-        extend_idata : Boolean determining whether the predictions should be added to inference data object.
-            Defaults to True.
-
-        Returns
-        -------
-        y_pred : ndarray, shape (n_pred,)
-            Predicted output corresponding to input X_pred.
-        """
-        if not hasattr(self, "output_var"):
-            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
-
-        X_pred = self._validate_data(X_pred)
-
-        posterior_predictive_samples = self.sample_posterior_predictive(
-            X_pred, extend_idata, combined=False
-        )
-
-        if self.output_var not in posterior_predictive_samples:
-            raise KeyError(
-                f"Output variable {self.output_var} not found in posterior predictive samples."
-            )
-
-        posterior_means = posterior_predictive_samples[self.output_var].mean(
-            dim=["chain", "draw"], keep_attrs=False
-        )
-        return posterior_means.data
-
     def predict_posterior(
         self,
         X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 5a61724b..17965d19 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -23,7 +23,6 @@
 import numpy as np
 import pandas as pd
 import pymc as pm
-import xarray as xr
 from pymc.util import RandomState
 
 
@@ -428,22 +427,24 @@ def fit(
 
     def predict(
         self,
-        data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
+        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
         extend_idata: bool = True,
-    ) -> xr.Dataset:
+    ) -> np.ndarray:
         """
-        Uses model to predict on unseen data and return point prediction of all the samples
+        Uses model to predict on unseen data and return point prediction of all the samples. The point prediction
+        for each input row is the expected output value, computed as the mean of MCMC samples.
 
         Parameters
         ---------
-        data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
-            It is the data we need to make prediction on using the model.
+        X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features)
+            The input data used for prediction.
         extend_idata : Boolean determining whether the predictions should be added to inference data object.
             Defaults to True.
 
         Returns
         -------
-        returns posterior mean of predictive samples
+        y_pred : ndarray, shape (n_pred,)
+            Predicted output corresponding to input X_pred.
 
         Examples
         --------
@@ -453,43 +454,42 @@ def predict(
         >>> prediction_data = pd.DataFrame({'input':x_pred})
         >>> pred_mean = model.predict(prediction_data)
         """
-        posterior_predictive_samples = self.predict_posterior(data_prediction, extend_idata)
-        posterior_means = posterior_predictive_samples.mean(dim=["chain", "draw"], keep_attrs=True)
-        return posterior_means
+        if not hasattr(self, "output_var"):
+            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
 
-    def predict_posterior(
-        self,
-        data_prediction: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
-        extend_idata: bool = True,
-        combined: bool = False,
-    ) -> xr.Dataset:
+        posterior_predictive_samples = self.sample_posterior_predictive(
+            X_pred, extend_idata, combined=False
+        )
+
+        if self.output_var not in posterior_predictive_samples:
+            raise KeyError(
+                f"Output variable {self.output_var} not found in posterior predictive samples."
+            )
+
+        posterior_means = posterior_predictive_samples[self.output_var].mean(
+            dim=["chain", "draw"], keep_attrs=True
+        )
+        return posterior_means.data
+
+    def sample_posterior_predictive(self, X_pred, extend_idata, combined):
         """
-        Generate posterior predictive samples on unseen data.
+        Sample from the model's posterior predictive distribution.
 
         Parameters
-        ----------
-        data_prediction : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
-            It is the data we need to make prediction on using the model.
+        ---------
+        X_pred : array, shape (n_pred, n_features)
+            The input data used for prediction using prior distribution..
         extend_idata : Boolean determining whether the predictions should be added to inference data object.
-            Defaults to True.
-        combined: Combine chain and draw dims into sample. Won’t work if a dim named sample already exists.
             Defaults to False.
+        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
+            Defaults to True.
 
         Returns
         -------
-        returns posterior predictive samples
-
-        Examples
-        --------
-        >>> model = MyModel()
-        >>> idata = model.fit(data)
-        >>> x_pred = []
-        >>> prediction_data = pd.DataFrame({'input': x_pred})
-        >>> pred_samples = model.predict_posterior(prediction_data)
+        posterior_predictive_samples : DataArray, shape (n_pred, samples)
+            Posterior predictive samples for each input X_pred
         """
-
-        if data_prediction is not None:  # set new input data
-            self._data_setter(data_prediction)
+        self._data_setter(X_pred)
 
         with self.model:  # sample with new input data
             post_pred = pm.sample_posterior_predictive(self.idata)
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
index f0ea4b81..eb507cbd 100644
--- a/pymc_experimental/tests/test_model_builder.py
+++ b/pymc_experimental/tests/test_model_builder.py
@@ -53,6 +53,7 @@ def build_model(self, model_data=None, model_config=None):
 
             # observed data
             y_model = pm.Normal("y_model", a + b * x, obs_error, shape=x.shape, observed=y_data)
+            self.output_var = "y_model"
 
     def _data_setter(self, x: pd.Series, y: pd.Series = None):
         with self.model:
@@ -120,9 +121,10 @@ def test_fit():
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
     pred = model.predict(prediction_data["input"])
-    assert "y_model" in pred.keys()
-    post_pred = model.predict_posterior(prediction_data["input"])
-    assert "y_model" in post_pred.keys()
+    post_pred = model.sample_posterior_predictive(
+        prediction_data["input"], extend_idata=True, combined=True
+    )
+    post_pred.y_model.shape[0] == prediction_data.input.shape
 
 
 @pytest.mark.skipif(
@@ -139,7 +141,7 @@ def test_save_load():
     prediction_data = pd.DataFrame({"input": x_pred})
     pred1 = test_builder.predict(prediction_data["input"])
     pred2 = test_builder2.predict(prediction_data["input"])
-    assert pred1["y_model"].shape == pred2["y_model"].shape
+    assert pred1.shape == pred2.shape
     temp.close()
 
 
@@ -148,18 +150,20 @@ def test_predict():
     x_pred = np.random.uniform(low=0, high=1, size=100)
     prediction_data = pd.DataFrame({"input": x_pred})
     pred = model.predict(prediction_data["input"])
-    assert "y_model" in pred
-    assert len(prediction_data.input.values) == len(pred["y_model"])
-    assert np.issubdtype(pred["y_model"].dtype, np.floating)
+    # Perform elementwise comparison using numpy
+    assert type(pred) == np.ndarray
+    assert len(pred) > 0
 
 
 @pytest.mark.parametrize("combined", [True, False])
-def test_predict_posterior(combined):
+def test_sample_posterior_predictive(combined):
     model = test_ModelBuilder.initial_build_and_fit()
     n_pred = 100
     x_pred = np.random.uniform(low=0, high=1, size=n_pred)
     prediction_data = pd.DataFrame({"input": x_pred})
-    pred = model.predict_posterior(prediction_data["input"], combined=combined)
+    pred = model.sample_posterior_predictive(
+        prediction_data["input"], combined=combined, extend_idata=True
+    )
     chains = model.idata.sample_stats.dims["chain"]
     draws = model.idata.sample_stats.dims["draw"]
     expected_shape = (n_pred, chains * draws) if combined else (chains, draws, n_pred)

From ece54066f672467092dd9c6a60cfa916498ddda8 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Tue, 16 May 2023 10:29:56 +0100
Subject: [PATCH 07/11] cannibalize BayesianEstimator by ModelBuilder

---
 .../bayesian_estimator_linearmodel.py         | 376 ------------------
 pymc_experimental/linearmodel.py              | 120 ++++++
 pymc_experimental/model_builder.py            | 146 ++++++-
 3 files changed, 257 insertions(+), 385 deletions(-)
 delete mode 100644 pymc_experimental/bayesian_estimator_linearmodel.py
 create mode 100644 pymc_experimental/linearmodel.py

diff --git a/pymc_experimental/bayesian_estimator_linearmodel.py b/pymc_experimental/bayesian_estimator_linearmodel.py
deleted file mode 100644
index d15bd3e1..00000000
--- a/pymc_experimental/bayesian_estimator_linearmodel.py
+++ /dev/null
@@ -1,376 +0,0 @@
-from abc import abstractmethod
-from typing import Dict, Union
-
-import arviz as az
-import numpy as np
-import pandas as pd
-import pymc as pm
-import xarray as xr
-
-from pymc_experimental.model_builder import ModelBuilder
-
-# If scikit-learn is available, use its data validator
-try:
-    from sklearn.utils.validation import check_array, check_X_y
-# If scikit-learn is not available, return the data unchanged
-except ImportError:
-
-    def check_X_y(X, y, **kwargs):
-        return X, y
-
-    def check_array(X, **kwargs):
-        return X
-
-
-class BayesianEstimator(ModelBuilder):
-    """
-    Base class similar to ModelBuilder but customized for integration with a scikit-learn workflow.
-    It is designed to encapsulate parameter inference ("fit") and posterior prediction ("predict")
-    for simple models with the following characteristics:
-     - One or more input features for each observation
-     - One observed output feature for each observation
-
-    Estimators derived from this base class can utilize scikit-learn transformers for input and
-    output accessed via `fit` and `predict`. (`TransformedTargetRegressor` would need to be extended
-    in order to transform the output of `predict_proba` or `predict_posterior`.)
-
-    Example scikit-learn usage:
-    >>> from sklearn.pipeline import Pipeline
-    >>> from sklearn.compose import TransformedTargetRegressor
-    >>> model = Pipeline([
-    >>>     ('input_scaling', StandardScaler()),
-    >>>     ('linear_model', TransformedTargetRegressor(LinearModel(model_config),
-    >>>                                                 transformer=StandardScaler()),)
-    >>> ])
-    >>> model.fit(X_obs, y_obs)
-    >>> y_pred = model.predict(X_pred)
-
-    The format for probabilistic output forecasts is currently an xarray `DataSet` of the posterior
-    prediction draws for each input prediction. This
-
-    The `sklearn` package is not a dependency for using this class, although if imports from `sklearn`
-    are successful, then scikit-learn's data validation functions are used to accept "array-like" input.
-    """
-
-    model_type = "BaseClass"
-    version = "None"
-
-    def __init__(
-        self,
-        model_config: Dict = None,
-        sampler_config: Dict = None,
-    ):
-        """
-        Initializes model configuration and sampler configuration for the model
-
-        Parameters
-        ----------
-        model_config : dict
-            Parameters that initialise model configuration
-        sampler_config : dict
-            Parameters that initialise sampler configuration
-        """
-        if model_config is None:
-            model_config = self.default_model_config
-        if sampler_config is None:
-            sampler_config = self.default_sampler_config
-        super().__init__(model_config=model_config, sampler_config=sampler_config)
-
-    @property
-    @abstractmethod
-    def default_model_config(self):
-        raise NotImplementedError
-
-    @property
-    @abstractmethod
-    def default_sampler_config(self):
-        raise NotImplementedError
-
-    def _validate_data(self, X, y=None):
-        if y is not None:
-            return check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=False)
-        else:
-            return check_array(X, accept_sparse=False)
-
-    @abstractmethod
-    def build_model(
-        model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
-        model_config: Dict[str, Union[int, float, Dict]] = None,
-    ) -> None:
-
-        raise NotImplementedError
-
-    @abstractmethod
-    def _data_setter(self, X, y=None):
-        """
-        Sets new data in the model.
-
-        Parameters
-        ----------
-        X : array, shape (n_obs, n_features)
-            The training input samples.
-        y : array, shape (n_obs,)
-            The target values (real numbers).
-
-        Returns:
-        ----------
-        None
-
-        """
-
-        raise NotImplementedError
-
-    def predict_posterior(
-        self,
-        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
-        extend_idata: bool = True,
-        combined: bool = True,
-    ) -> xr.DataArray:
-        """
-        Generate posterior predictive samples on unseen data.
-
-        Parameters
-        ---------
-        X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features)
-            The input data used for prediction.
-        extend_idata : Boolean determining whether the predictions should be added to inference data object.
-            Defaults to True.
-        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
-            Defaults to True.
-
-        Returns
-        -------
-        y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred)
-            Posterior predictive samples for each input X_pred
-        """
-        if not hasattr(self, "output_var"):
-            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
-
-        X_pred = self._validate_data(X_pred)
-        posterior_predictive_samples = self.sample_posterior_predictive(
-            X_pred, extend_idata, combined
-        )
-
-        if self.output_var not in posterior_predictive_samples:
-            raise KeyError(
-                f"Output variable {self.output_var} not found in posterior predictive samples."
-            )
-
-        return posterior_predictive_samples[self.output_var]
-
-    def predict_proba(
-        self,
-        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
-        extend_idata: bool = True,
-        combined: bool = False,
-    ) -> xr.DataArray:
-        """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators."""
-        return self.predict_posterior(X_pred, extend_idata, combined)
-
-    def sample_prior_predictive(
-        self, X_pred, samples: int = None, extend_idata: bool = False, combined: bool = True
-    ):
-        """
-        Sample from the model's prior predictive distribution.
-
-        Parameters
-        ---------
-        X_pred : array, shape (n_pred, n_features)
-            The input data used for prediction using prior distribution.
-        samples : int
-            Number of samples from the prior parameter distributions to generate.
-            If not set, uses sampler_config['draws'] if that is available, otherwise defaults to 500.
-        extend_idata : Boolean determining whether the predictions should be added to inference data object.
-            Defaults to False.
-        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
-            Defaults to True.
-
-        Returns
-        -------
-        prior_predictive_samples : DataArray, shape (n_pred, samples)
-            Prior predictive samples for each input X_pred
-        """
-        if samples is None:
-            samples = self.sampler_config.get("draws", 500)
-
-        if self.model is None:
-            self.build_model()
-
-        self._data_setter(X_pred)
-
-        with self.model:  # sample with new input data
-            prior_pred = pm.sample_prior_predictive(samples)
-            self.set_idata_attrs(prior_pred)
-            if extend_idata:
-                if self.idata is not None:
-                    self.idata.extend(prior_pred)
-                else:
-                    self.idata = prior_pred
-
-        prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
-
-        return prior_predictive_samples
-
-    def sample_posterior_predictive(self, X_pred, extend_idata, combined):
-        """
-        Sample from the model's posterior predictive distribution.
-
-        Parameters
-        ---------
-        X_pred : array, shape (n_pred, n_features)
-            The input data used for prediction using prior distribution..
-        extend_idata : Boolean determining whether the predictions should be added to inference data object.
-            Defaults to False.
-        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
-            Defaults to True.
-
-        Returns
-        -------
-        posterior_predictive_samples : DataArray, shape (n_pred, samples)
-            Posterior predictive samples for each input X_pred
-        """
-        self._data_setter(X_pred)
-
-        with self.model:  # sample with new input data
-            post_pred = pm.sample_posterior_predictive(self.idata)
-            if extend_idata:
-                self.idata.extend(post_pred)
-
-        posterior_predictive_samples = az.extract(
-            post_pred, "posterior_predictive", combined=combined
-        )
-
-        return posterior_predictive_samples
-
-    @property
-    def _serializable_model_config(self):
-        return self.model_config
-
-    def get_params(self, deep=True):
-        """
-        Get all the model parameters needed to instantiate a copy of the model, not including training data.
-        """
-        return {"model_config": self.model_config, "sampler_config": self.sampler_config}
-
-    def set_params(self, **params):
-        """
-        Set all the model parameters needed to instantiate the model, not including training data.
-        """
-        self.model_config = params["model_config"]
-        self.sampler_config = params["sampler_config"]
-
-
-class LinearModel(BayesianEstimator):
-    """
-    This class is an implementation of a single-input linear regression model in PYMC using the
-    BayesianEstimator base class for interoperability with scikit-learn.
-    """
-
-    _model_type = "LinearModel"
-    version = "0.1"
-
-    @property
-    def default_model_config(self):
-        return {
-            "intercept": {"loc": 0, "scale": 10},
-            "slope": {"loc": 0, "scale": 10},
-            "obs_error": 2,
-        }
-
-    @property
-    def default_sampler_config(self):
-        return {
-            "draws": 1_000,
-            "tune": 1_000,
-            "chains": 3,
-            "target_accept": 0.95,
-        }
-
-    def build_model(self):
-        """
-        Build the PyMC model.
-
-        Returns
-        -------
-        None
-
-        Examples
-        --------
-        >>> self.build_model()
-        >>> assert self.model is not None
-        >>> assert isinstance(self.model, pm.Model)
-        >>> assert "intercept" in self.model.named_vars
-        >>> assert "slope" in self.model.named_vars
-        >>> assert "σ_model_fmc" in self.model.named_vars
-        >>> assert "y_model" in self.model.named_vars
-        >>> assert "y_hat" in self.model.named_vars
-        >>> assert self.output_var == "y_hat"
-        """
-        cfg = self.model_config
-
-        # The model is built with placeholder data.
-        # Actual data will be set by _data_setter when fitting or evaluating the model.
-        # Data array size can change but number of dimensions must stay the same.
-        with pm.Model() as self.model:
-            x = pm.MutableData("x", np.zeros((1,)), dims="observation")
-            y_data = pm.MutableData("y_data", np.zeros((1,)), dims="observation")
-
-            # priors
-            intercept = pm.Normal(
-                "intercept", cfg["intercept"]["loc"], sigma=cfg["intercept"]["scale"]
-            )
-            slope = pm.Normal("slope", cfg["slope"]["loc"], sigma=cfg["slope"]["scale"])
-            obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"])
-
-            # Model
-            y_model = pm.Deterministic("y_model", intercept + slope * x, dims="observation")
-
-            # observed data
-            y_hat = pm.Normal(
-                "y_hat",
-                y_model,
-                sigma=obs_error,
-                shape=x.shape,
-                observed=y_data,
-                dims="observation",
-            )
-            self.output_var = "y_hat"
-
-    def _data_setter(self, X, y=None):
-        with self.model:
-            pm.set_data({"x": X[:, 0]})
-            if y is not None:
-                pm.set_data({"y_data": y.squeeze()})
-
-    @classmethod
-    def generate_model_data(cls, nsamples=100, data=None):
-        """
-        Generate model data for linear regression.
-
-        Parameters
-        ----------
-        nsamples : int, optional
-            The number of samples to generate. Default is 100.
-        data : np.ndarray, optional
-            An optional data array to add noise to.
-
-        Returns
-        -------
-        tuple
-            A tuple of two np.ndarrays representing the feature matrix and target vector, respectively.
-
-        Examples
-        --------
-        >>> import numpy as np
-        >>> x, y = cls.generate_model_data()
-        >>> assert isinstance(x, np.ndarray)
-        >>> assert isinstance(y, np.ndarray)
-        >>> assert x.shape == (100, 1)
-        >>> assert y.shape == (100,)
-        """
-        x = np.linspace(start=0, stop=1, num=nsamples)
-        y = 5 * x + 3
-        y = y + np.random.normal(0, 1, len(x))
-
-        x = np.expand_dims(x, -1)  # scikit assumes a dimension for features.
-        return x, y
diff --git a/pymc_experimental/linearmodel.py b/pymc_experimental/linearmodel.py
new file mode 100644
index 00000000..8431c574
--- /dev/null
+++ b/pymc_experimental/linearmodel.py
@@ -0,0 +1,120 @@
+import numpy as np
+import pymc as pm
+
+from pymc_experimental.model_builder import ModelBuilder
+
+
+class LinearModel(ModelBuilder):
+    """
+    This class is an implementation of a single-input linear regression model in PYMC using the
+    ModelBuilder base class for interoperability with scikit-learn.
+    """
+
+    _model_type = "LinearModel"
+    version = "0.1"
+
+    @property
+    def default_model_config(self):
+        return {
+            "intercept": {"loc": 0, "scale": 10},
+            "slope": {"loc": 0, "scale": 10},
+            "obs_error": 2,
+        }
+
+    @property
+    def default_sampler_config(self):
+        return {
+            "draws": 1_000,
+            "tune": 1_000,
+            "chains": 3,
+            "target_accept": 0.95,
+        }
+
+    def build_model(self):
+        """
+        Build the PyMC model.
+
+        Returns
+        -------
+        None
+
+        Examples
+        --------
+        >>> self.build_model()
+        >>> assert self.model is not None
+        >>> assert isinstance(self.model, pm.Model)
+        >>> assert "intercept" in self.model.named_vars
+        >>> assert "slope" in self.model.named_vars
+        >>> assert "σ_model_fmc" in self.model.named_vars
+        >>> assert "y_model" in self.model.named_vars
+        >>> assert "y_hat" in self.model.named_vars
+        >>> assert self.output_var == "y_hat"
+        """
+        cfg = self.model_config
+
+        # The model is built with placeholder data.
+        # Actual data will be set by _data_setter when fitting or evaluating the model.
+        # Data array size can change but number of dimensions must stay the same.
+        with pm.Model() as self.model:
+            x = pm.MutableData("x", np.zeros((1,)), dims="observation")
+            y_data = pm.MutableData("y_data", np.zeros((1,)), dims="observation")
+
+            # priors
+            intercept = pm.Normal(
+                "intercept", cfg["intercept"]["loc"], sigma=cfg["intercept"]["scale"]
+            )
+            slope = pm.Normal("slope", cfg["slope"]["loc"], sigma=cfg["slope"]["scale"])
+            obs_error = pm.HalfNormal("σ_model_fmc", cfg["obs_error"])
+
+            # Model
+            y_model = pm.Deterministic("y_model", intercept + slope * x, dims="observation")
+
+            # observed data
+            y_hat = pm.Normal(
+                "y_hat",
+                y_model,
+                sigma=obs_error,
+                shape=x.shape,
+                observed=y_data,
+                dims="observation",
+            )
+            self.output_var = "y_hat"
+
+    def _data_setter(self, X, y=None):
+        with self.model:
+            pm.set_data({"x": X[:, 0]})
+            if y is not None:
+                pm.set_data({"y_data": y.squeeze()})
+
+    @classmethod
+    def generate_model_data(cls, nsamples=100, data=None):
+        """
+        Generate model data for linear regression.
+
+        Parameters
+        ----------
+        nsamples : int, optional
+            The number of samples to generate. Default is 100.
+        data : np.ndarray, optional
+            An optional data array to add noise to.
+
+        Returns
+        -------
+        tuple
+            A tuple of two np.ndarrays representing the feature matrix and target vector, respectively.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> x, y = cls.generate_model_data()
+        >>> assert isinstance(x, np.ndarray)
+        >>> assert isinstance(y, np.ndarray)
+        >>> assert x.shape == (100, 1)
+        >>> assert y.shape == (100,)
+        """
+        x = np.linspace(start=0, stop=1, num=nsamples)
+        y = 5 * x + 3
+        y = y + np.random.normal(0, 1, len(x))
+
+        x = np.expand_dims(x, -1)  # scikit assumes a dimension for features.
+        return x, y
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 17965d19..a78bb3a1 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -17,14 +17,27 @@
 import json
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Dict, Union
+from typing import Any, Dict, List, Union
 
 import arviz as az
 import numpy as np
 import pandas as pd
 import pymc as pm
+import xarray as xr
 from pymc.util import RandomState
 
+# If scikit-learn is available, use its data validator
+try:
+    from sklearn.utils.validation import check_array, check_X_y
+# If scikit-learn is not available, return the data unchanged
+except ImportError:
+
+    def check_X_y(X, y, **kwargs):
+        return X, y
+
+    def check_array(X, **kwargs):
+        return X
+
 
 class ModelBuilder:
     """
@@ -71,29 +84,40 @@ def __init__(
         self.idata = None  # idata is generated during fitting
         self.is_fitted_ = False
 
+    def _validate_data(self, X, y=None):
+        if y is not None:
+            return check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=False)
+        else:
+            return check_array(X, accept_sparse=False)
+
     @abstractmethod
     def _data_setter(
-        self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]], x_only: bool = True
-    ):
+        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame, List] = None
+    ) -> None:
         """
         Sets new data in the model.
 
         Parameters
         ----------
-        data : Dictionary of string and either of numpy array, pandas dataframe or pandas Series
-            It is the data we need to set as idata for the model
-        x_only : bool
-            if data only contains values of x and y is not present in the data
+        X : array, shape (n_obs, n_features)
+            The training input samples.
+        y : array, shape (n_obs,)
+            The target values (real numbers).
+
+        Returns:
+        ----------
+        None
 
         Examples
         --------
         >>> def _data_setter(self, data : pd.DataFrame):
         >>>     with self.model:
-        >>>         pm.set_data({'x': data['input'].values})
+        >>>         pm.set_data({'x': X['x'].values})
         >>>         try: # if y values in new data
-        >>>             pm.set_data({'y_data': data['output'].values})
+        >>>             pm.set_data({'y_data': y.values})
         >>>         except: # dummies otherwise
         >>>             pm.set_data({'y_data': np.zeros(len(data))})
+
         """
 
         raise NotImplementedError
@@ -471,6 +495,50 @@ def predict(
         )
         return posterior_means.data
 
+    def sample_prior_predictive(
+        self, X_pred, samples: int = None, extend_idata: bool = False, combined: bool = True
+    ):
+        """
+        Sample from the model's prior predictive distribution.
+
+        Parameters
+        ---------
+        X_pred : array, shape (n_pred, n_features)
+            The input data used for prediction using prior distribution.
+        samples : int
+            Number of samples from the prior parameter distributions to generate.
+            If not set, uses sampler_config['draws'] if that is available, otherwise defaults to 500.
+        extend_idata : Boolean determining whether the predictions should be added to inference data object.
+            Defaults to False.
+        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
+            Defaults to True.
+
+        Returns
+        -------
+        prior_predictive_samples : DataArray, shape (n_pred, samples)
+            Prior predictive samples for each input X_pred
+        """
+        if samples is None:
+            samples = self.sampler_config.get("draws", 500)
+
+        if self.model is None:
+            self.build_model()
+
+        self._data_setter(X_pred)
+
+        with self.model:  # sample with new input data
+            prior_pred = pm.sample_prior_predictive(samples)
+            self.set_idata_attrs(prior_pred)
+            if extend_idata:
+                if self.idata is not None:
+                    self.idata.extend(prior_pred)
+                else:
+                    self.idata = prior_pred
+
+        prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
+
+        return prior_predictive_samples
+
     def sample_posterior_predictive(self, X_pred, extend_idata, combined):
         """
         Sample from the model's posterior predictive distribution.
@@ -502,6 +570,19 @@ def sample_posterior_predictive(self, X_pred, extend_idata, combined):
 
         return posterior_predictive_samples
 
+    def get_params(self, deep=True):
+        """
+        Get all the model parameters needed to instantiate a copy of the model, not including training data.
+        """
+        return {"model_config": self.model_config, "sampler_config": self.sampler_config}
+
+    def set_params(self, **params):
+        """
+        Set all the model parameters needed to instantiate the model, not including training data.
+        """
+        self.model_config = params["model_config"]
+        self.sampler_config = params["sampler_config"]
+
     @property
     @abstractmethod
     def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]:
@@ -515,6 +596,53 @@ def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]:
         model_config: dict
         """
 
+    def predict_proba(
+        self,
+        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
+        extend_idata: bool = True,
+        combined: bool = False,
+    ) -> xr.DataArray:
+        """Alias for `predict_posterior`, for consistency with scikit-learn probabilistic estimators."""
+        return self.predict_posterior(X_pred, extend_idata, combined)
+
+    def predict_posterior(
+        self,
+        X_pred: Union[np.ndarray, pd.DataFrame, pd.Series],
+        extend_idata: bool = True,
+        combined: bool = True,
+    ) -> xr.DataArray:
+        """
+        Generate posterior predictive samples on unseen data.
+
+        Parameters
+        ---------
+        X_pred : array-like if sklearn is available, otherwise array, shape (n_pred, n_features)
+            The input data used for prediction.
+        extend_idata : Boolean determining whether the predictions should be added to inference data object.
+            Defaults to True.
+        combined: Combine chain and draw dims into sample. Won't work if a dim named sample already exists.
+            Defaults to True.
+
+        Returns
+        -------
+        y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred)
+            Posterior predictive samples for each input X_pred
+        """
+        if not hasattr(self, "output_var"):
+            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
+
+        X_pred = self._validate_data(X_pred)
+        posterior_predictive_samples = self.sample_posterior_predictive(
+            X_pred, extend_idata, combined
+        )
+
+        if self.output_var not in posterior_predictive_samples:
+            raise KeyError(
+                f"Output variable {self.output_var} not found in posterior predictive samples."
+            )
+
+        return posterior_predictive_samples[self.output_var]
+
     @property
     def id(self) -> str:
         """

From 9e557abd224d4bd80fd5ed7ef073eba2c789de51 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Tue, 16 May 2023 11:10:34 +0100
Subject: [PATCH 08/11] adaptation of ModelBuilder to make LinearModel tests
 pass

---
 pymc_experimental/model_builder.py                  | 13 ++++++++++++-
 ...estimator_linearmodel.py => test_linearmodel.py} |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)
 rename pymc_experimental/tests/{test_bayesian_estimator_linearmodel.py => test_linearmodel.py} (98%)

diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index a78bb3a1..1ea8063c 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -402,6 +402,7 @@ def fit(
         X: Union[np.ndarray, pd.DataFrame, pd.Series],
         y: Union[np.ndarray, pd.DataFrame, pd.Series],
         progressbar: bool = True,
+        predictor_names: List[str] = None,
         random_seed: RandomState = None,
         **kwargs: Any,
     ) -> az.InferenceData:
@@ -418,6 +419,8 @@ def fit(
             The target values (real numbers).
         progressbar : bool
             Specifies whether the fit progressbar should be displayed
+        predictor_names: List[str] = None,
+            Allows for custom naming of predictors given in a form of 2dArray, if not provided the predictors will be named like predictor1, predictor2...
         random_seed : RandomState
             Provides sampler with initial random seed for obtaining reproducible samples
         **kwargs : Any
@@ -446,7 +449,15 @@ def fit(
         sampler_config.update(**kwargs)
 
         self.idata = self.sample_model(**sampler_config)
-        self.idata.add_groups(fit_data=self.data.to_xarray())
+        if type(X) is np.ndarray:
+            if predictor_names is not None:
+                X = pd.DataFrame(X, columns=predictor_names)
+            else:
+                X = pd.DataFrame(X, columns=[f"predictor{x}" for x in range(1, X.shape[1] + 1)])
+        if type(y) is np.ndarray:
+            y = pd.Series(y, name="target")
+        combined_data = pd.concat([X, y], axis=1)
+        self.idata.add_groups(fit_data=combined_data.to_xarray())
         return self.idata
 
     def predict(
diff --git a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py b/pymc_experimental/tests/test_linearmodel.py
similarity index 98%
rename from pymc_experimental/tests/test_bayesian_estimator_linearmodel.py
rename to pymc_experimental/tests/test_linearmodel.py
index 38000d46..21be54fc 100644
--- a/pymc_experimental/tests/test_bayesian_estimator_linearmodel.py
+++ b/pymc_experimental/tests/test_linearmodel.py
@@ -20,7 +20,7 @@
 import pytest
 import xarray as xr
 
-from pymc_experimental.bayesian_estimator_linearmodel import LinearModel
+from pymc_experimental.linearmodel import LinearModel
 
 try:
     from sklearn.compose import TransformedTargetRegressor

From ad21234c8d12e2abf1db51e134f901c3801f412b Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Fri, 19 May 2023 13:32:32 +0100
Subject: [PATCH 09/11] final adjustments for pymc-marketing compatibility with
 sklearn

---
 .pylintrc                                     |  1 -
 pymc_experimental/model_builder.py            | 79 ++++++++++---------
 pymc_experimental/tests/test_model_builder.py | 10 +--
 3 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 952050ed..a4593ef8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -46,7 +46,6 @@ enable=import-self,
        used-before-assignment,
        cell-var-from-loop,
        global-variable-undefined,
-       dangerous-default-value,
        # redefined-builtin,
        redefine-in-handler,
        unused-import,
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 1ea8063c..e878ad32 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -17,7 +17,7 @@
 import json
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 import arviz as az
 import numpy as np
@@ -51,8 +51,8 @@ class ModelBuilder:
     def __init__(
         self,
         data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
-        model_config: Dict = None,
-        sampler_config: Dict = None,
+        model_config: Dict = {},
+        sampler_config: Dict = {},
     ):
         """
         Initializes model configuration and sampler configuration for the model
@@ -72,17 +72,17 @@ def __init__(
         >>> model = MyModel(model_config, sampler_config)
         """
 
-        if sampler_config is None:
+        if not sampler_config:
             sampler_config = self.default_sampler_config
         self.sampler_config = sampler_config
-        if model_config is None:
+        if not model_config:
             model_config = self.default_model_config
         self.model_config = model_config  # parameters for priors etc.
-        self.data = self.generate_model_data(data=data)
         self.model = None  # Set by build_model
-        self.output_var = None  # Set by build_model
-        self.idata = None  # idata is generated during fitting
+        self.output_var = ""  # Set by build_model
+        self.idata: Optional[az.InferenceData] = None  # idata is generated during fitting
         self.is_fitted_ = False
+        self.data = data
 
     def _validate_data(self, X, y=None):
         if y is not None:
@@ -92,7 +92,9 @@ def _validate_data(self, X, y=None):
 
     @abstractmethod
     def _data_setter(
-        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame, List] = None
+        self,
+        X: Union[np.ndarray, pd.DataFrame],
+        y: Union[np.ndarray, pd.DataFrame, List] = None,
     ) -> None:
         """
         Sets new data in the model.
@@ -160,7 +162,7 @@ def default_sampler_config(self) -> Dict:
         Examples
         --------
         >>>     @classmethod
-        >>>     def default_model_config(self):
+        >>>     def default_sampler_config(self):
         >>>         Return {
         >>>             'draws': 1_000,
         >>>             'tune': 1_000,
@@ -175,10 +177,9 @@ def default_sampler_config(self) -> Dict:
         """
         raise NotImplementedError
 
-    @classmethod
     @abstractmethod
     def generate_model_data(
-        cls, data: Union[np.ndarray, pd.DataFrame, pd.Series] = None
+        self, data: Union[np.ndarray, pd.DataFrame, pd.Series] = None
     ) -> pd.DataFrame:
         """
         Returns a default dataset for a class, can be used as a hint to data formatting required for the class
@@ -207,8 +208,10 @@ def generate_model_data(
 
     @abstractmethod
     def build_model(
-        data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None,
-        model_config: Dict[str, Union[int, float, Dict]] = None,
+        self,
+        data: Union[np.ndarray, pd.DataFrame, pd.Series] = {},
+        model_config: Dict = {},
+        **kwargs,
     ) -> None:
         """
         Creates an instance of pm.Model based on provided data and model_config, and
@@ -349,7 +352,7 @@ def save(self, fname: str) -> None:
         """
         if self.idata is not None and "posterior" in self.idata:
             file = Path(str(fname))
-            self.idata.to_netcdf(file)
+            self.idata.to_netcdf(str(file))
         else:
             raise RuntimeError("The model hasn't been fit yet, call .fit() first")
 
@@ -400,9 +403,9 @@ def load(cls, fname: str):
     def fit(
         self,
         X: Union[np.ndarray, pd.DataFrame, pd.Series],
-        y: Union[np.ndarray, pd.DataFrame, pd.Series],
+        y: Union[np.ndarray, pd.Series],
         progressbar: bool = True,
-        predictor_names: List[str] = None,
+        predictor_names: List[str] = [],
         random_seed: RandomState = None,
         **kwargs: Any,
     ) -> az.InferenceData:
@@ -420,7 +423,8 @@ def fit(
         progressbar : bool
             Specifies whether the fit progressbar should be displayed
         predictor_names: List[str] = None,
-            Allows for custom naming of predictors given in a form of 2dArray, if not provided the predictors will be named like predictor1, predictor2...
+            Allows for custom naming of predictors when X is given as a 2D np.ndarray.
+            If not provided, the predictors will be named predictor1, predictor2, ...
         random_seed : RandomState
             Provides sampler with initial random seed for obtaining reproducible samples
         **kwargs : Any
@@ -440,7 +444,7 @@ def fit(
 
         X, y = X, y
 
-        self.build_model()
+        self.build_model(data=self.data)
         self._data_setter(X, y)
 
         sampler_config = self.sampler_config.copy()
@@ -457,8 +461,8 @@ def fit(
         if type(y) is np.ndarray:
             y = pd.Series(y, name="target")
         combined_data = pd.concat([X, y], axis=1)
-        self.idata.add_groups(fit_data=combined_data.to_xarray())
-        return self.idata
+        self.idata.add_groups(fit_data=combined_data.to_xarray())  # type: ignore
+        return self.idata  # type: ignore
 
     def predict(
         self,
@@ -489,8 +493,6 @@ def predict(
         >>> prediction_data = pd.DataFrame({'input':x_pred})
         >>> pred_mean = model.predict(prediction_data)
         """
-        if not hasattr(self, "output_var"):
-            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
 
         posterior_predictive_samples = self.sample_posterior_predictive(
             X_pred, extend_idata, combined=False
@@ -507,7 +509,11 @@ def predict(
         return posterior_means.data
 
     def sample_prior_predictive(
-        self, X_pred, samples: int = None, extend_idata: bool = False, combined: bool = True
+        self,
+        X_pred,
+        samples: Optional[int] = None,
+        extend_idata: bool = False,
+        combined: bool = True,
     ):
         """
         Sample from the model's prior predictive distribution.
@@ -536,15 +542,15 @@ def sample_prior_predictive(
             self.build_model()
 
         self._data_setter(X_pred)
-
-        with self.model:  # sample with new input data
-            prior_pred = pm.sample_prior_predictive(samples)
-            self.set_idata_attrs(prior_pred)
-            if extend_idata:
-                if self.idata is not None:
-                    self.idata.extend(prior_pred)
-                else:
-                    self.idata = prior_pred
+        if self.model is not None:
+            with self.model:  # sample with new input data
+                prior_pred: az.InferenceData = pm.sample_prior_predictive(samples)
+                self.set_idata_attrs(prior_pred)
+                if extend_idata:
+                    if self.idata is not None:
+                        self.idata.extend(prior_pred)
+                    else:
+                        self.idata = prior_pred
 
         prior_predictive_samples = az.extract(prior_pred, "prior_predictive", combined=combined)
 
@@ -585,7 +591,10 @@ def get_params(self, deep=True):
         """
         Get all the model parameters needed to instantiate a copy of the model, not including training data.
         """
-        return {"model_config": self.model_config, "sampler_config": self.sampler_config}
+        return {
+            "model_config": self.model_config,
+            "sampler_config": self.sampler_config,
+        }
 
     def set_params(self, **params):
         """
@@ -639,8 +648,6 @@ def predict_posterior(
         y_pred : DataArray, shape (n_pred, chains * draws) if combined is True, otherwise (chains, draws, n_pred)
             Posterior predictive samples for each input X_pred
         """
-        if not hasattr(self, "output_var"):
-            raise NotImplementedError(f"Subclasses of {__class__} should set self.output_var")
 
         X_pred = self._validate_data(X_pred)
         posterior_predictive_samples = self.sample_posterior_predictive(
diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py
index eb507cbd..e323aaea 100644
--- a/pymc_experimental/tests/test_model_builder.py
+++ b/pymc_experimental/tests/test_model_builder.py
@@ -29,15 +29,15 @@ class test_ModelBuilder(ModelBuilder):
     _model_type = "LinearModel"
     version = "0.1"
 
-    def build_model(self, model_data=None, model_config=None):
+    def build_model(self, data=None, model_config=None):
 
         with pm.Model() as self.model:
-            if model_data is None:
-                model_data = test_ModelBuilder.generate_model_data()
+            if data is None:
+                data = test_ModelBuilder.generate_model_data()
             if model_config is None:
                 model_config = self.default_model_config
-            x = pm.MutableData("x", model_data["input"].values)
-            y_data = pm.MutableData("y_data", model_data["output"].values)
+            x = pm.MutableData("x", data["input"].values)
+            y_data = pm.MutableData("y_data", data["output"].values)
 
             # prior parameters
             a_loc = model_config["a"]["loc"]

From 52354ff972582e46fec05941619884121d6f9220 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Fri, 19 May 2023 15:19:27 +0100
Subject: [PATCH 10/11] fixing incorrect if clause

---
 pymc_experimental/linearmodel.py   | 2 +-
 pymc_experimental/model_builder.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pymc_experimental/linearmodel.py b/pymc_experimental/linearmodel.py
index 8431c574..995eeb33 100644
--- a/pymc_experimental/linearmodel.py
+++ b/pymc_experimental/linearmodel.py
@@ -30,7 +30,7 @@ def default_sampler_config(self):
             "target_accept": 0.95,
         }
 
-    def build_model(self):
+    def build_model(self, data=None):
         """
         Build the PyMC model.
 
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index e878ad32..2789ac1e 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -454,7 +454,7 @@ def fit(
 
         self.idata = self.sample_model(**sampler_config)
         if type(X) is np.ndarray:
-            if predictor_names is not None:
+            if len(predictor_names) > 0:
                 X = pd.DataFrame(X, columns=predictor_names)
             else:
                 X = pd.DataFrame(X, columns=[f"predictor{x}" for x in range(1, X.shape[1] + 1)])

From d8121384ae7a08792f7a98bdc214f6f77f7ea987 Mon Sep 17 00:00:00 2001
From: Michal Raczycki <michalr265@gmail.com>
Date: Fri, 26 May 2023 10:01:37 +0100
Subject: [PATCH 11/11] replacing dangerous default value

---
 .pylintrc                          |  1 +
 pymc_experimental/model_builder.py | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index a4593ef8..952050ed 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -46,6 +46,7 @@ enable=import-self,
        used-before-assignment,
        cell-var-from-loop,
        global-variable-undefined,
+       dangerous-default-value,
        # redefined-builtin,
        redefine-in-handler,
        unused-import,
diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py
index 2789ac1e..d31bb633 100644
--- a/pymc_experimental/model_builder.py
+++ b/pymc_experimental/model_builder.py
@@ -51,8 +51,8 @@ class ModelBuilder:
     def __init__(
         self,
         data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
-        model_config: Dict = {},
-        sampler_config: Dict = {},
+        model_config: Dict = None,
+        sampler_config: Dict = None,
     ):
         """
         Initializes model configuration and sampler configuration for the model
@@ -71,12 +71,10 @@ def __init__(
         >>>     ...
         >>> model = MyModel(model_config, sampler_config)
         """
-
-        if not sampler_config:
-            sampler_config = self.default_sampler_config
+        sampler_config = self.default_sampler_config if sampler_config is None else sampler_config
         self.sampler_config = sampler_config
-        if not model_config:
-            model_config = self.default_model_config
+        model_config = self.default_model_config if model_config is None else model_config
+
         self.model_config = model_config  # parameters for priors etc.
         self.model = None  # Set by build_model
         self.output_var = ""  # Set by build_model
@@ -209,8 +207,8 @@ def generate_model_data(
     @abstractmethod
     def build_model(
         self,
-        data: Union[np.ndarray, pd.DataFrame, pd.Series] = {},
-        model_config: Dict = {},
+        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
+        model_config: Dict = None,
         **kwargs,
     ) -> None:
         """
@@ -405,7 +403,7 @@ def fit(
         X: Union[np.ndarray, pd.DataFrame, pd.Series],
         y: Union[np.ndarray, pd.Series],
         progressbar: bool = True,
-        predictor_names: List[str] = [],
+        predictor_names: List[str] = None,
         random_seed: RandomState = None,
         **kwargs: Any,
     ) -> az.InferenceData:
@@ -441,6 +439,8 @@ def fit(
         Auto-assigning NUTS sampler...
         Initializing NUTS using jitter+adapt_diag...
         """
+        if predictor_names is None:
+            predictor_names = []
 
         X, y = X, y