From 455de0586742a72bc95c078ff4eb69a198cc86ed Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Sun, 16 Apr 2023 17:54:19 +0100 Subject: [PATCH 1/6] adaptations to integrate with mmm --- pymc_experimental/model_builder.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index 3f48f3f9..c5ba5bcd 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -23,6 +23,8 @@ import numpy as np import pandas as pd import pymc as pm +from pymc.util import RandomState +from traitlets import Any class ModelBuilder: @@ -191,7 +193,7 @@ def load(cls, fname: str): data=idata.fit_data.to_dataframe(), ) model_builder.idata = idata - model_builder.build() + model_builder.build_model() if model_builder.id != idata.attrs["id"]: raise ValueError( f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" @@ -200,7 +202,12 @@ def load(cls, fname: str): return model_builder def fit( - self, data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None + self, + progressbar: bool = True, + random_seed: RandomState = None, + data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]] = None, + *args: Any, + **kwargs: Any, ) -> az.InferenceData: """ Fit a model using the data passed as a parameter. 
@@ -227,9 +234,9 @@ def fit( # If a new data was provided, assign it to the model if data is not None: self.data = data - - self.build() - self._data_setter(data) + self.model_data, self.model_config = self.create_sample_input(data=self.data) + self.build_model(self.model_data, self.model_config) + self._data_setter(self.data) with self.model: self.idata = pm.sample(**self.sampler_config) @@ -240,7 +247,7 @@ def fit( self.idata.attrs["model_type"] = self._model_type self.idata.attrs["version"] = self.version self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config) - self.idata.attrs["model_config"] = json.dumps(self.model_config) + self.idata.attrs["model_config"] = json.dumps(self.serializable_model_config) self.idata.add_groups(fit_data=self.data.to_xarray()) return self.idata From e62f9240c034b1261a009c5d1a1f298b3ea9d60a Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Mon, 17 Apr 2023 08:48:54 +0100 Subject: [PATCH 2/6] adapted model_config and descriptions --- pymc_experimental/model_builder.py | 52 +++++++++++++++---- pymc_experimental/tests/test_model_builder.py | 33 ++++++------ 2 files changed, 58 insertions(+), 27 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index c5ba5bcd..84a9ea8e 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -102,7 +102,7 @@ def _data_setter( @abstractmethod def create_sample_input(): """ - Needs to be implemented by the user in the inherited class. + Needs to be implemented by the user in the child class. Returns examples for data, model_config, sampler_config. This is useful for understanding the required data structures for the user model. 
@@ -116,12 +116,15 @@ def create_sample_input(): >>> data = pd.DataFrame({'input': x, 'output': y}) >>> model_config = { - >>> 'a_loc': 7, - >>> 'a_scale': 3, - >>> 'b_loc': 5, - >>> 'b_scale': 3, - >>> 'obs_error': 2, - >>> } + >>> 'a' : { + >>> 'a_loc': 7, + >>> 'a_scale' : 3 + >>> }, + >>> 'b' : { + >>> 'b_loc': 3, + >>> 'b_scale': 5 + >>> } + >>> 'obs_error': 2 >>> sampler_config = { >>> 'draws': 1_000, @@ -134,6 +137,31 @@ def create_sample_input(): raise NotImplementedError + @abstractmethod + def build_model( + model_data: Dict[str, Union[np.ndarray, pd.DataFrame, pd.Series]], + model_config: Dict[str, Union[int, float, Dict]], + ) -> None: + """ + Needs to be implemented by the user in the child class. + Creates an instance of pm.Model based on provided model_data and model_config, and + attaches it to self. + + Required Parameters + ---------- + model_data - preformatted data that is going to be used in the model. + For efficiency reasons it should contain only the necessary data columns, not entire available + dataset since it's going to be encoded into data used to recreate the model. + model_config - dictionary where keys are strings representing names of parameters of the model, values are + dictionaries of parameters needed for creating model parameters (see example in create_sample_input) + + Returns: + ---------- + None + + """ + raise NotImplementedError + def save(self, fname: str) -> None: """ Saves inference data of the model. 
@@ -193,7 +221,7 @@ def load(cls, fname: str): data=idata.fit_data.to_dataframe(), ) model_builder.idata = idata - model_builder.build_model() + model_builder.idata = model_builder.fit() if model_builder.id != idata.attrs["id"]: raise ValueError( f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" @@ -234,9 +262,11 @@ def fit( # If a new data was provided, assign it to the model if data is not None: self.data = data - self.model_data, self.model_config = self.create_sample_input(data=self.data) + self.model_data, self.model_config, self.sampler_config = self.create_sample_input( + data=self.data + ) self.build_model(self.model_data, self.model_config) - self._data_setter(self.data) + self._data_setter(self.model_data) with self.model: self.idata = pm.sample(**self.sampler_config) @@ -248,7 +278,7 @@ def fit( self.idata.attrs["version"] = self.version self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config) self.idata.attrs["model_config"] = json.dumps(self.serializable_model_config) - self.idata.add_groups(fit_data=self.data.to_xarray()) + self.idata.add_groups(fit_data=self.model_data.to_xarray()) return self.idata def predict( diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py index 20845fee..e17e9d1c 100644 --- a/pymc_experimental/tests/test_model_builder.py +++ b/pymc_experimental/tests/test_model_builder.py @@ -28,19 +28,18 @@ class test_ModelBuilder(ModelBuilder): _model_type = "LinearModel" version = "0.1" - def build(self): - + def build_model(self, model_data, model_config): with pm.Model() as self.model: - if self.data is not None: - x = pm.MutableData("x", self.data["input"].values) - y_data = pm.MutableData("y_data", self.data["output"].values) + if model_data is not None: + x = pm.MutableData("x", model_data["input"].values) + y_data = pm.MutableData("y_data", model_data["output"].values) # prior parameters - a_loc = 
self.model_config["a_loc"] - a_scale = self.model_config["a_scale"] - b_loc = self.model_config["b_loc"] - b_scale = self.model_config["b_scale"] - obs_error = self.model_config["obs_error"] + a_loc = model_config["a"]["loc"] + a_scale = model_config["a"]["scale"] + b_loc = model_config["b"]["loc"] + b_scale = model_config["b"]["scale"] + obs_error = model_config["obs_error"] # priors a = pm.Normal("a", a_loc, sigma=a_scale) @@ -48,7 +47,7 @@ def build(self): obs_error = pm.HalfNormal("σ_model_fmc", obs_error) # observed data - if self.data is not None: + if model_data is not None: y_model = pm.Normal("y_model", a + b * x, obs_error, shape=x.shape, observed=y_data) def _data_setter(self, data: pd.DataFrame): @@ -57,18 +56,20 @@ def _data_setter(self, data: pd.DataFrame): if "output" in data.columns: pm.set_data({"y_data": data["output"].values}) + @property + def serializable_model_config(self): + return self.model_config + @classmethod - def create_sample_input(self): + def create_sample_input(self, data=None): x = np.linspace(start=0, stop=1, num=100) y = 5 * x + 3 y = y + np.random.normal(0, 1, len(x)) data = pd.DataFrame({"input": x, "output": y}) model_config = { - "a_loc": 0, - "a_scale": 10, - "b_loc": 0, - "b_scale": 10, + "a": {"loc": 0, "scale": 10}, + "b": {"loc": 0, "scale": 10}, "obs_error": 2, } From 6f4bb259dafa258c715cd17b4e3367615621326f Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Mon, 17 Apr 2023 08:59:24 +0100 Subject: [PATCH 3/6] fixed ModuleNotFoundError from build --- pymc_experimental/model_builder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index 84a9ea8e..e6ca59bb 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -17,14 +17,13 @@ import json from abc import abstractmethod from pathlib import Path -from typing import Dict, Union +from typing import Any, Dict, Union import arviz as az import 
numpy as np import pandas as pd import pymc as pm from pymc.util import RandomState -from traitlets import Any class ModelBuilder: From 4c9987755a4c8b48d1428752f388437376d1f206 Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Mon, 17 Apr 2023 11:41:00 +0100 Subject: [PATCH 4/6] small tweaks to make mmm tests work smoother --- pymc_experimental/model_builder.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index e6ca59bb..6021443a 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -266,9 +266,8 @@ def fit( ) self.build_model(self.model_data, self.model_config) self._data_setter(self.model_data) - with self.model: - self.idata = pm.sample(**self.sampler_config) + self.idata = pm.sample(**self.sampler_config, **kwargs) self.idata.extend(pm.sample_prior_predictive()) self.idata.extend(pm.sample_posterior_predictive(self.idata)) @@ -277,7 +276,7 @@ def fit( self.idata.attrs["version"] = self.version self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config) self.idata.attrs["model_config"] = json.dumps(self.serializable_model_config) - self.idata.add_groups(fit_data=self.model_data.to_xarray()) + self.idata.add_groups(fit_data=self.data.to_xarray()) return self.idata def predict( From e89724c0ea2ae1617971388fffe25ad0742b06fd Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Wed, 19 Apr 2023 12:17:24 +0100 Subject: [PATCH 5/6] new test for save, fit allows custom configs --- pymc_experimental/model_builder.py | 34 ++++++++++++++----- pymc_experimental/tests/test_model_builder.py | 15 ++++++-- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index 6021443a..da28cc70 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -180,9 +180,11 @@ def save(self, fname: str) -> None: >>> name = './mymodel.nc' 
>>> model.save(name) """ - - file = Path(str(fname)) - self.idata.to_netcdf(file) + if self.idata is not None and "fit_data" in self.idata: + file = Path(str(fname)) + self.idata.to_netcdf(file) + else: + raise RuntimeError("The model hasn't been fit yet, call .fit() first") @classmethod def load(cls, fname: str): @@ -220,7 +222,7 @@ def load(cls, fname: str): data=idata.fit_data.to_dataframe(), ) model_builder.idata = idata - model_builder.idata = model_builder.fit() + model_builder.build_model(model_builder.data, model_builder.model_config) if model_builder.id != idata.attrs["id"]: raise ValueError( f"The file '{fname}' does not contain an inference data of the same model or configuration as '{cls._model_type}'" @@ -261,11 +263,12 @@ def fit( # If a new data was provided, assign it to the model if data is not None: self.data = data - self.model_data, self.model_config, self.sampler_config = self.create_sample_input( - data=self.data - ) + self.model_data, model_config, sampler_config = self.create_sample_input(data=self.data) + if self.model_config is None: + self.model_config = model_config + if self.sampler_config is None: + self.sampler_config = sampler_config self.build_model(self.model_data, self.model_config) - self._data_setter(self.model_data) with self.model: self.idata = pm.sample(**self.sampler_config, **kwargs) self.idata.extend(pm.sample_prior_predictive()) @@ -275,7 +278,7 @@ def fit( self.idata.attrs["model_type"] = self._model_type self.idata.attrs["version"] = self.version self.idata.attrs["sampler_config"] = json.dumps(self.sampler_config) - self.idata.attrs["model_config"] = json.dumps(self.serializable_model_config) + self.idata.attrs["model_config"] = json.dumps(self._serializable_model_config) self.idata.add_groups(fit_data=self.data.to_xarray()) return self.idata @@ -386,6 +389,19 @@ def _extract_samples(post_pred: az.data.inference_data.InferenceData) -> Dict[st return post_pred_dict + @property + @abstractmethod + def 
_serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]: + """ + Converts non-serializable values from model_config to their serializable reversible equivalent. + Data types like pandas DataFrame, Series or datetime aren't JSON serializable, + so in order to save the model they need to be formatted. + + Returns + ------- + model_config: dict + """ + @property def id(self) -> str: """ diff --git a/pymc_experimental/tests/test_model_builder.py b/pymc_experimental/tests/test_model_builder.py index e17e9d1c..5ff32846 100644 --- a/pymc_experimental/tests/test_model_builder.py +++ b/pymc_experimental/tests/test_model_builder.py @@ -57,7 +57,7 @@ def _data_setter(self, data: pd.DataFrame): pm.set_data({"y_data": data["output"].values}) @property - def serializable_model_config(self): + def _serializable_model_config(self): return self.model_config @classmethod @@ -95,7 +95,16 @@ def initial_build_and_fit(check_idata=True) -> ModelBuilder: return model_builder -def test_empty_model_config(): +def test_save_without_fit_raises_runtime_error(): + data, model_config, sampler_config = test_ModelBuilder.create_sample_input() + model_builder = test_ModelBuilder( + model_config=model_config, sampler_config=sampler_config, data=data + ) + with pytest.raises(RuntimeError): + model_builder.save("saved_model") + + +def test_empty_sampler_config_fit(): data, model_config, sampler_config = test_ModelBuilder.create_sample_input() sampler_config = {} model_builder = test_ModelBuilder( @@ -106,7 +115,7 @@ def test_empty_model_config(): assert "posterior" in model_builder.idata.groups() -def test_empty_model_config(): +def test_empty_model_config_fit(): data, model_config, sampler_config = test_ModelBuilder.create_sample_input() model_config = {} model_builder = test_ModelBuilder( From 947a67de5a1bfb0d92419be8d0e6087bd3d0617b Mon Sep 17 00:00:00 2001 From: Michal Raczycki Date: Wed, 19 Apr 2023 13:01:16 +0100 Subject: [PATCH 6/6] updating create_sample_input example --- 
pymc_experimental/model_builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pymc_experimental/model_builder.py b/pymc_experimental/model_builder.py index da28cc70..79562f9e 100644 --- a/pymc_experimental/model_builder.py +++ b/pymc_experimental/model_builder.py @@ -116,12 +116,12 @@ def create_sample_input(): >>> model_config = { >>> 'a' : { - >>> 'a_loc': 7, - >>> 'a_scale' : 3 + >>> 'loc': 7, + >>> 'scale' : 3 >>> }, >>> 'b' : { - >>> 'b_loc': 3, - >>> 'b_scale': 5 + >>> 'loc': 3, + >>> 'scale': 5 >>> } >>> 'obs_error': 2