Add start_sigma to ADVI

markusschmaus · markusschmaus · commit e9d7757f83d4 · 2022-09-02T20:02:14.000+02:00
diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
@@ -67,17 +67,65 @@ def std(self):
     def __init_group__(self, group):
         super().__init_group__(group)
         if not self._check_user_params():
-            self.shared_params = self.create_shared_params(self._kwargs.get("start", None))
+            self.shared_params = self.create_shared_params(
+                self._kwargs.get("start", None), self._kwargs.get("start_sigma", None)
+            )
         self._finalize_init()
 
-    def create_shared_params(self, start=None):
+    def create_shared_params(self, start=None, start_sigma=None):
+        """Create the shared ``mu`` and ``rho`` parameters of the approximation.
+
+        Parameters
+        ----------
+        start: optional
+            starting point, passed to ``_prepare_start`` to obtain the initial ``mu``
+        start_sigma: `dict[str, np.ndarray]`, optional
+            starting standard deviations, passed to ``_prepare_start_sigma`` to obtain the
+            initial ``rho``
+
+        Returns
+        -------
+        dict
+            shared variables stored under the keys ``"mu"`` and ``"rho"``
+        """
+        # NOTE: `Group._prepare_start` uses `self.model.free_RVs` to identify the free variables
+        # and `DictToArrayBijection` to flatten them, while `Approximation.rslice` assumes that the
+        # free variables are given by `self.group` and that `self.ordering` maps each variable to
+        # its slice of the flat array. The two layouts agreed in every case checked so far, but
+        # edge cases or future code changes may break this assumption.
         start = self._prepare_start(start)
-        rho = np.zeros((self.ddim,))
+        rho = self._prepare_start_sigma(start_sigma)
         return {
             "mu": aesara.shared(pm.floatX(start), "mu"),
             "rho": aesara.shared(pm.floatX(rho), "rho"),
         }
 
+    def _prepare_start_sigma(self, start_sigma):
+        """Convert per-variable starting standard deviations into a flat ``rho`` array.
+
+        Parameters
+        ----------
+        start_sigma: `dict[str, np.ndarray]` or None
+            starting standard deviations keyed by the names stored in ``self.ordering``;
+            names without an entry (or mapped to None) keep ``rho = 0``
+
+        Returns
+        -------
+        np.ndarray
+            array of shape ``(self.ddim,)`` holding ``log(expm1(|sigma|))``, the inverse
+            softplus of the requested standard deviation, in each variable's slice
+        """
+        rho = np.zeros((self.ddim,))
+        if start_sigma is not None:
+            # Iterate the values: `.items()` would bind the dict key to `name` and the whole
+            # entry to `slice_`. Assumes entries are tuples starting with (name, slice) - confirm.
+            for name, slice_, *_ in self.ordering.values():
+                sigma = start_sigma.get(name)
+                if sigma is not None:
+                    # expm1 keeps precision for small sigma, where exp(sigma) - 1 rounds to 0
+                    rho[slice_] = np.log(np.expm1(np.abs(sigma)))
+        return rho
+
     @node_property
     def symbolic_random(self):
         initial = self.symbolic_initial
diff --git a/pymc/variational/inference.py b/pymc/variational/inference.py
@@ -433,8 +433,10 @@ class ADVI(KLqp):
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     References
     ----------
@@ -660,6 +662,7 @@ def fit(
     model=None,
     random_seed=None,
     start=None,
+    start_sigma=None,
     inf_kwargs=None,
     **kwargs,
 ):
@@ -684,8 +687,10 @@ def fit(
         valid value to create instance specific one
     inf_kwargs: dict
         additional kwargs passed to :class:`Inference`
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     Other Parameters
     ----------------
@@ -728,6 +733,10 @@ def fit(
         inf_kwargs["random_seed"] = random_seed
     if start is not None:
         inf_kwargs["start"] = start
+    if start_sigma is not None:
+        if method != "advi":
+            raise NotImplementedError("start_sigma is only available for method advi")
+        inf_kwargs["start_sigma"] = start_sigma
     if model is None:
         model = pm.modelcontext(model)
     _select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD)