
Commit 72c50f4

refactor scalings (#2107)
* reset commits, fix the problem
* pretty docs
* add test for scaling argument
1 parent 56a5f1a commit 72c50f4

6 files changed: +86 −63 lines changed

pymc3/tests/test_variational_inference.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def cb(*_):
                 data_t.set_value(next(minibatches))
             mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0)
             Normal('x', mu=mu_, sd=sd, observed=data_t, total_size=n)
-            inf = self.inference()
+            inf = self.inference(scale_cost_to_minibatch=True)
             approx = inf.fit(self.NITER * 3, callbacks=
                              [cb, pm.callbacks.CheckParametersConvergence()],
                              obj_n_mc=10, obj_optimizer=self.optimizer)
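
Note: the updated test exercises the new keyword on a minibatch model. Roughly, `total_size=n` tells PyMC3 to up-weight a minibatch's log-likelihood so the cost estimates the full-dataset objective, and `scale_cost_to_minibatch=True` rescales that cost back down to the minibatch level. A sketch of the arithmetic with made-up numbers (intuition only, not PyMC3 internals):

    # dataset of n points seen through minibatches of m points
    n, m = 1000, 100
    batch_loglik = -140.0                    # hypothetical minibatch log-likelihood
    full_scale = (n / m) * batch_loglik      # default: cost on the full-data scale
    minibatch_scale = full_scale / (n / m)   # scale_cost_to_minibatch=True
    assert minibatch_scale == batch_loglik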

pymc3/theanof.py

Lines changed: 0 additions & 9 deletions
@@ -23,7 +23,6 @@
     'join_nonshared_inputs',
     'make_shared_replacements',
     'generator',
-    'GradScale',
     'set_tt_rng',
     'tt_rng']

@@ -417,13 +416,5 @@ def set_tt_rng(new_rng):
     launch_rng(_tt_rng)


-class GradScale(theano.compile.ViewOp):
-    def __init__(self, multiplier):
-        self.multiplier = multiplier
-
-    def grad(self, args, g_outs):
-        return [self.multiplier * g_out for g_out in g_outs]
-
-
 def floatX_array(x):
     return floatX(np.array(x))
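
For context, the removed `GradScale` op is an identity on the forward pass that multiplies gradients on the backward pass — the kind of op used for `cost_part_grad_scale`-style gradient scaling. A minimal sketch of how it behaves; the class body is copied from the deleted code, while the surrounding usage is illustrative:

    import numpy as np
    import theano
    import theano.tensor as tt

    class GradScale(theano.compile.ViewOp):
        def __init__(self, multiplier):
            self.multiplier = multiplier

        def grad(self, args, g_outs):
            return [self.multiplier * g_out for g_out in g_outs]

    x = tt.dvector('x')
    scale = theano.shared(0.5)             # e.g. annealed from 1 toward 0 during training
    y = (GradScale(scale)(x) ** 2).sum()   # forward value identical to (x ** 2).sum()
    f = theano.function([x], tt.grad(y, x))
    print(f(np.ones(3)))                   # [ 1.  1.  1.] rather than [ 2.  2.  2.]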

pymc3/util.py

Lines changed: 17 additions & 9 deletions
@@ -6,11 +6,13 @@ def get_transformed_name(name, transform):
     ----------
     name : str
         Name to transform
-    transform : object
+    transform : transforms.Transform
         Should be a subclass of `transforms.Transform`

-    Returns:
-        A string to use for the transformed variable
+    Returns
+    -------
+    str
+        A string to use for the transformed variable
     """
     return "{}_{}__".format(name, transform.name)

@@ -24,8 +26,10 @@ def is_transformed_name(name):
     name : str
         Name to check

-    Returns:
-        Boolean, whether the string could have been produced by `get_transormed_name`
+    Returns
+    -------
+    bool
+        Boolean, whether the string could have been produced by `get_transormed_name`
     """
     return name.endswith('__') and name.count('_') >= 3

@@ -39,8 +43,10 @@ def get_untransformed_name(name):
     name : str
         Name to untransform

-    Returns:
-        String with untransformed version of the name.
+    Returns
+    -------
+    str
+        String with untransformed version of the name.
     """
     if not is_transformed_name(name):
         raise ValueError(u'{} does not appear to be a transformed name'.format(name))
@@ -57,8 +63,10 @@ def get_default_varnames(var_iterator, include_transformed):
     include_transformed : boolean
         Should transformed variable names be included in return value

-    Returns:
-        List of variables, possibly filtered
+    Returns
+    -------
+    list
+        List of variables, possibly filtered
     """
     if include_transformed:
         return list(var_iterator)
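
The helpers touched here encode a simple convention: a transformed variable is named `<name>_<transform.name>__`. A quick illustration, assuming the standard `transforms.log` instance (which is not part of this diff):

    from pymc3.distributions import transforms
    from pymc3.util import (get_transformed_name, is_transformed_name,
                            get_untransformed_name)

    name = get_transformed_name('x', transforms.log)   # -> 'x_log__'
    assert is_transformed_name(name)                   # ends in '__' with >= 3 underscores
    assert get_untransformed_name(name) == 'x'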

pymc3/variational/approximations.py

Lines changed: 27 additions & 24 deletions
@@ -28,25 +28,23 @@ class MeanField(Approximation):
         mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
-
     model : PyMC3 model for inference
-
     start : Point
         initial mean
-
     cost_part_grad_scale : float or scalar tensor
         Scaling score part of gradient can be useful near optimum for
         archiving better convergence properties. Common schedule is
         1 at the start and 0 in the end. So slow decay will be ok.
         See (Sticking the Landing; Geoffrey Roeder,
         Yuhuai Wu, David Duvenaud, 2016) for details
-
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     seed : None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one

     References
-    ----------
+    ----------
     Geoffrey Roeder, Yuhuai Wu, David Duvenaud, 2016
     Sticking the Landing: A Simple Reduced-Variance Gradient for ADVI
     approximateinference.org/accepted/RoederEtAl2016.pdf
@@ -109,19 +107,17 @@ class FullRank(Approximation):
         mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
-
     model : PyMC3 model for inference
-
     start : Point
         initial mean
-
     cost_part_grad_scale : float or scalar tensor
         Scaling score part of gradient can be useful near optimum for
         archiving better convergence properties. Common schedule is
         1 at the start and 0 in the end. So slow decay will be ok.
         See (Sticking the Landing; Geoffrey Roeder,
-        Yuhuai Wu, David Duvenaud, 2016) for details
-
+        Yuhuai Wu, David Duvenaud, 2016) for details
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     seed : None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
@@ -133,12 +129,13 @@ class FullRank(Approximation):
     approximateinference.org/accepted/RoederEtAl2016.pdf
     """
     def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
+                 scale_cost_to_minibatch=False,
                  gpu_compat=False, seed=None, **kwargs):
         super(FullRank, self).__init__(
             local_rv=local_rv, model=model,
             cost_part_grad_scale=cost_part_grad_scale,
-            seed=seed,
-            **kwargs
+            scale_cost_to_minibatch=scale_cost_to_minibatch,
+            seed=seed, **kwargs
         )
         self.gpu_compat = gpu_compat

@@ -213,7 +210,7 @@ def from_mean_field(cls, mean_field, gpu_compat=False):
         """Construct FullRank from MeanField approximation

         Parameters
-        ----------
+        ----------
         mean_field : MeanField
             approximation to start with

@@ -256,9 +253,9 @@ class Empirical(Approximation):
         mapping {model_variable -> local_variable (:math:`\\mu`, :math:`\\rho`)}
         Local Vars are used for Autoencoding Variational Bayes
         See (AEVB; Kingma and Welling, 2014) for details
-
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     model : PyMC3 model
-
     seed : None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
@@ -270,11 +267,12 @@ class Empirical(Approximation):
     ...     trace = sample(1000, step=step)
    ...     histogram = Empirical(trace[100:])
     """
-    def __init__(self, trace, local_rv=None, model=None, seed=None, **kwargs):
+    def __init__(self, trace, local_rv=None,
+                 scale_cost_to_minibatch=False,
+                 model=None, seed=None, **kwargs):
         super(Empirical, self).__init__(
-            local_rv=local_rv, model=model, trace=trace, seed=seed,
-            **kwargs
-        )
+            local_rv=local_rv, scale_cost_to_minibatch=scale_cost_to_minibatch,
+            model=model, trace=trace, seed=seed, **kwargs)

     def check_model(self, model, **kwargs):
         trace = kwargs.get('trace')
@@ -352,7 +350,8 @@ def cov(self):
         return x.T.dot(x) / self.histogram.shape[0]

     @classmethod
-    def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None, seed=None):
+    def from_noise(cls, size, jitter=.01, local_rv=None,
+                   start=None, model=None, seed=None, **kwargs):
         """Initialize Histogram with random noise

         Parameters
@@ -366,12 +365,16 @@ def from_noise(cls, size, jitter=.01, local_rv=None, start=None, model=None, see
         start : initial point
         model : pm.Model
             PyMC3 Model
+        seed : None or int
+            leave None to use package global RandomStream or other
+            valid value to create instance specific one
+        kwargs : other kwargs passed to init

         Returns
-        -------
+        -------
         Empirical
         """
-        hist = cls(None, local_rv=local_rv, model=model, seed=seed)
+        hist = cls(None, local_rv=local_rv, model=model, seed=seed, **kwargs)
         if start is None:
             start = hist.model.test_point
         else:
@@ -390,15 +393,15 @@ def sample_approx(approx, draws=100, include_transformed=True):
     """Draw samples from variational posterior.

     Parameters
-    ----------
+    ----------
     approx : Approximation
     draws : int
         Number of random samples.
     include_transformed : bool
         If True, transformed variables are also sampled. Default is True.

     Returns
-    -------
+    -------
     trace : pymc3.backends.base.MultiTrace
         Samples drawn from variational posterior.
     """

pymc3/variational/inference.py

Lines changed: 23 additions & 7 deletions
@@ -304,6 +304,8 @@ class ADVI(Inference):
         1 at the start and 0 in the end. So slow decay will be ok.
         See (Sticking the Landing; Geoffrey Roeder,
         Yuhuai Wu, David Duvenaud, 2016) for details
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     seed : None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
@@ -323,11 +325,15 @@ class ADVI(Inference):
     - Kingma, D. P., & Welling, M. (2014).
       Auto-Encoding Variational Bayes. stat, 1050, 1.
     """
-    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
+    def __init__(self, local_rv=None, model=None,
+                 cost_part_grad_scale=1,
+                 scale_cost_to_minibatch=False,
                  seed=None, start=None):
         super(ADVI, self).__init__(
             KL, MeanField, None,
-            local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale,
+            local_rv=local_rv, model=model,
+            cost_part_grad_scale=cost_part_grad_scale,
+            scale_cost_to_minibatch=scale_cost_to_minibatch,
             seed=seed, start=start)

     @classmethod
@@ -372,7 +378,8 @@ class FullRankADVI(Inference):
         1 at the start and 0 in the end. So slow decay will be ok.
         See (Sticking the Landing; Geoffrey Roeder,
         Yuhuai Wu, David Duvenaud, 2016) for details
-
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     seed : None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
@@ -392,11 +399,15 @@ class FullRankADVI(Inference):
     - Kingma, D. P., & Welling, M. (2014).
       Auto-Encoding Variational Bayes. stat, 1050, 1.
     """
-    def __init__(self, local_rv=None, model=None, cost_part_grad_scale=1,
+    def __init__(self, local_rv=None, model=None,
+                 cost_part_grad_scale=1,
+                 scale_cost_to_minibatch=False,
                  gpu_compat=False, seed=None, start=None):
         super(FullRankADVI, self).__init__(
             KL, FullRank, None,
-            local_rv=local_rv, model=model, cost_part_grad_scale=cost_part_grad_scale,
+            local_rv=local_rv, model=model,
+            cost_part_grad_scale=cost_part_grad_scale,
+            scale_cost_to_minibatch=scale_cost_to_minibatch,
             gpu_compat=gpu_compat, seed=seed, start=start)

     @classmethod
@@ -497,6 +508,8 @@ class SVGD(Inference):
     model : pm.Model
     kernel : callable
         kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
+    scale_cost_to_minibatch : bool, default False
+        Scale cost to minibatch instead of full dataset
     start : dict
         initial point for inference
     histogram : Empirical
@@ -514,10 +527,13 @@ class SVGD(Inference):
         arXiv:1608.04471
     """
     def __init__(self, n_particles=100, jitter=.01, model=None, kernel=test_functions.rbf,
-                 start=None, histogram=None, seed=None, local_rv=None):
+                 scale_cost_to_minibatch=False, start=None, histogram=None,
+                 seed=None, local_rv=None):
         if histogram is None:
             histogram = Empirical.from_noise(
-                n_particles, jitter=jitter, start=start, model=model, local_rv=local_rv, seed=seed)
+                n_particles, jitter=jitter,
+                scale_cost_to_minibatch=scale_cost_to_minibatch,
+                start=start, model=model, local_rv=local_rv, seed=seed)
         super(SVGD, self).__init__(
             KSD, histogram,
             kernel,
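
Taken together, ADVI, FullRankADVI, and SVGD now accept the flag uniformly and pass it through to their approximations. A usage sketch mirroring the new test; the synthetic data and iteration count are made up:

    import numpy as np
    import theano
    import pymc3 as pm
    from pymc3.variational.inference import ADVI

    n = 1000
    data = np.random.randn(n)
    batch = theano.shared(data[:100])   # swap in new minibatches via a callback

    with pm.Model():
        mu = pm.Normal('mu', mu=0, sd=1)
        pm.Normal('x', mu=mu, sd=1, observed=batch, total_size=n)
        inference = ADVI(scale_cost_to_minibatch=True)  # keyword added by this commit
        approx = inference.fit(1000)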
