
Commit 642fb7f

Merge pull request #685 from pymc-devs/refactor_glm
Refactor glm
2 parents 284e869 + 12dd4f0 commit 642fb7f

7 files changed: +51, -139 lines

pymc3/examples/glm_linear.py

Lines changed: 4 additions & 11 deletions
@@ -1,13 +1,7 @@
-from __future__ import print_function
-
-import numpy as np
 import sys

-try:
-    import statsmodels.api as sm
-except ImportError:
-    print("Example requires statsmodels")
-    sys.exit(0)
+import numpy as np
+import scipy.optimize as opt

 from pymc3 import *


@@ -31,13 +25,12 @@ def run(n=2000):
     import matplotlib.pyplot as plt

     with model:
-        trace = sample(n, Slice())
+        start = find_MAP(fmin=opt.fmin_powell)
+        trace = sample(n, Slice(), start=start)

     plt.plot(x, y, 'x')
     glm.plot_posterior_predictive(trace)
     # plt.show()

 if __name__ == '__main__':
     run()
-
-
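Taken together, the example now initializes sampling from a MAP estimate found with SciPy's Powell optimizer instead of a statsmodels fit. A minimal sketch of the resulting pattern, with toy data generated here purely for illustration (the model setup mirrors glm_linear.py at this commit):

import numpy as np
import scipy.optimize as opt
from pymc3 import *

# Hypothetical toy data: y = 1 + 2*x plus Gaussian noise.
x = np.linspace(0, 1, 100)
y = 1 + 2 * x + np.random.normal(scale=0.1, size=len(x))
data = dict(x=x, y=y)

with Model() as model:
    glm.glm('y ~ x', data)                      # builds intercept/slope coefficients and the likelihood
    start = find_MAP(fmin=opt.fmin_powell)      # replaces the old statsmodels-based initialization
    trace = sample(2000, Slice(), start=start)  # Slice sampling started from the MAP estimate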

pymc3/examples/glm_robust.py

Lines changed: 3 additions & 13 deletions
@@ -1,14 +1,6 @@
-from __future__ import print_function
-
 import numpy as np
 import sys

-try:
-    import statsmodels.api as sm
-except ImportError:
-    print("Example requires statsmodels")
-    sys.exit(0)
-
 from pymc3 import *

 # Generate data
@@ -26,9 +18,9 @@
 data_outlier = dict(x=x, y=y)

 with Model() as model:
-    family = glm.families.T(link=glm.links.Identity,
-                            priors={'nu': 1.5,
-                                    'lam': ('sigma', Uniform.dist(0, 20))})
+    family = glm.families.T(#link=glm.families.identity,
+                            priors={'nu': 1.5,
+                                    'lam': Uniform.dist(0, 20)})
     glm.glm('y ~ x', data_outlier, family=family)

 def run(n=2000):
def run(n=2000):
@@ -45,5 +37,3 @@ def run(n=2000):

 if __name__ == '__main__':
     run()
-
-
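Note the new prior-override form used above: the priors dict now maps a parameter name directly to a fixed number or a pymc3 distribution (previously a ('name', distribution) tuple), and distributions become model variables named after the parameter via model.Var(key, val). A condensed sketch, reusing the data_outlier dict defined in this example:

from pymc3 import *

with Model() as model:
    # 'nu' stays a fixed number; 'lam' becomes a free variable named 'lam',
    # built from the supplied distribution.
    family = glm.families.T(priors={'nu': 1.5,
                                    'lam': Uniform.dist(0, 20)})
    glm.glm('y ~ x', data_outlier, family=family)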

pymc3/glm/__init__.py

Lines changed: 1 addition & 3 deletions
@@ -1,7 +1,5 @@
 try:
-    import statsmodels.api as sm
-    from . import links
     from . import families
     from .glm import *
 except ImportError:
-    print("Warning: statsmodels and/or patsy not found, not importing glm submodule.")
+    print("Warning: patsy not found, not importing glm submodule.")

pymc3/glm/families.py

Lines changed: 32 additions & 42 deletions
@@ -1,24 +1,29 @@
 import numbers
 from copy import copy

-try:
-    from statsmodels.genmod.families.family import (Gaussian, Binomial, Poisson)
-except ImportError:
-    Gaussian = None
-    Binomial = None
-    Poisson = None
-
-from .links import *
+import theano.tensor
 from ..model import modelcontext
-from ..distributions import Normal, T, Uniform, Bernoulli, Poisson
+from .. import distributions as pm_dists

 __all__ = ['Normal', 'T', 'Binomial', 'Poisson']

+# Define link functions
+
+# Hack as assigning a function in the class definition automatically binds it as a method.
+class Identity():
+    def __call__(self, x):
+        return x
+
+identity = Identity()
+logit = theano.tensor.nnet.sigmoid
+inverse = theano.tensor.inv
+log = theano.tensor.log
+
 class Family(object):
     """Base class for Family of likelihood distribution and link functions.
     """
     priors = {}
-    link = Identity
+    link = None

     def __init__(self, **kwargs):
         # Overwrite defaults
@@ -29,9 +34,6 @@ def __init__(self, **kwargs):
             else:
                 setattr(self, key, val)

-        # Instantiate link function
-        self.link_func = self.link()
-
     def _get_priors(self, model=None):
         """Return prior distributions of the likelihood.

@@ -45,7 +47,7 @@ def _get_priors(self, model=None):
             if isinstance(val, numbers.Number):
                 priors[key] = val
             else:
-                priors[key] = model.Var(val[0], val[1])
+                priors[key] = model.Var(key, val)

         return priors

@@ -61,50 +63,38 @@ def create_likelihood(self, y_est, y_data, model=None):
         """
         priors = self._get_priors(model=model)
         # Wrap y_est in link function
-        priors[self.parent] = self.link_func.theano(y_est)
+        priors[self.parent] = self.link(y_est)
         return self.likelihood('y', observed=y_data, **priors)

-    def create_statsmodel_family(self):
-        """Instantiate and return statsmodel family object.
-        """
-        if self.sm_family is None:
-            return None
-        else:
-            return self.sm_family(self.link.sm)
-
     def __repr__(self):
         return """Family {klass}:
     Likelihood   : {likelihood}({parent})
     Priors       : {priors}
     Link function: {link}.""".format(klass=self.__class__, likelihood=self.likelihood.__name__, parent=self.parent, priors=self.priors, link=self.link)


-class Normal(Family):
-    sm_family = Gaussian
-    link = Identity
-    likelihood = Normal
-    parent = 'mu'
-    priors = {'sd': ('sigma', Uniform.dist(0, 100))}
-
-
 class T(Family):
-    sm_family = Gaussian
-    link = Identity
-    likelihood = T
+    link = identity
+    likelihood = pm_dists.T
     parent = 'mu'
-    priors = {'lam': ('sigma', Uniform.dist(0, 100)),
+    priors = {'lam': pm_dists.HalfCauchy.dist(beta=10),
               'nu': 1}

+class Normal(Family):
+    link = identity
+    likelihood = pm_dists.Normal
+    parent = 'mu'
+    priors = {'sd': pm_dists.HalfCauchy.dist(beta=10)}

 class Binomial(Family):
-    link = Logit
-    sm_family = Binomial
-    likelihood = Bernoulli
+    link = logit
+    likelihood = pm_dists.Bernoulli
     parent = 'p'
+    priors = {'p': pm_dists.Beta.dist(alpha=1, beta=1)}


 class Poisson(Family):
-    link = Log
-    sm_family = Poisson
-    likelihood = Poisson
-    parent = ''
+    link = log
+    likelihood = pm_dists.Poisson
+    parent = 'mu'
+    priors = {'mu': pm_dists.HalfCauchy.dist(beta=10)}
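The deleted links module is replaced by the module-level callables above. As the in-code comment notes, a plain function assigned in a class body is bound as a method when looked up through an instance, so self.link(y_est) would receive self as its argument; identity is therefore an instance of a class defining __call__, and the theano ops (sigmoid, inv, log) are already callable objects. An illustrative pure-Python sketch of the pitfall (not part of the commit):

def plain_identity(x):
    return x

class Identity(object):
    # A callable instance; looking it up on an instance returns it unchanged.
    def __call__(self, x):
        return x

class Broken(object):
    link = plain_identity   # binds as a method on attribute lookup

class Works(object):
    link = Identity()

Works().link(3)   # returns 3
Broken().link(3)  # TypeError: the Broken instance is passed as the first argument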

pymc3/glm/glm.py

Lines changed: 10 additions & 32 deletions
@@ -6,17 +6,14 @@
 import theano
 import pandas as pd
 from collections import defaultdict
-from statsmodels.formula.api import glm as glm_sm
-import statsmodels.api as sm
 from pandas.tools.plotting import scatter_matrix

-from . import links
 from . import families

 def linear_component(formula, data, priors=None,
                      intercept_prior=None,
                      regressor_prior=None,
-                     init=True, init_vals=None, family=None,
+                     init_vals=None, family=None,
                      model=None):
     """Create linear model according to patsy specification.

@@ -35,9 +32,6 @@ def linear_component(formula, data, priors=None,
     regressor_prior : pymc3 distribution
         Prior to use for all regressor(s).
         Default: Normal.dist(mu=0, tau=1.0E-12)
-    init : bool
-        Whether to set the starting values via statsmodels
-        Default: True
     init_vals : dict
         Set starting values externally: parameter -> value
         Default: None
@@ -70,10 +64,8 @@ def linear_component(formula, data, priors=None,
     _, dmatrix = patsy.dmatrices(formula, data)
     reg_names = dmatrix.design_info.column_names

-    if init_vals is None and init:
-        init_vals = glm_sm(formula, data, family=family).fit().params
-    else:
-        init_vals = defaultdict(lambda: None)
+    if init_vals is None:
+        init_vals = {}

     # Create individual coefficients
     model = modelcontext(model)
@@ -82,13 +74,15 @@ def linear_component(formula, data, priors=None,
     if reg_names[0] == 'Intercept':
         prior = priors.get('Intercept', intercept_prior)
         coeff = model.Var(reg_names.pop(0), prior)
-        coeff.tag.test_value = init_vals['Intercept']
+        if 'Intercept' in init_vals:
+            coeff.tag.test_value = init_vals['Intercept']
         coeffs.append(coeff)

     for reg_name in reg_names:
         prior = priors.get(reg_name, regressor_prior)
         coeff = model.Var(reg_name, prior)
-        coeff.tag.test_value = init_vals[reg_name]
+        if reg_name in init_vals:
+            coeff.tag.test_value = init_vals[reg_name]
         coeffs.append(coeff)

     y_est = theano.dot(np.asarray(dmatrix), theano.tensor.stack(*coeffs)).reshape((1, -1))
@@ -114,15 +108,10 @@ def glm(*args, **kwargs):
     regressor_prior : pymc3 distribution
         Prior to use for all regressor(s).
        Default: Normal.dist(mu=0, tau=1.0E-12)
-    init : bool
-        Whether initialize test values via statsmodels
-        Default: True
     init_vals : dict
         Set starting values externally: parameter -> value
         Default: None
-    find_MAP : bool
-        Whether to call find_MAP on non-initialized nodes.
-    family : statsmodels.family
+    family : Family object
         Distribution of likelihood, see pymc3.glm.families
         (init has to be True).

@@ -135,7 +124,7 @@ def glm(*args, **kwargs):
     # Logistic regression
     vars = glm('male ~ height + weight',
                data,
-               family=glm.families.Binomial(link=glm.links.Logit))
+               family=glm.families.Binomial(link=glm.families.logit))
     """

     model = modelcontext(kwargs.get('model'))
@@ -147,21 +136,10 @@ def glm(*args, **kwargs):
     data = args[1]
     y_data = np.asarray(patsy.dmatrices(formula, data)[0]).T

-    # Create GLM
-    kwargs['family'] = family.create_statsmodel_family()
-
     y_est, coeffs = linear_component(*args, **kwargs)
     family.create_likelihood(y_est, y_data)

-    # Find vars we have not initialized yet
-    non_init_vars = set(model.vars).difference(set(coeffs))
-    if len(non_init_vars) != 0 and call_find_map:
-        start = find_MAP(vars=non_init_vars)
-
-        for var in non_init_vars:
-            var.tag.test_value = start[var.name]
-
-    return [y_est] + coeffs + list(non_init_vars)
+    return [y_est] + coeffs


 def plot_posterior_predictive(trace, eval=None, lm=None, samples=30, **kwargs):
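With the statsmodels fit and the automatic find_MAP step removed, starting values are entirely the caller's responsibility: init_vals is an optional plain dict mapping design-matrix column names to test values, and columns not listed keep their defaults. A sketch of how a caller might use it (the names and values are hypothetical and the data dict is assumed to exist):

from pymc3 import *

with Model() as model:
    # Seed test values for the intercept and the 'x' coefficient only.
    vars = glm.glm('y ~ x', data,
                   init_vals={'Intercept': 1.0, 'x': 2.0})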

pymc3/glm/links.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

readme.md

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ PyMC is tested on Python 2.7 and 3.3 and depends on Theano, NumPy, SciPy, and Ma

 ### Optional

-The GLM submodule relies on Pandas, Patsy, Statsmodels.
+The GLM submodule relies on Pandas and Patsy.

 [`scikits.sparse`](https://github.com/njsmith/scikits-sparse) enables sparse scaling matrices which are useful for large problems. Installation on Ubuntu is easy:

