Skip to content

Commit 787de22

Browse files
committed
Add, fix impute method, and add more impute tests
1 parent 8878749 commit 787de22

File tree

2 files changed

+107
-53
lines changed

2 files changed

+107
-53
lines changed

_delphi_utils_python/delphi_utils/smooth.py

Lines changed: 71 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,18 @@ class Smoother:
2424
2525
Instantiating a smoother class specifies a smoother with a host of parameters,
2626
which can then be applied to an np.ndarray with the function smooth:
27-
> smoother = Smoother(method_name='savgol', window_length=28, gaussian_bandwidth=100)
27+
> smoother = Smoother(smoother_name='savgol', window_length=28, gaussian_bandwidth=144)
2828
> smoothed_signal = smoother.smooth(signal)
2929
3030
Parameters
3131
----------
32-
method_name: {'savgol', 'left_gauss_linear', 'moving_average'}
33-
This variable specifies the smoothing method. We have three methods, currently:
34-
* 'savgol' or a Savtizky-Golay smoother
32+
smoother_name: {'savgol', 'left_gauss_linear', 'moving_average', 'identity'}
33+
This variable specifies the smoother. We have four smoothers, currently:
34+
* 'savgol' or a Savtizky-Golay smoother (default)
3535
* 'left_gauss_linear' or a Gaussian-weight linear regression smoother
3636
* 'moving_average' or a moving window average smoother
37-
Descriptions of the methods are available in the doc strings. Full details are
37+
* 'identity' or the trivial smoother (no smoothing)
38+
Descriptions of the smoothers are available in the doc strings. Full details are
3839
in: https://github.com/cmu-delphi/covidcast-modeling/indicators_smoother.
3940
window_length: int
4041
The length of the averaging window for 'savgol' and 'moving_average'.
@@ -51,39 +52,40 @@ class Smoother:
5152
28 579
5253
35 905
5354
42 1303
54-
impute: bool
55-
If True, will fill nan values before smoothing. Currently uses the 'savgol' method
56-
for imputation.
55+
impute: {'savgol', 'zeros', None}
56+
If 'savgol' (default), will fill nan values with a savgol fit on the largest available time
57+
window prior (up to window_length). If 'zeros', will fill nan values with zeros.
58+
If None, leaves the nans in place.
5759
minval: float or None
5860
The smallest value to allow in a signal. If None, there is no smallest value.
5961
Currently only implemented for 'left_gauss_linear'.
6062
poly_fit_degree: int
61-
A parameter for the 'savgol' method which sets the degree of the polynomial fit.
63+
A parameter for the 'savgol' smoother which sets the degree of the polynomial fit.
6264
6365
Methods
6466
----------
6567
smooth: np.ndarray
66-
Takes a 1D signal and returns a smoothed version.
68+
Takes a 1D signal and returns a smoothed version. Both arrays have the same length.
6769
"""
6870

6971
def __init__(
7072
self,
71-
method_name="savgol",
73+
smoother_name="savgol",
7274
poly_fit_degree=2,
7375
window_length=28,
7476
gaussian_bandwidth=144, # a ~2 week window
75-
impute=True,
77+
impute_method="savgol",
7678
minval=None,
7779
boundary_method="shortened_window",
7880
):
79-
self.method_name = method_name
81+
self.smoother_name = smoother_name
8082
self.poly_fit_degree = poly_fit_degree
8183
self.window_length = window_length
8284
self.gaussian_bandwidth = gaussian_bandwidth
83-
self.impute = impute
85+
self.impute_method = impute_method
8486
self.minval = minval
8587
self.boundary_method = boundary_method
86-
if method_name == "savgol":
88+
if smoother_name == "savgol":
8789
self.coeffs = self.savgol_coeffs(
8890
-self.window_length + 1,
8991
0,
@@ -93,15 +95,20 @@ def __init__(
9395
else:
9496
self.coeffs = None
9597

96-
METHODS = {"savgol", "left_gauss_linear", "moving_average", "identity"}
98+
SMOOTHERS = {"savgol", "left_gauss_linear", "moving_average", "identity"}
9799

98-
if self.method_name not in METHODS:
99-
raise ValueError("Invalid method name given.")
100+
if self.smoother_name not in SMOOTHERS:
101+
raise ValueError("Invalid smoother name given.")
102+
103+
IMPUTE_METHODS = {"savgol", "zeros", None}
104+
105+
if self.impute_method not in IMPUTE_METHODS:
106+
raise ValueError("Invalid impute method given.")
100107

101108
def smooth(self, signal):
102109
"""
103110
The major workhorse smoothing function. Can use one of three smoothing
104-
methods, as specified by the class variable method_name.
111+
methods, as specified by the class variable smoother_name.
105112
106113
Parameters
107114
----------
@@ -111,20 +118,42 @@ def smooth(self, signal):
111118
signal_smoothed: np.ndarray
112119
A smoothed 1D signal.
113120
"""
114-
if self.impute:
115-
signal = self.savgol_impute(signal)
121+
signal = self.impute(signal)
116122

117-
if self.method_name == "savgol":
123+
if self.smoother_name == "savgol":
118124
signal_smoothed = self.savgol_smoother(signal)
119-
elif self.method_name == "left_gauss_linear":
125+
elif self.smoother_name == "left_gauss_linear":
120126
signal_smoothed = self.left_gauss_linear_smoother(signal)
121-
elif self.method_name == "moving_average":
127+
elif self.smoother_name == "moving_average":
122128
signal_smoothed = self.moving_average_smoother(signal)
123-
elif self.method_name == "identity":
129+
elif self.smoother_name == "identity":
124130
signal_smoothed = signal
125131

126132
return signal_smoothed
127133

134+
def impute(self, signal):
135+
"""
136+
Imputes the nan values in the signal.
137+
138+
Parameters
139+
----------
140+
signal: np.ndarray
141+
1D signal to be imputed.
142+
143+
Returns
144+
-------
145+
imputed_signal: np.ndarray
146+
Imputed signal.
147+
"""
148+
if self.impute_method == "savgol":
149+
imputed_signal = self.savgol_impute(signal)
150+
elif self.impute_method == "zeros":
151+
imputed_signal = np.nan_to_num(signal)
152+
elif self.impute_method is None:
153+
imputed_signal = np.copy(signal)
154+
155+
return imputed_signal
156+
128157
def moving_average_smoother(self, signal):
129158
"""
130159
Computes a moving average on the signal.
@@ -322,10 +351,19 @@ def savgol_smoother(self, signal):
322351

323352
def savgol_impute(self, signal):
324353
"""
325-
This method looks through the signal, finds the nan values, and imputes them
326-
using an M-degree polynomial fit on the previous window_length data points.
327-
The boundary cases, i.e. nans within wl of the start of the array
328-
are imputed with a window length shrunk to the data available.
354+
This method fills the nan values in the signal with an M-degree polynomial fit
355+
on a rolling window of the immediate past up to window_length data points.
356+
357+
In the case of a single data point in the past, the single data point is
358+
continued. In the case of no data points in the past (i.e. the signal starts
359+
with nan), an error is raised.
360+
361+
Note that in the case of many adjacent nans, the method will use previously
362+
imputed values to do the fitting for later values. E.g. for
363+
>>> x = np.array([1.0, 2.0, np.nan, 1.0, np.nan])
364+
the last np.nan will be fit on np.array([1.0, 2.0, *, 1.0]), where * is the
365+
result of imputing based on np.array([1.0, 2.0]) (depends on the savgol
366+
settings).
329367
330368
Parameters
331369
----------
@@ -342,25 +380,21 @@ def savgol_impute(self, signal):
342380
raise ValueError("The signal should not begin with a nan value.")
343381
for ix in np.where(np.isnan(signal))[0]:
344382
if ix < self.window_length:
345-
if ix == 0:
346-
signal_imputed[ix] = signal[ix]
347-
elif ix == 1:
348-
signal_imputed[ix] = (
349-
signal[ix] if not np.isnan(signal[ix]) else signal[0]
350-
)
383+
if ix == 1:
384+
signal_imputed[ix] = signal_imputed[0]
351385
else:
352386
coeffs = self.savgol_coeffs(
353387
-ix, -1, self.poly_fit_degree, self.gaussian_bandwidth
354388
)
355-
signal_imputed[ix] = signal[:ix] @ coeffs
389+
signal_imputed[ix] = signal_imputed[:ix] @ coeffs
356390
else:
357391
coeffs = self.savgol_coeffs(
358392
-self.window_length,
359393
-1,
360394
self.poly_fit_degree,
361395
self.gaussian_bandwidth,
362396
)
363-
signal_imputed[ix] = signal[ix - self.window_length : ix] @ coeffs
397+
signal_imputed[ix] = signal_imputed[ix - self.window_length : ix] @ coeffs
364398
return signal_imputed
365399

366400

_delphi_utils_python/tests/test_smooth.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,20 @@
1515
class TestSmoothers:
1616
def test_identity_smoother(self):
1717
signal = np.arange(30) + np.random.rand(30)
18-
assert np.allclose(signal, Smoother(method_name="identity").smooth(signal))
18+
assert np.allclose(signal, Smoother(smoother_name="identity").smooth(signal))
1919

2020
def test_moving_average_smoother(self):
2121
# The raw and smoothed lengths should match
2222
signal = np.ones(30)
23-
smoother = Smoother(method_name="moving_average")
23+
smoother = Smoother(smoother_name="moving_average")
2424
smoothed_signal = smoother.smooth(signal)
2525
assert len(signal) == len(smoothed_signal)
2626

2727
# The raw and smoothed arrays should be identical on constant data
2828
# modulo the nans
2929
signal = np.ones(30)
3030
window_length = 10
31-
smoother = Smoother(method_name="moving_average", window_length=window_length)
31+
smoother = Smoother(smoother_name="moving_average", window_length=window_length)
3232
smoothed_signal = smoother.smooth(signal)
3333
assert np.allclose(
3434
signal[window_length - 1 :], smoothed_signal[window_length - 1 :]
@@ -37,7 +37,7 @@ def test_moving_average_smoother(self):
3737
def test_left_gauss_linear_smoother(self):
3838
# The raw and smoothed lengths should match
3939
signal = np.ones(30)
40-
smoother = Smoother(method_name="left_gauss_linear")
40+
smoother = Smoother(smoother_name="left_gauss_linear")
4141
smoothed_signal = smoother.smooth(signal)
4242
assert len(signal) == len(smoothed_signal)
4343
# The raw and smoothed arrays should be identical on constant data
@@ -47,15 +47,15 @@ def test_left_gauss_linear_smoother(self):
4747
# The smoother should basically be the identity when the Gaussian kernel
4848
# is set to weigh the present value overwhelmingly
4949
signal = np.arange(1, 10) + np.random.normal(0, 1, 9)
50-
smoother = Smoother(method_name="left_gauss_linear", gaussian_bandwidth=0.1)
50+
smoother = Smoother(smoother_name="left_gauss_linear", gaussian_bandwidth=0.1)
5151
assert np.allclose(smoother.smooth(signal)[1:], signal[1:])
5252

5353
def test_causal_savgol_coeffs(self):
5454
# The coefficients should return standard average weights for M=0
5555
nl, nr = -10, 0
5656
window_length = nr - nl + 1
5757
smoother = Smoother(
58-
method_name="savgol",
58+
smoother_name="savgol",
5959
window_length=window_length,
6060
poly_fit_degree=0,
6161
gaussian_bandwidth=None,
@@ -67,7 +67,7 @@ def test_causal_savgol_smoother(self):
6767
signal = np.ones(30)
6868
window_length = 10
6969
smoother = Smoother(
70-
method_name="savgol", window_length=window_length, poly_fit_degree=0
70+
smoother_name="savgol", window_length=window_length, poly_fit_degree=0
7171
)
7272
smoothed_signal = smoother.smooth(signal)
7373
assert len(signal) == len(smoothed_signal)
@@ -81,7 +81,7 @@ def test_causal_savgol_smoother(self):
8181
# modulo the nans, when M >= 1
8282
signal = np.arange(30)
8383
smoother = Smoother(
84-
method_name="savgol", window_length=window_length, poly_fit_degree=1
84+
smoother_name="savgol", window_length=window_length, poly_fit_degree=1
8585
)
8686
smoothed_signal = smoother.smooth(signal)
8787
assert np.allclose(
@@ -92,7 +92,7 @@ def test_causal_savgol_smoother(self):
9292
# modulo the nans, when M >= 2
9393
signal = np.arange(30) ** 2
9494
smoother = Smoother(
95-
method_name="savgol", window_length=window_length, poly_fit_degree=2
95+
smoother_name="savgol", window_length=window_length, poly_fit_degree=2
9696
)
9797
smoothed_signal = smoother.smooth(signal)
9898
assert np.allclose(
@@ -104,26 +104,46 @@ def test_causal_savgol_smoother(self):
104104
# and the polynomial fit degree is set to 1.
105105
window_length = 20
106106
signal = np.arange(window_length) + np.random.randn(window_length)
107-
smoother = Smoother(method_name="left_gauss_linear")
107+
smoother = Smoother(smoother_name="left_gauss_linear")
108108
smoothed_signal1 = smoother.smooth(signal)
109109
smoother = Smoother(
110-
method_name="savgol", window_length=window_length, poly_fit_degree=1,
110+
smoother_name="savgol", window_length=window_length, poly_fit_degree=1,
111111
)
112112
smoothed_signal2 = smoother.smooth(signal)
113113

114114
assert np.allclose(smoothed_signal1, smoothed_signal2)
115115

116-
def test_impute_with_savgol(self):
116+
def test_impute(self):
117+
# test the nan imputer
118+
signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
119+
assert np.allclose(Smoother(impute_method=None).impute(signal), signal, equal_nan=True)
120+
121+
# test the zeros imputer
122+
signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
123+
assert np.allclose(
124+
Smoother(impute_method="zeros").impute(signal),
125+
np.array([i if i % 3 else 0.0 for i in range(1, 40)])
126+
)
127+
128+
# make a signal with periodic nans to test the imputer
129+
signal = np.array([i if i % 3 else np.nan for i in range(1, 40)])
130+
# test that the non-nan values are unchanged
131+
not_nans_ixs = np.bitwise_xor(np.isnan(signal, where=True), np.full(len(signal), True))
132+
smoothed_signal = Smoother().savgol_impute(signal)
133+
assert np.allclose(signal[not_nans_ixs], smoothed_signal[not_nans_ixs])
134+
# test that the imputer is close to the true line
135+
assert np.allclose(range(1, 40), smoothed_signal, atol=0.5)
136+
117137
# should impute the next value in a linear progression with M>=1
118138
signal = np.hstack([np.arange(10), [np.nan], np.arange(10)])
119139
window_length = 10
120140
smoother = Smoother(
121-
method_name="savgol", window_length=window_length, poly_fit_degree=1
141+
smoother_name="savgol", window_length=window_length, poly_fit_degree=1
122142
)
123143
imputed_signal = smoother.savgol_impute(signal)
124144
assert np.allclose(imputed_signal, np.hstack([np.arange(11), np.arange(10)]))
125145
smoother = Smoother(
126-
method_name="savgol", window_length=window_length, poly_fit_degree=2
146+
smoother_name="savgol", window_length=window_length, poly_fit_degree=2
127147
)
128148
imputed_signal = smoother.savgol_impute(signal)
129149
assert np.allclose(imputed_signal, np.hstack([np.arange(11), np.arange(10)]))
@@ -133,7 +153,7 @@ def test_impute_with_savgol(self):
133153
[np.arange(5), [np.nan], np.arange(20), [np.nan], np.arange(5)]
134154
)
135155
smoother = Smoother(
136-
method_name="savgol", window_length=window_length, poly_fit_degree=2
156+
smoother_name="savgol", window_length=window_length, poly_fit_degree=2
137157
)
138158
imputed_signal = smoother.savgol_impute(signal)
139159
assert np.allclose(
@@ -143,7 +163,7 @@ def test_impute_with_savgol(self):
143163
# if the array begins with np.nan, we should tell the user to peel it off before sending
144164
signal = np.hstack([[np.nan], np.arange(20), [np.nan], np.arange(5)])
145165
smoother = Smoother(
146-
method_name="savgol", window_length=window_length, poly_fit_degree=2
166+
smoother_name="savgol", window_length=window_length, poly_fit_degree=2
147167
)
148168
with pytest.raises(ValueError):
149169
imputed_signal = smoother.savgol_impute(signal)

0 commit comments

Comments
 (0)