1
- """
1
+ """Smoother utility.
2
+
2
3
This file contains the smoothing utility functions. We have a number of
3
4
possible smoothers to choose from: windowed average, local weighted regression,
4
5
and a causal Savitzky-Golay filter.
10
11
docstrings for details.
11
12
"""
12
13
14
+ from typing import Union
13
15
import warnings
14
16
15
17
import numpy as np
16
18
import pandas as pd
17
19
18
20
19
21
class Smoother : # pylint: disable=too-many-instance-attributes
20
- """
22
+ """Smoother class.
23
+
21
24
This is the smoothing utility class. This class holds the parameter settings for its smoother
22
25
methods and provides reasonable defaults. Basic usage can be found in the examples below.
23
26
@@ -36,9 +39,13 @@ class Smoother: # pylint: disable=too-many-instance-attributes
36
39
Descriptions of the smoothers are available in the doc strings. Full mathematical
37
40
details are in: https://github.com/cmu-delphi/covidcast-modeling/ in the folder
38
41
'indicator_smoother'.
42
+ poly_fit_degree: int
43
+ A parameter for the 'savgol' smoother which sets the degree of the polynomial fit.
39
44
window_length: int
40
- The length of the averaging window for 'savgol' and 'moving_average'.
41
- This value is in the units of the data, which tends to be days.
45
+ The length of the fitting window for 'savgol' and the averaging window 'moving_average'.
46
+ This value is in the units provided by the data, which are likely to be days for Delphi.
47
+ Note that if window_length is smaller than the length of the signal, then only the
48
+ imputation method is run on the signal.
42
49
gaussian_bandwidth: float or None
43
50
If float, all regression is done with Gaussian weights whose variance is
44
51
half the gaussian_bandwidth. If None, performs unweighted regression. (Applies
@@ -60,14 +67,19 @@ class Smoother: # pylint: disable=too-many-instance-attributes
60
67
The smallest value to allow in a signal. If None, there is no smallest value.
61
68
Currently only implemented for 'left_gauss_linear'. This should probably not be in the scope
62
69
of the smoothing utility.
63
- poly_fit_degree: int
64
- A parameter for the 'savgol' smoother which sets the degree of the polynomial fit.
70
+ boundary_method: {'shortened_window', 'identity', 'nan'}
71
+ Determines how the 'savgol' method handles smoothing at the (left) boundary, where the past
72
+ data length is shorter than the window_length parameter. If 'shortened_window', it uses the
73
+ maximum window available; at the very edge (generally up to poly_fit_degree) it keeps the
74
+ same value as the raw signal. If 'identity', it just keeps the raw signal. If 'nan', it
75
+ writes nans. For the other smoothing methods, 'moving_average' writes nans and
76
+ 'left_gauss_linear' uses a shortened window.
65
77
66
78
Methods
67
79
----------
68
80
smooth: np.ndarray or pd.Series
69
81
Takes a 1D signal and returns a smoothed version.
70
- The input and the output have the same lengthand type.
82
+ The input and the output have the same length and type.
71
83
72
84
Example Usage
73
85
-------------
@@ -108,6 +120,7 @@ def __init__(
108
120
minval = None ,
109
121
boundary_method = "shortened_window" ,
110
122
):
123
+ """See class docstring."""
111
124
self .smoother_name = smoother_name
112
125
self .poly_fit_degree = poly_fit_degree
113
126
self .window_length = window_length
@@ -118,35 +131,38 @@ def __init__(
118
131
119
132
valid_smoothers = {"savgol" , "left_gauss_linear" , "moving_average" , "identity" }
120
133
valid_impute_methods = {"savgol" , "zeros" , None }
134
+ valid_boundary_methods = {"shortened_window" , "identity" , "nan" }
121
135
if self .smoother_name not in valid_smoothers :
122
- raise ValueError ("Invalid smoother name given." )
136
+ raise ValueError ("Invalid smoother_name given." )
123
137
if self .impute_method not in valid_impute_methods :
124
- raise ValueError ("Invalid impute method given." )
138
+ raise ValueError ("Invalid impute_method given." )
139
+ if self .boundary_method not in valid_boundary_methods :
140
+ raise ValueError ("Invalid boundary_method given." )
125
141
126
142
if smoother_name == "savgol" :
143
+ # The polynomial fitting is done on a past window of size window_length
144
+ # including the current day value.
127
145
self .coeffs = self .savgol_coeffs (- self .window_length + 1 , 0 )
128
146
else :
129
147
self .coeffs = None
130
148
131
- def smooth (self , signal ):
132
- """
133
- The major workhorse smoothing function. Can use one of three smoothing
134
- methods, as specified by the class variable smoother_name.
149
+ def smooth (self , signal : Union [np .ndarray , pd .Series ]) -> Union [np .ndarray , pd .Series ]:
150
+ """Apply a smoother to a signal.
151
+
152
+ The major workhorse smoothing function. Imputes the nans and then applies
153
+ a smoother to the signal.
135
154
136
155
Parameters
137
156
----------
138
157
signal: np.ndarray or pd.Series
139
158
A 1D signal to be smoothed.
140
159
160
+ Returns
161
+ ----------
141
162
signal_smoothed: np.ndarray or pd.Series
142
163
A smoothed 1D signal. Returns an array of the same type and length as
143
164
the input.
144
165
"""
145
- if len (signal ) < self .window_length :
146
- raise ValueError (
147
- "The window_length must be smaller than the length of the signal."
148
- )
149
-
150
166
is_pandas_series = isinstance (signal , pd .Series )
151
167
signal = signal .to_numpy () if is_pandas_series else signal
152
168
@@ -158,14 +174,16 @@ def smooth(self, signal):
158
174
signal_smoothed = self .left_gauss_linear_smoother (signal )
159
175
elif self .smoother_name == "moving_average" :
160
176
signal_smoothed = self .moving_average_smoother (signal )
161
- elif self . smoother_name == "identity" :
162
- signal_smoothed = signal
177
+ else :
178
+ signal_smoothed = signal . copy ()
163
179
164
- return signal_smoothed if not is_pandas_series else pd .Series (signal_smoothed )
180
+ signal_smoothed = signal_smoothed if not is_pandas_series else pd .Series (signal_smoothed )
181
+ return signal_smoothed
165
182
166
183
def impute (self , signal ):
167
- """
168
- Imputes the nan values in the signal.
184
+ """Impute the nan values in the signal.
185
+
186
+ See the class docstring for an explanation of the impute methods.
169
187
170
188
Parameters
171
189
----------
@@ -191,8 +209,7 @@ def impute(self, signal):
191
209
return imputed_signal
192
210
193
211
def moving_average_smoother (self , signal ):
194
- """
195
- Computes a moving average on the signal.
212
+ """Compute a moving average on the signal.
196
213
197
214
Parameters
198
215
----------
@@ -219,11 +236,10 @@ def moving_average_smoother(self, signal):
219
236
return signal_smoothed
220
237
221
238
def left_gauss_linear_smoother (self , signal ):
222
- """
223
- DEPRECATED: This method is available to help sanity check the 'savgol' method.
224
- It is a little slow, so use 'savgol' with poly_fit_degree=1 instead.
239
+ """Smooth the y-values using a local linear regression with Gaussian weights.
225
240
226
- Smooth the y-values using a local linear regression with Gaussian weights.
241
+ DEPRECATED: This method is available to help sanity check the 'savgol' method.
242
+ Use 'savgol' with poly_fit_degree=1 and the appropriate gaussian_bandwidth instead.
227
243
228
244
At each time t, we use the data from times 1, ..., t-dt, weighted
229
245
using the Gaussian kernel, to produce the estimate at time t.
@@ -264,7 +280,8 @@ def left_gauss_linear_smoother(self, signal):
264
280
return signal_smoothed
265
281
266
282
def savgol_predict (self , signal ):
267
- """
283
+ """Predict a single value using the savgol method.
284
+
268
285
Fits a polynomial through the values given by the signal and returns the value
269
286
of the polynomial at the right-most signal-value. More precisely, fits a polynomial
270
287
f(t) of degree poly_fit_degree through the points signal[-n], signal[-n+1] ..., signal[-1],
@@ -280,14 +297,15 @@ def savgol_predict(self, signal):
280
297
predicted_value: float
281
298
The anticipated value that comes after the end of the signal based on a polynomial fit.
282
299
"""
300
+ # Add one
283
301
coeffs = self .savgol_coeffs (- len (signal ) + 1 , 0 )
284
302
predicted_value = signal @ coeffs
285
303
return predicted_value
286
304
287
305
def savgol_coeffs (self , nl , nr ):
288
- """
289
- Solves for the Savitzky-Golay coefficients. The coefficients c_i
290
- give a filter so that
306
+ """Solve for the Savitzky-Golay coefficients.
307
+
308
+ The coefficients c_i give a filter so that
291
309
y = sum_{i=-{n_l}}^{n_r} c_i x_i
292
310
is the value at 0 (thus the constant term) of the polynomial fit
293
311
through the points {x_i}. The coefficients are c_i are calculated as
@@ -299,9 +317,9 @@ def savgol_coeffs(self, nl, nr):
299
317
Parameters
300
318
----------
301
319
nl: int
302
- The left window bound for the polynomial fit.
320
+ The left window bound for the polynomial fit, inclusive .
303
321
nr: int
304
- The right window bound for the polynomial fit.
322
+ The right window bound for the polynomial fit, inclusive .
305
323
poly_fit_degree: int
306
324
The degree of the polynomial to be fit.
307
325
gaussian_bandwidth: float or None
@@ -337,14 +355,10 @@ def savgol_coeffs(self, nl, nr):
337
355
return coeffs
338
356
339
357
def savgol_smoother (self , signal ):
340
- """
341
- Returns a specific type of convolution of the 1D signal with the 1D signal
342
- coeffs, respecting boundary effects. That is, the output y is
343
- signal_smoothed_i = signal_i
344
- signal_smoothed_i = sum_{j=0}^n coeffs_j signal_{i+j}, if i >= len(coeffs) - 1
345
- In words, entries close to the left boundary are not smoothed, the window does
346
- not proceed over the right boundary, and the rest of the values are regular
347
- convolution.
358
+ """Smooth signal with the savgol smoother.
359
+
360
+ Returns a convolution of the 1D signal with the Savitzky-Golay coefficients, respecting
361
+ boundary effects. For an explanation of boundary effects methods, see the class docstring.
348
362
349
363
Parameters
350
364
----------
@@ -356,15 +370,14 @@ def savgol_smoother(self, signal):
356
370
signal_smoothed: np.ndarray
357
371
A smoothed 1D signal of same length as signal.
358
372
"""
359
-
360
- # reverse because np.convolve reverses the second argument
373
+ # Reverse because np.convolve reverses the second argument
361
374
temp_reversed_coeffs = np .array (list (reversed (self .coeffs )))
362
375
363
- # does the majority of the smoothing, with the calculated coefficients
376
+ # Smooth the part of the signal away from the boundary first
364
377
signal_padded = np .append (np .nan * np .ones (len (self .coeffs ) - 1 ), signal )
365
378
signal_smoothed = np .convolve (signal_padded , temp_reversed_coeffs , mode = "valid" )
366
379
367
- # this section handles the smoothing behavior at the (left) boundary:
380
+ # This section handles the smoothing behavior at the (left) boundary:
368
381
# - shortened_window (default) applies savgol with a smaller window to do the fit
369
382
# - identity keeps the original signal (doesn't smooth)
370
383
# - nan writes nans
@@ -373,22 +386,23 @@ def savgol_smoother(self, signal):
373
386
if ix == 0 :
374
387
signal_smoothed [ix ] = signal [ix ]
375
388
else :
389
+ # At the very edge, the design matrix is often singular, in which case
390
+ # we just fall back to the raw signal
376
391
try :
377
- signal_smoothed [ix ] = self .savgol_predict (signal [: ( ix + 1 ) ])
378
- except np .linalg .LinAlgError : # for small ix, the design matrix is singular
392
+ signal_smoothed [ix ] = self .savgol_predict (signal [:ix + 1 ])
393
+ except np .linalg .LinAlgError :
379
394
signal_smoothed [ix ] = signal [ix ]
380
395
return signal_smoothed
381
396
elif self .boundary_method == "identity" :
382
- for ix in range (len (self .coeffs )):
397
+ for ix in range (min ( len (self .coeffs ), len ( signal ) )):
383
398
signal_smoothed [ix ] = signal [ix ]
384
399
return signal_smoothed
385
400
elif self .boundary_method == "nan" :
386
401
return signal_smoothed
387
- else :
388
- raise ValueError ("Unknown boundary method." )
389
402
390
403
def savgol_impute (self , signal ):
391
- """
404
+ """Impute the nan values in signal using savgol.
405
+
392
406
This method fills the nan values in the signal with an M-degree polynomial fit
393
407
on a rolling window of the immediate past up to window_length data points.
394
408
0 commit comments