Commit dfee042

Refactor JHU smoother to use util:
* remove local smoothing module
* update and lint run.py
* a few linter / formatting changes in jhu tests
* a few linter / formatting changes in smoother util
1 parent dabd7a9 commit dfee042

File tree

5 files changed: +49 -82 lines changed


_delphi_utils_python/delphi_utils/smooth.py

Lines changed: 15 additions & 6 deletions
@@ -150,7 +150,9 @@ def __init__(
         else:
             self.coeffs = None
 
-    def smooth(self, signal: Union[np.ndarray, pd.Series], impute_order=2) -> Union[np.ndarray, pd.Series]:
+    def smooth(
+        self, signal: Union[np.ndarray, pd.Series], impute_order=2
+    ) -> Union[np.ndarray, pd.Series]:
         """Apply a smoother to a signal.
 
         The major workhorse smoothing function. Imputes the nans and then applies
@@ -200,7 +202,7 @@ def smooth(self, signal: Union[np.ndarray, pd.Series], impute_order=2) -> Union[
             signal_smoothed = signal
 
         # Append the nans back, since we want to preserve length
-        signal_smoothed = np.hstack([np.nan*np.ones(ix), signal_smoothed])
+        signal_smoothed = np.hstack([np.nan * np.ones(ix), signal_smoothed])
         # Convert back to pandas if necessary
         if is_pandas_series:
             signal_smoothed = pd.Series(signal_smoothed)
@@ -295,21 +297,28 @@ def left_gauss_linear_smoother(self, signal):
             weights = np.exp(
                 -((np.arange(idx + 1) - idx) ** 2) / self.gaussian_bandwidth
             )
-            AwA = np.dot(A[: (idx + 1), :].T * weights, A[: (idx + 1), :])  # pylint: disable=invalid-name
+            AwA = np.dot(  # pylint: disable=invalid-name
+                A[: (idx + 1), :].T * weights, A[: (idx + 1), :]
+            )
             Awy = np.dot(  # pylint: disable=invalid-name
                 A[: (idx + 1), :].T * weights, signal[: (idx + 1)].reshape(-1, 1)
             )
             try:
                 beta = np.linalg.solve(AwA, Awy)
                 signal_smoothed[idx] = np.dot(A[: (idx + 1), :], beta)[-1]
             except np.linalg.LinAlgError:
-                signal_smoothed[idx] = signal[idx] if self.impute else np.nan  # pylint: disable=using-constant-test
+                signal_smoothed[idx] = (
+                    signal[idx]  # pylint: disable=using-constant-test
+                    if self.impute
+                    else np.nan
+                )
         if self.minval is not None:
             signal_smoothed[signal_smoothed <= self.minval] = self.minval
         return signal_smoothed
 
     def savgol_predict(self, signal, poly_fit_degree, nr):
         """Predict a single value using the savgol method.
+
         Fits a polynomial through the values given by the signal and returns the value
         of the polynomial at the right-most signal-value. More precisely, for a signal of length
         n, fits a poly_fit_degree polynomial through the points signal[-n+1+nr], signal[-n+2+nr],
@@ -368,7 +377,7 @@ def savgol_coeffs(self, nl, nr, poly_fit_degree):
         if nr > 0:
             warnings.warn("The filter is no longer causal.")
 
-        A = np.vstack(
+        A = np.vstack(  # pylint: disable=invalid-name
            [np.arange(nl, nr + 1) ** j for j in range(poly_fit_degree + 1)]
         ).T
 
@@ -471,7 +480,7 @@ def savgol_impute(self, signal, impute_order):
                 # imputation order is larger than the available data)
                 else:
                     signal_imputed[ix] = self.savgol_predict(
-                        signal_imputed[:ix], min(ix-1, impute_order), -1
+                        signal_imputed[:ix], min(ix - 1, impute_order), -1
                     )
             # Away from the boundary, use savgol fitting on a fixed window
             else:
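
Note: the refactored Smoother.smooth entry point above is what the JHU pipeline now calls (see run.py below). A minimal usage sketch follows; it assumes only the constructor arguments visible in this commit ("identity", and "moving_average" with window_length=7) and the smooth(signal, impute_order=2) signature shown above, so treat it as illustrative rather than a specification of the full API.

# Illustrative sketch only (not part of this commit). Assumes the delphi_utils
# package from this repo is importable; constructor arguments are limited to
# those visible in the diff above.
import numpy as np
import pandas as pd
from delphi_utils import Smoother

signal = pd.Series([1.0, np.nan, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])

# The two configurations the JHU indicator now uses (see run.py below):
identity_smoother = Smoother("identity")
seven_day_smoother = Smoother("moving_average", window_length=7)

# smooth() imputes the nans, applies the smoother, preserves length, and
# returns a pandas Series when given one (per the docstring and comments above).
smoothed = seven_day_smoother.smooth(signal, impute_order=2)
print(smoothed)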

jhu/delphi_jhu/run.py

Lines changed: 19 additions & 18 deletions
@@ -6,26 +6,21 @@
 """
 from datetime import datetime
 from itertools import product
-from functools import partial
 
 import numpy as np
 from delphi_utils import (
     read_params,
     create_export_csv,
     S3ArchiveDiffer,
+    Smoother,
+    GeoMapper,
 )
 
-from delphi_utils import GeoMapper
 from .geo import geo_map
 from .pull import pull_jhu_data
-from .smooth import (
-    identity,
-    kday_moving_average,
-)
 
 
 # global constants
-seven_day_moving_average = partial(kday_moving_average, k=7)
 METRICS = [
     "deaths",
     "confirmed",
@@ -41,10 +36,10 @@
     "seven_day_average",
 ]
 SENSOR_NAME_MAP = {
-    "new_counts":           ("incidence_num", False),
-    "cumulative_counts":    ("cumulative_num", False),
-    "incidence":            ("incidence_prop", False),
-    "cumulative_prop":      ("cumulative_prop", False),
+    "new_counts": ("incidence_num", False),
+    "cumulative_counts": ("cumulative_num", False),
+    "incidence": ("incidence_prop", False),
+    "cumulative_prop": ("cumulative_prop", False),
 }
 # Temporarily added for wip_ signals
 # WIP_SENSOR_NAME_MAP = {
@@ -54,8 +49,8 @@
 #     "cumulative_prop": ("cumul_prop", False),
 # }
 SMOOTHERS_MAP = {
-    "unsmoothed": (identity, ''),
-    "seven_day_average": (seven_day_moving_average, '7dav_'),
+    "unsmoothed": (Smoother("identity").smooth, ""),
+    "seven_day_average": (Smoother("moving_average", window_length=7).smooth, "7dav_"),
 }
 GEO_RESOLUTIONS = [
     "county",
@@ -75,17 +70,21 @@ def run_module():
 
     if len(params["bucket_name"]) > 0:
         arch_diff = S3ArchiveDiffer(
-            cache_dir, export_dir,
-            params["bucket_name"], "jhu",
-            params["aws_credentials"])
+            cache_dir,
+            export_dir,
+            params["bucket_name"],
+            "jhu",
+            params["aws_credentials"],
+        )
         arch_diff.update_cache()
     else:
         arch_diff = None
 
     gmpr = GeoMapper()
     dfs = {metric: pull_jhu_data(base_url, metric, gmpr) for metric in METRICS}
     for metric, geo_res, sensor, smoother in product(
-            METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS):
+        METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTHERS
+    ):
         print(metric, geo_res, sensor, smoother)
         df = dfs[metric]
         # Aggregate to appropriate geographic resolution
@@ -121,7 +120,9 @@ def run_module():
         _, fails = arch_diff.archive_exports(to_archive)
 
         # Filter existing exports to exclude those that failed to archive
-        succ_common_diffs = {f: diff for f, diff in common_diffs.items() if f not in fails}
+        succ_common_diffs = {
+            f: diff for f, diff in common_diffs.items() if f not in fails
+        }
         arch_diff.filter_exports(succ_common_diffs)
 
         # Report failures: someone should probably look at them
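
Note: the shape of the SMOOTHERS_MAP entries is unchanged by this commit; each value is still a (smoother function, signal-name prefix) pair, so downstream code can keep unpacking it the same way. The sketch below shows how an entry might be applied; the actual call site in run_module is outside the hunks shown above, so the column names and per-geography grouping here are assumptions.

# Hypothetical application of a SMOOTHERS_MAP entry; the real call site in
# run_module is not shown in this diff, so column names and grouping are assumed.
import pandas as pd
from delphi_utils import Smoother

SMOOTHERS_MAP = {
    "unsmoothed": (Smoother("identity").smooth, ""),
    "seven_day_average": (Smoother("moving_average", window_length=7).smooth, "7dav_"),
}

df = pd.DataFrame({
    "geo_id": ["01000"] * 10,
    "val": [float(x) for x in range(10)],
})

smooth_fn, prefix = SMOOTHERS_MAP["seven_day_average"]
# Smooth each geography's time series independently (assumed grouping).
df["val"] = df.groupby("geo_id")["val"].transform(smooth_fn)
# The prefix lands inside the exported signal name, e.g. the file names in the
# tests below look like "confirmed_{prefix}cumulative_num".
signal_name = f"confirmed_{prefix}cumulative_num"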

jhu/delphi_jhu/smooth.py

Lines changed: 0 additions & 39 deletions
This file was deleted.
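
The deleted module's contents are not shown in this view; from the imports removed in run.py it provided identity and kday_moving_average. A rough, illustrative reconstruction of what such helpers presumably looked like follows; this is not the deleted code.

# Illustrative reconstruction only: run.py previously imported identity and
# kday_moving_average from this module (see the removed imports above).
import numpy as np


def identity(signal):
    """Return the signal unchanged."""
    return signal


def kday_moving_average(signal, k):
    """Trailing k-day moving average; nan where fewer than k values exist."""
    signal = np.asarray(signal, dtype=float)
    out = np.full_like(signal, np.nan)
    for i in range(k - 1, len(signal)):
        out[i] = signal[i - k + 1 : i + 1].mean()
    return out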

jhu/tests/test_run.py

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@
 from os.path import join
 
 import pandas as pd
-from delphi_jhu.run import run_module
 
 
 class TestRun:

jhu/tests/test_smooth.py

Lines changed: 15 additions & 18 deletions
@@ -1,34 +1,31 @@
 import pytest
 
-from os import listdir
 from os.path import join
 
 import numpy as np
 import pandas as pd
-from delphi_jhu.run import run_module
+
 
 class TestSmooth:
     def test_output_files_smoothed(self, run_as_module):
-
         dates = [str(x) for x in range(20200303, 20200310)]
 
         smoothed = pd.read_csv(
-            join("./receiving",
-                 f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
+            join("./receiving", f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
         )
 
         # Build a dataframe out of the individual day files
-        raw = pd.concat([
-            pd.read_csv(
-                join("./receiving",
-                     f"{date}_state_confirmed_cumulative_num.csv")
-            ) for date in dates
-        ])
-        # Compute the mean across the time values; order doesn't matter
-        # this corresponds to the smoothed value on the last day
+        raw = pd.concat(
+            [
+                pd.read_csv(
+                    join("./receiving", f"{date}_state_confirmed_cumulative_num.csv")
+                )
+                for date in dates
+            ]
+        )
+        # Compute the mean across the time values; order doesn't matter
+        # this corresponds to the smoothed value on the last day
         # 2020-03-10
-        raw = raw.groupby('geo_id')['val'].mean()
-
-        df = pd.merge(smoothed, raw, on='geo_id', suffixes=('_smoothed', '_raw'))
-        assert np.allclose(df['val_smoothed'].values, df['val_raw'].values)
-
+        raw = raw.groupby("geo_id")["val"].mean()
+        df = pd.merge(smoothed, raw, on="geo_id", suffixes=("_smoothed", "_raw"))
+        assert np.allclose(df["val_smoothed"].values, df["val_raw"].values)
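
Note: this test leans on a property of the moving-average smoother: with a 7-day window, the smoothed value on the last day is just the mean of the seven daily values, so averaging the per-day export files reproduces the 7dav export. A small sketch of that assumed property, using delphi_utils directly and independent of run_module:

# Sketch of the property the test relies on (assumed, not asserted by this
# commit): the last moving-average value equals the mean of the last window.
import numpy as np
from delphi_utils import Smoother

daily_values = np.array([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0])  # seven days
smoothed = Smoother("moving_average", window_length=7).smooth(daily_values)
print(smoothed[-1], daily_values.mean())  # expected to agree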
