
Commit fbed79b

Merge branch 'nancodes' into nans_quidel
2 parents 6213713 + 801f04c

109 files changed (+15847 / -796 lines)

.github/workflows/python-ci.yml

Lines changed: 0 additions & 1 deletion

@@ -34,7 +34,6 @@ jobs:
         run: |
           make install
       - name: Lint
-        if: ${{ matrix.packages != 'doctor_visits' }}
         run: |
           make lint
       - name: Test

_delphi_utils_python/Makefile

Lines changed: 3 additions & 5 deletions

@@ -1,7 +1,5 @@
 .PHONY = venv, lint, test, clean
 
-dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*')
-
 venv:
 	python3.8 -m venv env
 
@@ -11,12 +9,12 @@ install: venv
 	pip install -e .
 
 lint:
-	. env/bin/activate; pylint $(dir)
-	. env/bin/activate; pydocstyle $(dir)
+	. env/bin/activate; pylint delphi_utils
+	. env/bin/activate; pydocstyle delphi_utils
 
 test:
 	. env/bin/activate ;\
-	 (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
+	 (cd tests && ../env/bin/pytest --cov=delphi_utils --cov-report=term-missing)
 
 clean:
 	rm -rf env

_delphi_utils_python/delphi_utils/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -12,6 +12,6 @@
 from .geomap import GeoMapper
 from .smooth import Smoother
 from .signal import add_prefix
-from .nancodes import NAN_CODES
+from .nancodes import Nans
 
 __version__ = "0.1.0"

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 87 additions & 89 deletions

@@ -141,85 +141,77 @@ def _load_crosswalk(self, from_code, to_code):
         assert from_code in self.crosswalk_filepaths, \
             f"No crosswalk files for {from_code}; try {'; '.join(self.crosswalk_filepaths.keys())}"
         assert to_code in self.crosswalk_filepaths[from_code], \
-            f"No crosswalk file from {from_code} to {to_code}; try" \
+            f"No crosswalk file from {from_code} to {to_code}; try " \
             f"{'; '.join(self.crosswalk_filepaths[from_code].keys())}"
+
+        if self.crosswalks[from_code][to_code] is None:
+            self.crosswalks[from_code][to_code] = self._load_crosswalk_from_file(from_code, to_code)
+        return self.crosswalks[from_code][to_code]
+
+    def _load_crosswalk_from_file(self, from_code, to_code):
         stream = pkg_resources.resource_stream(
             __name__, self.crosswalk_filepaths[from_code][to_code]
         )
-        if self.crosswalks[from_code][to_code] is None:
-            # Weighted crosswalks
-            if (from_code, to_code) in [
-                ("zip", "fips"),
-                ("fips", "zip"),
-                ("jhu_uid", "fips"),
-                ("zip", "msa"),
-                ("fips", "hrr"),
-                ("zip", "hhs")
-            ]:
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        from_code: str,
-                        to_code: str,
-                        "weight": float,
-                    },
-                )
-            # Unweighted crosswalks
-            elif (from_code, to_code) in [
-                ("zip", "hrr"),
-                ("fips", "msa"),
-                ("fips", "hhs"),
-                ("state_code", "hhs")
-            ]:
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={from_code: str, to_code: str},
-                )
-            # Special table of state codes, state IDs, and state names
-            elif (from_code, to_code) == ("state", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            elif (from_code, to_code) == ("zip", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "zip": str,
-                        "weight": float,
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            elif (from_code, to_code) == ("fips", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "fips": str,
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            # Population tables
-            elif to_code == "pop":
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        from_code: str,
-                        "pop": int,
-                    },
-                    usecols=[
-                        from_code,
-                        "pop"
-                    ]
-                )
-        return self.crosswalks[from_code][to_code]
+        usecols = None
+        dtype = None
+        # Weighted crosswalks
+        if (from_code, to_code) in [
+            ("zip", "fips"),
+            ("fips", "zip"),
+            ("jhu_uid", "fips"),
+            ("zip", "msa"),
+            ("fips", "hrr"),
+            ("zip", "hhs")
+        ]:
+            dtype = {
+                from_code: str,
+                to_code: str,
+                "weight": float,
+            }
+
+        # Unweighted crosswalks
+        elif (from_code, to_code) in [
+            ("zip", "hrr"),
+            ("fips", "msa"),
+            ("fips", "hhs"),
+            ("state_code", "hhs")
+        ]:
+            dtype = {from_code: str, to_code: str}
+
+        # Special table of state codes, state IDs, and state names
+        elif (from_code, to_code) == ("state", "state"):
+            dtype = {
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+        elif (from_code, to_code) == ("zip", "state"):
+            dtype = {
+                "zip": str,
+                "weight": float,
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+        elif (from_code, to_code) == ("fips", "state"):
+            dtype = {
+                "fips": str,
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+
+        # Population tables
+        elif to_code == "pop":
+            dtype = {
+                from_code: str,
+                "pop": int,
+            }
+            usecols = [
+                from_code,
+                "pop"
+            ]
+        return pd.read_csv(stream, dtype=dtype, usecols=usecols)
 
     @staticmethod
     def convert_fips_to_mega(data, fips_col="fips", mega_col="megafips"):

@@ -333,19 +325,8 @@ def add_geocode(
         else:
             df[from_col] = df[from_col].astype(str)
 
-        # Assuming that the passed-in records are all United States data, at the moment
-        if (from_code, new_code) in [("fips", "nation"),  # pylint: disable=no-else-return
-                                     ("zip", "nation"),
-                                     ("state_code", "nation"),
-                                     ("state_name", "nation"),
-                                     ("state_id", "nation")]:
-            df[new_col] = df[from_col].apply(lambda x: "us")
-            return df
-        elif new_code == "nation":
-            raise ValueError(
-                f"Conversion to the nation level is not supported "
-                f"from {from_code}; try fips, zip, or state_*"
-            )
+        if new_code == "nation":
+            return self._add_nation_geocode(df, from_code, from_col, new_col)
 
         # state codes are all stored in one table
         if from_code in state_codes and new_code in state_codes:

@@ -375,11 +356,28 @@ def add_geocode(
             df.drop(columns=state_codes, inplace=True)
         elif new_code in state_codes and from_code in state_codes:
             state_codes.remove(new_code)
-            state_codes.remove(from_code)
+            if from_code in state_codes:
+                state_codes.remove(from_code)
             df.drop(columns=state_codes, inplace=True)
 
         return df
 
+    def _add_nation_geocode(self, df, from_code, from_col, new_col):
+        """Add a nation geocode column to a dataframe.
+
+        See `add_geocode()` documentation for argument description.
+        """
+        valid_from_codes = ["fips", "zip", "state_code", "state_name", "state_id"]
+        # Assuming that the passed-in records are all United States data, at the moment
+        if from_code in valid_from_codes:
+            df[new_col] = df[from_col].apply(lambda x: "us")
+            return df
+
+        raise ValueError(
+            f"Conversion to the nation level is not supported "
+            f"from {from_code}; try {valid_from_codes}"
+        )
+
     def replace_geocode(
         self,
         df,
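For orientation, the nation-level conversion that this commit moves into `_add_nation_geocode()` is still reached through the public `add_geocode()` method. A minimal sketch of that call, assuming the usual `GeoMapper` defaults; the sample FIPS codes and column names below are illustrative, not taken from this commit:

```python
import pandas as pd
from delphi_utils import GeoMapper

# Illustrative county-level frame; "fips" and "val" are made-up example columns.
df = pd.DataFrame({"fips": ["01001", "06037"], "val": [10, 20]})

gmpr = GeoMapper()
# Adds a "nation" column filled with "us"; with this commit, an unsupported
# from_code now raises ValueError from the new _add_nation_geocode() helper.
df_nation = gmpr.add_geocode(df, "fips", "nation")
print(df_nation)
```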
_delphi_utils_python/delphi_utils/nancodes.py

Lines changed: 12 additions & 7 deletions

@@ -1,8 +1,13 @@
+"""Provides unified not-a-number codes for the indicators."""
 
-NAN_CODES = {
-    "Not Missing": 0,
-    "Not Applicable": 1,
-    "Region Exception": 2,
-    "Data Insufficient": 3,
-    "Unknown": 4
-}
+from enum import IntEnum
+
+class Nans(IntEnum):
+    """An enum of not-a-number codes for the indicators."""
+
+    NOT_MISSING = 0
+    NOT_APPLICABLE = 1
+    REGION_EXCEPTION = 2
+    DATA_INSUFFICIENT = 3
+    PRIVACY = 4
+    UNKNOWN = 5
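The dict of string keys becomes an `IntEnum`, so indicators can reference codes by name and still serialize them as integers. A small hedged sketch of how an indicator might tag missing values; the `missing_val` column name is illustrative, not defined by this commit:

```python
import pandas as pd
from delphi_utils import Nans

# Hypothetical indicator output; "missing_val" is an illustrative column name.
df = pd.DataFrame({"geo_id": ["01000", "02000"], "val": [1.5, None]})

# Record why a value is (or is not) missing, using the shared codes.
df["missing_val"] = df["val"].apply(
    lambda v: Nans.NOT_MISSING if pd.notna(v) else Nans.UNKNOWN
)
print(df)
print(int(Nans.PRIVACY))  # IntEnum members compare and serialize as plain ints
```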
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+"""Indicator running utilities."""
+from typing import Any, Callable, Dict, Optional
+from .archive import ArchiveDiffer
+from .utils import read_params
+from .validator.validate import Validator
+
+Params = Dict[str, Any]
+
+# Trivial function to use as default value for validator and archive functions.
+NULL_FN = lambda x: None
+
+def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
+                           validator_fn: Callable[[Params], Optional[Validator]] = NULL_FN,
+                           archiver_fn: Callable[[Params], Optional[ArchiveDiffer]] = NULL_FN):
+    """Run an indicator with its optional validation and archiving.
+
+    Arguments
+    ---------
+    indicator_fn: Callable[[Params], None]
+        function that takes a dictionary of parameters and produces indicator output
+    validator_fn: Callable[[Params], Optional[Validator]]
+        function that takes a dictionary of parameters and produces the associated Validator or
+        None if no validation should be performed.
+    archiver_fn: Callable[[Params], Optional[ArchiveDiffer]]
+        function that takes a dictionary of parameters and produces the associated ArchiveDiffer or
+        None if no archiving should be performed.
+    """
+    params = read_params()
+    indicator_fn(params)
+    validator = validator_fn(params)
+    archiver = archiver_fn(params)
+    if validator:
+        validation_report = validator.validate()
+    if archiver and (not validator or validation_report.success()):
+        archiver.archive()

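This new module chains an indicator's main function with optional validation and archiving, skipping the archive step when validation fails. A hedged usage sketch; the import path is an assumption, since the new file's name is hidden in this view, and `read_params()` expects a `params.json` in the working directory:

```python
from delphi_utils.runner import run_indicator_pipeline  # assumed module path

def run_module(params):
    """Hypothetical indicator entry point: read params, write CSV output."""
    print("producing indicator output with", params)

# Run the indicator alone; validation and archiving default to no-ops (NULL_FN).
run_indicator_pipeline(run_module)
```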
_delphi_utils_python/delphi_utils/smooth.py

Lines changed: 28 additions & 26 deletions

@@ -192,14 +192,7 @@ def smooth(
         signal = self.impute(signal, impute_order=impute_order)
 
         # Smooth
-        if self.smoother_name == "savgol":
-            signal_smoothed = self.savgol_smoother(signal)
-        elif self.smoother_name == "left_gauss_linear":
-            signal_smoothed = self.left_gauss_linear_smoother(signal)
-        elif self.smoother_name == "moving_average":
-            signal_smoothed = self.moving_average_smoother(signal)
-        elif self.smoother_name == "identity":
-            signal_smoothed = signal
+        signal_smoothed = self._select_smoother()(signal)
 
         # Append the nans back, since we want to preserve length
         signal_smoothed = np.hstack([np.nan * np.ones(ix), signal_smoothed])

@@ -209,6 +202,18 @@
             signal_smoothed.index = pandas_index
         return signal_smoothed
 
+    def _select_smoother(self):
+        """Select a smoothing method based on the smoother type."""
+        if self.smoother_name == "savgol":
+            return self.savgol_smoother
+        if self.smoother_name == "left_gauss_linear":
+            return self.left_gauss_linear_smoother
+        if self.smoother_name == "moving_average":
+            return self.moving_average_smoother
+        if self.smoother_name == "identity":
+            return lambda x: x
+        raise ValueError(f"invalid smoother {self.smoother_name}")
+
     def impute(self, signal, impute_order=2):
         """Impute the nan values in the signal.
 
@@ -421,26 +426,23 @@ def savgol_smoother(self, signal):  # pylint: disable=inconsistent-return-statements
         # - shortened_window (default) applies savgol with a smaller window to do the fit
         # - identity keeps the original signal (doesn't smooth)
         # - nan writes nans
-        if self.boundary_method == "shortened_window":  # pylint: disable=no-else-return
-            for ix in range(min(len(self.coeffs), len(signal))):
-                if ix == 0:
-                    signal_smoothed[ix] = signal[ix]
-                else:
-                    # At the very edge, the design matrix is often singular, in which case
-                    # we just fall back to the raw signal
-                    try:
-                        signal_smoothed[ix] = self.savgol_predict(
-                            signal[: ix + 1], self.poly_fit_degree, 0
-                        )
-                    except np.linalg.LinAlgError:  # for small ix, the design matrix is singular
-                        signal_smoothed[ix] = signal[ix]
+        if self.boundary_method == "nan":
             return signal_smoothed
-        elif self.boundary_method == "identity":
-            for ix in range(min(len(self.coeffs), len(signal))):
+
+        # boundary methods "identity" and "shortened window"
+        for ix in range(min(len(self.coeffs), len(signal))):
+            if ix == 0 or self.boundary_method == "identity":
                 signal_smoothed[ix] = signal[ix]
-            return signal_smoothed
-        elif self.boundary_method == "nan":
-            return signal_smoothed
+            else:
+                # At the very edge, the design matrix is often singular, in which case
+                # we just fall back to the raw signal
+                try:
+                    signal_smoothed[ix] = self.savgol_predict(
+                        signal[: ix + 1], self.poly_fit_degree, 0
+                    )
+                except np.linalg.LinAlgError:  # for small ix, the design matrix is singular
+                    signal_smoothed[ix] = signal[ix]
+        return signal_smoothed
 
     def savgol_impute(self, signal, impute_order):
         """Impute the nan values in signal using savgol.

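The smoothing behavior is unchanged by this refactor; only the dispatch moves into `_select_smoother()`. A quick sketch of the public calling convention, assuming the constructor keyword `smoother_name` and a signal longer than the default window; the sample data is illustrative:

```python
import numpy as np
from delphi_utils import Smoother

# Illustrative signal with a gap; Smoother imputes nans before smoothing.
signal = np.sqrt(np.arange(30, dtype=float))
signal[5] = np.nan

# "savgol" is one of the names dispatched by the new _select_smoother();
# unrecognized names now raise ValueError there.
smoother = Smoother(smoother_name="savgol")
print(smoother.smooth(signal))
```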
validator/README.md renamed to _delphi_utils_python/delphi_utils/validator/README.md

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ All of the user-changable parameters are stored in the `validation` field of the
 
 Please update the follow settings:
 
-* `global`: global validation settings
+* `common`: global validation settings
   * `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls
   * `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date
   * `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period), for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days

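Since the README excerpt renames the `global` block to `common`, here is a hedged illustration of the corresponding `validation` parameters, written as a Python dict; the `data_source` value and exact nesting are assumptions based only on the keys listed above:

```python
# Hedged sketch of the "validation" parameters described in the README excerpt.
validation_params = {
    "validation": {
        "common": {
            "data_source": "quidel",   # illustrative source name
            "end_date": "latest",      # "latest" means check up to the current date
            "span_length": 14,         # days before end_date to check
        }
    }
}
```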