Skip to content

Commit 0592d17

Browse files
authored
Merge pull request #75 from cmu-delphi/emr-hosp-package
EMR hospitalization indicator
2 parents 33a79bd + 6cc6397 commit 0592d17

File tree

8 files changed

+324
-195
lines changed

8 files changed

+324
-195
lines changed

emr_hosp/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# EMR Hospitalizations Indicator
22

33
COVID-19 indicator using hospitalizations from electronic medical records (EMR).
4+
Reads claims data (AGG) and EMR data (CMB) and combines them into a pandas dataframe.
5+
Makes appropriate date shifts, adjusts for backfilling, and smooths estimates.
6+
Writes results to csvs.
47

58

69
## Running the Indicator
@@ -56,3 +59,10 @@ The output will show the number of unit tests that passed and failed, along
5659
with the percentage of code covered by the tests. None of the tests should
5760
fail and the code lines that are not covered by unit tests should be small and
5861
should not include critical sub-routines.
62+
63+
## Code tour
64+
65+
- update_sensor.py: EMRHospSensorUpdator: reads the data, makes transformations, and writes the results
66+
- sensor.py: EMRHospSensor: methods for transforming data, including backfill and smoothing
67+
- load_data.py: methods for loading claims and EHR data
68+
- geo_maps.py: geo reindexing

emr_hosp/delphi_emr_hosp/geo_maps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def county_to_state(self, data):
105105
return data
106106

107107
def hrr(self, data):
108-
"""Prepare hrr groups.
108+
"""Prepare hrr (Hospital Referral Region) groups.
109109
110110
Args:
111111
data: dataframe aggregated to the daily-hrr resolution

emr_hosp/delphi_emr_hosp/load_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def load_combined_data(emr_filepath, claims_filepath, dropdate, base_geo):
102102
base_geo: base geographic unit before aggregation (either 'fips' or 'hrr')
103103
104104
Returns:
105-
combined dataframe
105+
combined multiindexed dataframe; index level 0 is base_geo, index level 1 is date
106106
"""
107107
assert base_geo in ["fips", "hrr"], "base unit must be either 'fips' or 'hrr'"
108108

emr_hosp/delphi_emr_hosp/run.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from delphi_utils import read_params
1515

1616
# first party
17-
from .update_sensor import update_sensor
17+
from .update_sensor import EMRHospSensorUpdator
1818

1919

2020
def run_module():
@@ -64,17 +64,19 @@ def run_module():
6464
logging.info("starting %s, weekday adj", geo)
6565
else:
6666
logging.info("starting %s, no adj", geo)
67-
update_sensor(
68-
params["input_emr_file"],
69-
params["input_claims_file"],
70-
params["export_dir"],
71-
params["static_file_dir"],
67+
su_inst = EMRHospSensorUpdator(
7268
params["start_date"],
7369
params["end_date"],
7470
dropdate,
7571
geo,
7672
params["parallel"],
77-
weekday,
73+
weekday
74+
)
75+
su_inst.update_sensor(
76+
params["input_emr_file"],
77+
params["input_claims_file"],
78+
params["export_dir"],
79+
params["static_file_dir"]
7880
)
7981
logging.info("finished %s", geo)
8082

emr_hosp/delphi_emr_hosp/sensor.py

Lines changed: 40 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ class EMRHospSensor:
2222
"""Sensor class to fit a signal using CLI counts from EMR Hospitalization data.
2323
"""
2424

25+
@staticmethod
26+
def gauss_smooth(count,total):
27+
"""smooth using the left_gauss_linear
28+
29+
Args:
30+
count, total: array
31+
"""
32+
count_smooth = left_gauss_linear(count)
33+
total_smooth = left_gauss_linear(total)
34+
total_clip = np.clip(total_smooth, 0, None)
35+
count_clip = np.clip(count_smooth, 0, total_clip)
36+
return count_clip, total_clip
37+
2538
@staticmethod
2639
def backfill(
2740
num,
@@ -37,15 +50,19 @@ def backfill(
3750
bin size so as to avoid including long-past values.
3851
3952
Args:
40-
num: dataframe of covid counts
41-
den: dataframe of total visits
53+
num: array of covid counts
54+
den: array of total visits
4255
k: maximum number of days used to average a backfill correction
4356
min_visits_to_fill: minimum number of total visits needed in order to sum a bin
4457
4558
Returns: arrays of adjusted covid counts and adjusted visit counts
4659
"""
47-
revden = den[::-1].values
48-
revnum = num[::-1].values.reshape(-1, 1)
60+
if isinstance(den,(pd.DataFrame,pd.Series)):
61+
den = den.values
62+
if isinstance(num,(pd.DataFrame,pd.Series)):
63+
num = num.values
64+
revden = den[::-1]
65+
revnum = num[::-1].reshape(-1, 1)
4966
new_num = np.full_like(revnum, np.nan, dtype=float)
5067
new_den = np.full_like(revden, np.nan, dtype=float)
5168
n, p = revnum.shape
@@ -76,40 +93,37 @@ def backfill(
7693
new_num = new_num[::-1]
7794
new_den = new_den[::-1]
7895

79-
# reset date index and format
80-
new_num = pd.Series(new_num.flatten(), name=num.name, index=num.index)
81-
new_den = pd.Series(new_den, index=den.index)
82-
8396
return new_num, new_den
8497

8598
@staticmethod
86-
def fit(y_data, sensor_dates, geo_id):
99+
def fit(y_data, first_sensor_date, geo_id, num_col="num", den_col="den"):
87100
"""Fitting routine.
88101
89102
Args:
90103
y_data: dataframe for one geo_id, indexed by date
91-
sensor_dates: list of sorted datetime for which to produce sensor values
104+
first_sensor_date: datetime of the first date for which to produce sensor values
92105
geo_id: unique identifier for the location column
106+
num_col: str name of numerator column
107+
den_col: str name of denominator column
93108
94109
Returns:
95110
dictionary of results
96111
97112
"""
98-
# values to keep
99-
fitting_idxs = np.where(y_data.index >= sensor_dates[0])[0]
100-
101113
# backfill
102-
total_counts, total_visits = EMRHospSensor.backfill(y_data["num"], y_data["den"])
114+
total_counts, total_visits = EMRHospSensor.backfill(y_data[num_col].values, y_data[den_col].values)
115+
# total_counts = pd.Series(total_counts.flatten(), name=num_col, index=y_data.index)
116+
# total_visits = pd.Series(total_visits, index=y_data.index)
103117

104118
# calculate smoothed counts and jeffreys rate
105119
# the left_gauss_linear smoother is not guaranteed to return values greater than 0
106-
smoothed_total_counts = np.clip(left_gauss_linear(total_counts.values), 0, None)
107-
smoothed_total_visits = np.clip(left_gauss_linear(total_visits.values), 0, None)
120+
121+
smoothed_total_counts, smoothed_total_visits = EMRHospSensor.gauss_smooth(total_counts.flatten(),total_visits)
108122

109123
# in smoothing, the numerator may have become more than the denominator
110124
# simple fix is to clip the max values elementwise to the denominator (note that
111125
# this has only been observed in synthetic data)
112-
smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
126+
# smoothed_total_counts = np.clip(smoothed_total_counts, 0, smoothed_total_visits)
113127

114128
smoothed_total_rates = (
115129
(smoothed_total_counts + 0.5) / (smoothed_total_visits + 1)
@@ -124,14 +138,12 @@ def fit(y_data, sensor_dates, geo_id):
124138
), f"0 or negative value, {geo_id}"
125139

126140
# cut off at sensor indexes
127-
rates = smoothed_total_rates[fitting_idxs]
128-
den = smoothed_total_visits[fitting_idxs]
129-
include = den >= Config.MIN_DEN
130-
131-
# calculate standard error
132-
se = np.full_like(rates, np.nan)
133-
se[include] = np.sqrt(
134-
np.divide((rates[include] * (1 - rates[include])), den[include]))
135-
136-
logging.debug(f"{geo_id}: {rates[-1]:.3f},[{se[-1]:.3f}]")
137-
return {"geo_id": geo_id, "rate": 100 * rates, "se": 100 * se, "incl": include}
141+
rate_data = pd.DataFrame({'rate':smoothed_total_rates, 'den': smoothed_total_visits}, index=y_data.index)
142+
rate_data = rate_data[first_sensor_date:]
143+
include = rate_data['den'] >= Config.MIN_DEN
144+
valid_rates = rate_data[include]
145+
se_valid = valid_rates.eval('sqrt(rate * (1 - rate) / den)')
146+
rate_data['se'] = se_valid
147+
148+
logging.debug(f"{geo_id}: {rate_data['rate'][-1]:.3f},[{rate_data['se'][-1]:.3f}]")
149+
return {"geo_id": geo_id, "rate": 100 * rate_data['rate'], "se": 100 * rate_data['se'], "incl": include}

0 commit comments

Comments
 (0)