Commit 071aa2b

Merge pull request #551 from cmu-delphi/main
Deploy `chng` to production

2 parents: a5d22d5 + d049649

15 files changed: +196 −70 lines

Jenkinsfile

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
    - Keep in sync with '.github/workflows/python-ci.yml'.
    - TODO: #527 Get this list automatically from python-ci.yml at runtime.
 */
-def indicator_list = ["cdc_covidnet", "claims_hosp", "combo_cases_and_deaths", "google_symptoms", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph", "safegraph_patterns", "usafacts"]
+def indicator_list = ["cdc_covidnet", "changehc", "claims_hosp", "combo_cases_and_deaths", "google_symptoms", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph", "safegraph_patterns", "usafacts"]
 def build_package = [:]
 def deploy_staging = [:]
 def deploy_production = [:]
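
The indicator_list above is duplicated from the CI matrix by hand, which is why the comment asks to keep it in sync with '.github/workflows/python-ci.yml'. A minimal sketch of what TODO #527 might look like, assuming the workflow stores the package names in a job matrix — the key names below are hypothetical, not taken from the actual workflow:

    # Hypothetical resolution of TODO #527: derive the indicator list from the
    # CI workflow instead of hard-coding it in the Jenkinsfile.
    # Assumes a matrix key "packages" under jobs.build.strategy; the real
    # python-ci.yml may use different names.
    import yaml  # PyYAML

    def indicators_from_workflow(path=".github/workflows/python-ci.yml"):
        with open(path) as f:
            workflow = yaml.safe_load(f)
        return sorted(workflow["jobs"]["build"]["strategy"]["matrix"]["packages"])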
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+{
+    "static_file_dir": "./static",
+    "raw_data_dir": "/common/safegraph",
+    "export_dir": "./receiving",
+    "cache_dir": "./cache",
+    "n_core": "12",
+    "aws_access_key_id": "{{ safegraph_aws_access_key_id }}",
+    "aws_secret_access_key": "{{ safegraph_aws_secret_access_key }}",
+    "aws_default_region": "us-east-1",
+    "aws_endpoint": "https://s3.wasabisys.com",
+    "sync": true,
+    "wip_signal" : []
+}
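
This new file (its name was not captured in this view) is a Jinja2-templated params file for the safegraph indicator; the `{{ safegraph_* }}` placeholders are filled in at deploy time. A minimal sketch of how the rendered file is consumed, assuming it is installed as the indicator's params.json, the conventional location read by delphi_utils:

    # Sketch: the indicator reads the rendered template at runtime.
    from delphi_utils import read_params

    params = read_params()  # loads params.json from the working directory
    n_core = int(params["n_core"])  # numeric values are stored as strings here
    endpoint = params["aws_endpoint"]  # e.g. the S3-compatible Wasabi endpoint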

google_symptoms/DETAILS.md

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ hrr level data is derived from county level data using population weighted averages
 ## Metrics, Level 1 (`m1`)
 * `anosmia`: Google search volume for Anosmia-related searches
 * `ageusia`: Google search volume for Ageusia-related searches
-* `combined_symptoms`: The sum of Google search volume for Anosmia-related searches and Ageusia-related searches.
+* `sum_anosmia_ageusia`: The sum of Google search volume for Anosmia-related searches and Ageusia-related searches.

 ## Metrics, Level 2 (`m2`)
 * `raw_search`: Google search volume reported as-is

google_symptoms/delphi_google_symptoms/constants.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@

 # global constants
 METRICS = ["Anosmia", "Ageusia"]
+COMBINED_METRIC = "sum_anosmia_ageusia"
 SMOOTHERS = ["raw", "smoothed"]
 GEO_RESOLUTIONS = [
     "state",

google_symptoms/delphi_google_symptoms/geo.py

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 from delphi_utils import GeoMapper
-from .constants import METRICS
+from .constants import METRICS, COMBINED_METRIC

 gmpr = GeoMapper()
 def generate_transition_matrix(geo_res):
@@ -64,7 +64,7 @@ def geo_map(df, geo_res):
     for _date in df["timestamp"].unique():
        val_lists = df[df["timestamp"] == _date].merge(
            map_df["geo_id"], how="right"
-       )[METRICS + ["combined_symptoms"]].fillna(0)
+       )[METRICS + [COMBINED_METRIC]].fillna(0)
        newdf = pd.DataFrame(
            np.matmul(map_df.values[:, 1:].T, val_lists.values),
            columns = list(val_lists.keys())
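
For context, geo_map aggregates county values to coarser geographies by multiplying a transition matrix against the per-date value columns. A toy illustration of that step with made-up weights — the real matrix comes from generate_transition_matrix via GeoMapper:

    import numpy as np
    import pandas as pd

    # Rows of `weights` are hypothetical target regions; columns are counties.
    weights = np.array([[1.0, 0.4],
                        [0.0, 0.6]])
    vals = pd.DataFrame({"Anosmia": [10.0, 5.0], "Ageusia": [2.0, 1.0]},
                        index=["01001", "01003"])
    aggregated = pd.DataFrame(np.matmul(weights, vals.values),
                              columns=vals.columns)
    print(aggregated)  # region totals as weighted sums of county values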

google_symptoms/delphi_google_symptoms/pull.py

Lines changed: 5 additions & 5 deletions

@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd

-from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS
+from .constants import STATE_TO_ABBREV, DC_FIPS, METRICS, COMBINED_METRIC

 def get_geo_id(region_code):
     """
@@ -42,16 +42,16 @@ def preprocess(df, level):
         Dataframe as described above.
     """
     # Constants
-    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + ["combined_symptoms"]
+    KEEP_COLUMNS = ["geo_id", "date"] + METRICS + [COMBINED_METRIC]

-    df["combined_symptoms"] = 0
+    df[COMBINED_METRIC] = 0
     for metric in METRICS:
         df.rename({"symptom:" + metric: metric}, axis = 1, inplace = True)
-        df["combined_symptoms"] += df[metric].fillna(0)
+        df[COMBINED_METRIC] += df[metric].fillna(0)
     df.loc[
         (df["Anosmia"].isnull())
         & (df["Ageusia"].isnull())
-        , "combined_symptoms"] = np.nan
+        , COMBINED_METRIC] = np.nan

     # Delete rows with missing FIPS
     null_mask = (df["geo_id"].isnull())
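
The combined metric's NaN handling is worth spelling out: each missing input is treated as 0 in the sum, but the sum is reset to NaN when both inputs are missing, so an all-missing row is not reported as a true zero. A self-contained sketch of exactly that logic:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"Anosmia": [1.0, np.nan, np.nan],
                       "Ageusia": [2.0, 4.0, np.nan]})
    df["sum_anosmia_ageusia"] = df["Anosmia"].fillna(0) + df["Ageusia"].fillna(0)
    # NaN only when both inputs are missing.
    df.loc[df["Anosmia"].isnull() & df["Ageusia"].isnull(),
           "sum_anosmia_ageusia"] = np.nan
    print(df["sum_anosmia_ageusia"].tolist())  # [3.0, 4.0, nan]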

google_symptoms/delphi_google_symptoms/run.py

Lines changed: 3 additions & 2 deletions

@@ -12,7 +12,8 @@

 from .pull import pull_gs_data
 from .geo import geo_map
-from .constants import METRICS, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP
+from .constants import (METRICS, COMBINED_METRIC,
+                        GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP)


 def run_module():
@@ -31,7 +32,7 @@ def run_module():
         df_pull = dfs["county"]
         df_pull = geo_map(df_pull, geo_res)
         for metric, smoother in product(
-                METRICS+["combined_symptoms"], SMOOTHERS):
+                METRICS+[COMBINED_METRIC], SMOOTHERS):
             print(geo_res, metric, smoother)
             df = df_pull.set_index(["timestamp", "geo_id"])
             df["val"] = df[metric].groupby(level=1

google_symptoms/tests/test_geo.py

Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+import pytest
+
+from os.path import join
+
+import numpy as np
+import pandas as pd
+
+from delphi_google_symptoms.geo import geo_map
+from delphi_google_symptoms.constants import METRICS, COMBINED_METRIC
+
+class TestGeo:
+    def test_fips(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["53003", "48027", "50103"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "county")
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert (new_df[METRICS[0]] == df[METRICS[0]]).all()
+        assert (new_df[METRICS[1]] == df[METRICS[1]]).all()
+        assert (new_df[COMBINED_METRIC] == df[COMBINED_METRIC]).all()
+
+    def test_hrr(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["01001", "01009", "01007"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "hrr").dropna()
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert set(new_df["geo_id"]) == set(["1", "5", "7", "9"])
+        assert new_df[METRICS[0]].values == pytest.approx([0.39030655604059333,
+                                                           0.014572815050225169,
+                                                           1.1509470322941868,
+                                                           0.08525105356979307])
+        assert new_df[METRICS[1]].values == pytest.approx([0.7973533171562179,
+                                                           0.019430420066966894,
+                                                           11.509470322941867,
+                                                           1.918148705320344])
+        assert new_df[COMBINED_METRIC].values == pytest.approx(
+                new_df[METRICS[0]].values + new_df[METRICS[1]])
+
+    def test_msa(self):
+        df = pd.DataFrame(
+            {
+                "geo_id": ["01001", "01009", "01007"],
+                "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15"],
+                METRICS[0]: [10, 15, 2],
+                METRICS[1]: [100, 20, 45],
+                COMBINED_METRIC: [110, 35, 47],
+            }
+        )
+        new_df = geo_map(df, "msa").dropna()
+
+        assert set(new_df.keys()) == set(df.keys())
+        assert set(new_df["geo_id"]) == set(["13820", "33860"])
+        assert new_df[METRICS[0]].values == pytest.approx([0.8365267072315176,
+                                                           1.4966647914490074])
+        assert new_df[METRICS[1]].values == pytest.approx([1.9847583762443426,
+                                                           14.966647914490075])
+        assert new_df[COMBINED_METRIC].values == pytest.approx(
+                new_df[METRICS[0]].values + new_df[METRICS[1]])

google_symptoms/tests/test_pull.py

Lines changed: 11 additions & 10 deletions

@@ -3,6 +3,7 @@
 import pandas as pd

 from delphi_google_symptoms.pull import pull_gs_data, preprocess
+from delphi_google_symptoms.constants import METRICS, COMBINED_METRIC

 base_url_good = "./test_data{sub_url}small_{state}symptoms_dataset.csv"

@@ -20,23 +21,23 @@ def test_good_file(self):
             df = dfs[level]
             assert (
                 df.columns.values
-                == ["geo_id", "timestamp", "Anosmia", "Ageusia", "combined_symptoms"]
+                == ["geo_id", "timestamp"] + METRICS + [COMBINED_METRIC]
             ).all()

             # combined_symptoms is nan when both Anosmia and Ageusia are nan
             assert sum(~df.loc[
-                (df["Anosmia"].isnull())
-                & (df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (df[METRICS[0]].isnull())
+                & (df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0
             # combined_symptoms is not nan when either Anosmia or Ageusia isn't nan
             assert sum(df.loc[
-                (~df["Anosmia"].isnull())
-                & (df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (~df[METRICS[0]].isnull())
+                & (df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0
             assert sum(df.loc[
-                (df["Anosmia"].isnull())
-                & (~df["Ageusia"].isnull())
-                , "combined_symptoms"].isnull()) == 0
+                (df[METRICS[0]].isnull())
+                & (~df[METRICS[1]].isnull())
+                , COMBINED_METRIC].isnull()) == 0

     def test_missing_cols(self):
         df = pd.read_csv(base_url_bad["missing_cols"])

google_symptoms/tests/test_run.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ def test_output_files_exist(self, run_as_module):
            "20200811"
        ]
        geos = ["county", "state"]
-       metrics = ["anosmia", "ageusia", "combined_symptoms"]
+       metrics = ["anosmia", "ageusia", "sum_anosmia_ageusia"]
        smoother = ["raw", "smoothed"]

        expected_files = []

nchs_mortality/delphi_nchs_mortality/pull.py

Lines changed: 17 additions & 27 deletions

@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from sodapy import Socrata
+from .constants import METRICS

 def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
     """Pull the latest NCHS Mortality data and conform it into a dataset.
@@ -33,10 +34,7 @@ def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
         Dataframe as described above.
     """
     # Constants
-    keep_columns = ['covid_deaths', 'total_deaths',
-                    'percent_of_expected_deaths', 'pneumonia_deaths',
-                    'pneumonia_and_covid_deaths', 'influenza_deaths',
-                    'pneumonia_influenza_or_covid_19_deaths']
+    keep_columns = METRICS.copy()
     type_dict = {key: float for key in keep_columns}
     type_dict["timestamp"] = 'datetime64[ns]'

@@ -64,31 +62,23 @@
                 "schema may have changed. Please investigate and "
                 "amend the code.") from exc

+    # Drop rows for locations outside US
     df = df[df["state"] != "United States"]
-    df.loc[df["state"] == "New York City", "state"] = "New York"
+    df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

-    state_list = df["state"].unique()
-    date_list = df["timestamp"].unique()
-    index_df = pd.MultiIndex.from_product(
-        [state_list, date_list], names=['state', 'timestamp']
-    )
-    df = df.groupby(
-        ["state", "timestamp"]).sum().reindex(index_df).reset_index()
-
-    # Final sanity checks
-    days_by_states = df.groupby("state").count()["covid_deaths"].unique()
-    unique_days = df["timestamp"].unique()
-    # each FIPS has same number of rows
-    if (len(days_by_states) > 1) or (days_by_states[0] != len(unique_days)):
-        raise ValueError("Differing number of days by fips")
-    min_timestamp = min(unique_days)
-    max_timestamp = max(unique_days)
-    n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, 'D') / 7 + 1
-    if n_days != len(unique_days):
-        raise ValueError(
-            f"Not every day between {min_timestamp} and "
-            "{max_timestamp} is represented."
-        )
+    # NCHS considers NYC as an individual state; however, we want it included
+    # in NY. If values are NaN for both NYC and NY, the aggregation should
+    # also have NaN.
+    df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
+    df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
+    # Get mask df to ignore cells where both of them have NaN values
+    mask = (df_ny[keep_columns].isnull().values \
+            & df_nyc[keep_columns].isnull().values)
+    df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+    df_ny["state"] = "New York"
+    # Drop NYC and NY in the full dataset
+    df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
+    df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])

     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
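
The NY/NYC merge sums the two series while preserving NaN where both are missing, via a boolean mask and DataFrame.where. A toy sketch of the pattern with made-up numbers; pd.concat is used here in place of the diff's DataFrame.append, which newer pandas deprecates:

    import numpy as np
    import pandas as pd

    idx = pd.to_datetime(["2020-05-02", "2020-05-09"])
    df_ny = pd.DataFrame({"covid_deaths": [np.nan, 10.0]}, index=idx)
    df_nyc = pd.DataFrame({"covid_deaths": [np.nan, 3.0]}, index=idx)
    # Mask cells where both frames are missing, so the sum stays NaN there.
    mask = df_ny.isnull().values & df_nyc.isnull().values
    merged = (pd.concat([df_ny, df_nyc])
                .groupby(level=0).sum()    # NaN-skipping sum: NaN+NaN -> 0.0
                .where(~mask, np.nan))     # restore NaN where both were missing
    print(merged)  # 2020-05-02 -> NaN, 2020-05-09 -> 13.0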

nchs_mortality/delphi_nchs_mortality/run.py

Lines changed: 12 additions & 8 deletions

@@ -17,7 +17,7 @@
 from .constants import (METRICS, SENSOR_NAME_MAP,
                         SENSORS, INCIDENCE_BASE, GEO_RES)

-def run_module():  # pylint: disable=too-many-branches,too-many-statements
+def run_module():
     """Run module for processing NCHS mortality data."""
     params = read_params()
     export_start_date = params["export_start_date"]
@@ -41,13 +41,15 @@ def run_module():
         join(static_file_dir, "state_pop.csv"), dtype={"fips": int}
     )

-    df = pull_nchs_mortality_data(token, map_df, test_mode)
+    df_pull = pull_nchs_mortality_data(token, map_df, test_mode)
     for metric in METRICS:
         if metric == 'percent_of_expected_deaths':
             print(metric)
+            df = df_pull.copy()
             df["val"] = df[metric]
             df["se"] = np.nan
             df["sample_size"] = np.nan
+            df = df[~df["val"].isnull()]
             sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric]])
             export_csv(
                 df,
@@ -59,12 +61,14 @@ def run_module():
         else:
             for sensor in SENSORS:
                 print(metric, sensor)
+                df = df_pull.copy()
                 if sensor == "num":
                     df["val"] = df[metric]
                 else:
                     df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
                 df["se"] = np.nan
                 df["sample_size"] = np.nan
+                df = df[~df["val"].isnull()]
                 sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric], sensor])
                 export_csv(
                     df,
@@ -74,10 +78,10 @@ def run_module():
                     sensor=sensor_name,
                 )

-   # Weekly run of archive utility on Monday
-   # - Does not upload to S3, that is handled by daily run of archive utility
-   # - Exports issues into receiving for the API
-   # Daily run of archiving utility
-   # - Uploads changed files to S3
-   # - Does not export any issues into receiving
+    # Weekly run of archive utility on Monday
+    # - Does not upload to S3, that is handled by daily run of archive utility
+    # - Exports issues into receiving for the API
+    # Daily run of archiving utility
+    # - Uploads changed files to S3
+    # - Does not export any issues into receiving
     arch_diffs(params, daily_arch_diff)
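
The switch from a single df to df_pull.copy() matters because each (metric, sensor) pass overwrites the "val" column and now drops null rows; reusing one frame would let earlier passes leak their filtering into later ones. A minimal sketch of the hazard the copy avoids:

    import pandas as pd

    df_pull = pd.DataFrame({"a": [1.0, None], "b": [None, 2.0]})
    for metric in ["a", "b"]:
        df = df_pull.copy()           # fresh frame for each metric
        df["val"] = df[metric]
        df = df[~df["val"].isnull()]  # row-dropping stays local to this pass
        print(metric, len(df))        # a 1, b 1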
