Skip to content

Fix nchs missing value #535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 18, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 15 additions & 23 deletions nchs_mortality/delphi_nchs_mortality/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,31 +64,23 @@ def pull_nchs_mortality_data(token: str, map_df: pd.DataFrame, test_mode: str):
"schema may have changed. Please investigate and "
"amend the code.") from exc

# Drop rows for locations outside US
df = df[df["state"] != "United States"]
df.loc[df["state"] == "New York City", "state"] = "New York"
df = df.loc[:, keep_columns + ["timestamp", "state"]].set_index("timestamp")

state_list = df["state"].unique()
date_list = df["timestamp"].unique()
index_df = pd.MultiIndex.from_product(
[state_list, date_list], names=['state', 'timestamp']
)
df = df.groupby(
["state", "timestamp"]).sum().reindex(index_df).reset_index()
Comment on lines -75 to -76
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this reindex step was used to fill in missing values by state and time and removing it makes sense given the discussion.
I am not super familiar with the actual NCHS data, but were there cases of duplicated (state, timestamp) where the sum aggregation should still be retained?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you are right! Let me fix this. Thanks!


# Final sanity checks
days_by_states = df.groupby("state").count()["covid_deaths"].unique()
unique_days = df["timestamp"].unique()
# each FIPS has same number of rows
if (len(days_by_states) > 1) or (days_by_states[0] != len(unique_days)):
raise ValueError("Differing number of days by fips")
min_timestamp = min(unique_days)
max_timestamp = max(unique_days)
n_days = (max_timestamp - min_timestamp) / np.timedelta64(1, 'D') / 7 + 1
if n_days != len(unique_days):
raise ValueError(
f"Not every day between {min_timestamp} and "
"{max_timestamp} is represented."
)
# NCHS considers NYC as an individual state; however, we want it included
# in NY. If values are NaN for both NYC and NY, the aggregation should
# also have NaN.
df_ny = df.loc[df["state"] == "New York", :].drop("state", axis=1)
df_nyc = df.loc[df["state"] == "New York City", :].drop("state", axis=1)
# Get mask df to ignore cells where both of them have NAN values
mask = (df_ny[keep_columns].isnull().values \
& df_nyc[keep_columns].isnull().values)
df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
df_ny["state"] = "New York"
# Drop NYC and NY in the full dataset
df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])

# Add population info
keep_columns.extend(["timestamp", "geo_id", "population"])
Expand Down
16 changes: 9 additions & 7 deletions nchs_mortality/delphi_nchs_mortality/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .constants import (METRICS, SENSOR_NAME_MAP,
SENSORS, INCIDENCE_BASE, GEO_RES)

def run_module(): # pylint: disable=too-many-branches,too-many-statements
def run_module():
"""Run module for processing NCHS mortality data."""
params = read_params()
export_start_date = params["export_start_date"]
Expand Down Expand Up @@ -48,6 +48,7 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
df["val"] = df[metric]
df["se"] = np.nan
df["sample_size"] = np.nan
df = df[~df["val"].isnull()]
sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric]])
export_csv(
df,
Expand All @@ -65,6 +66,7 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
df["val"] = df[metric] / df["population"] * INCIDENCE_BASE
df["se"] = np.nan
df["sample_size"] = np.nan
df = df[~df["val"].isnull()]
sensor_name = "_".join(["wip", SENSOR_NAME_MAP[metric], sensor])
export_csv(
df,
Expand All @@ -74,10 +76,10 @@ def run_module(): # pylint: disable=too-many-branches,too-many-statements
sensor=sensor_name,
)

# Weekly run of archive utility on Monday
# - Does not upload to S3, that is handled by daily run of archive utility
# - Exports issues into receiving for the API
# Daily run of archiving utility
# - Uploads changed files to S3
# - Does not export any issues into receiving
# Weekly run of archive utility on Monday
# - Does not upload to S3, that is handled by daily run of archive utility
# - Exports issues into receiving for the API
# Daily run of archiving utility
# - Uploads changed files to S3
# - Does not export any issues into receiving
arch_diffs(params, daily_arch_diff)
18 changes: 7 additions & 11 deletions nchs_mortality/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,13 @@ def test_output_files_exist(self, run_as_module, date):
csv_files = listdir(output_folder)

dates = [
"202016",
"202017",
"202018",
"202019",
"202020",
"202021",
"202022",
"202023",
"202024",
"202025",
"202026",
"202030",
"202031",
"202032",
"202033",
"202034",
"202035",
"202036",
]
metrics = ['deaths_covid_incidence',
'deaths_allcause_incidence',
Expand Down