-
Notifications
You must be signed in to change notification settings - Fork 16
Extend geocode utility to actually support the state to state mappings #310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7ff30ac
3d8c331
3b6d139
8a1112d
d6f43ae
d8c491a
893aafc
0befb53
e1f96f9
5859982
84422b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -278,11 +278,13 @@ def test_zip_to_state_id(self): | |
def test_add_population_column(self): | ||
gmpr = GeoMapper() | ||
new_data = gmpr.add_population_column(self.fips_data_3, "fips") | ||
assert new_data["population"].sum() == 274963 | ||
assert new_data.shape == (5, 5) | ||
new_data = gmpr.add_population_column(self.zip_data, "zip") | ||
assert new_data["population"].sum() == 274902 | ||
assert new_data.shape == (6, 5) | ||
with pytest.raises(ValueError): | ||
new_data = gmpr.add_population_column(self.zip_data, "hrr") | ||
new_data = gmpr.add_population_column(self.fips_data_5, "fips") | ||
assert new_data.shape == (4, 5) | ||
|
||
def test_add_geocode(self): | ||
gmpr = GeoMapper() | ||
|
@@ -382,13 +384,20 @@ def test_add_geocode(self): | |
new_data2 = gmpr.add_geocode(new_data, "state_code", "hhs_region_number") | ||
assert new_data2["hhs_region_number"].unique().size == 2 | ||
|
||
# state_name -> state_id | ||
new_data = gmpr.replace_geocode(self.zip_data, "zip", "state_name") | ||
new_data2 = gmpr.add_geocode(new_data, "state_name", "state_id") | ||
assert new_data2.shape == (4, 5) | ||
new_data2 = gmpr.replace_geocode(new_data, "state_name", "state_id", new_col="abbr") | ||
assert "abbr" in new_data2.columns | ||
|
||
# fips -> nation | ||
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation") | ||
new_data = gmpr.replace_geocode(self.fips_data_5, "fips", "nation", new_col="NATION") | ||
assert new_data.equals( | ||
pd.DataFrame().from_dict( | ||
{ | ||
"date": {0: pd.Timestamp("2018-01-01 00:00:00")}, | ||
"nation": {0: "us"}, | ||
"NATION": {0: "us"}, | ||
"count": {0: 10024.0}, | ||
"total": {0: 100006.0}, | ||
} | ||
|
@@ -411,6 +420,23 @@ def test_add_geocode(self): | |
) | ||
) | ||
|
||
# hrr -> nation | ||
with pytest.raises(ValueError): | ||
new_data = gmpr.replace_geocode(self.zip_data, "zip", "hrr") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm pretty sure if the first line valueerrors but the second one doesnt, it still passes, so may want to split this into two There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't that have been caught in the zip -> hrr test some lines before that though? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe im understanding this test wrong. I read it as testing that both
and
raise valueerrors. Is that right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's testing the second one, since we should not be mapping hrr -> nation (it's an incomplete mapping, so it's unsupported). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ohhh got it, didn't see the second line calls new_data. Thanks. |
||
new_data2 = gmpr.replace_geocode(new_data, "hrr", "nation") | ||
|
||
# fips -> hrr (dropna=True/False check) | ||
assert not gmpr.add_geocode(self.fips_data_3, "fips", "hrr").isna().any().any() | ||
assert gmpr.add_geocode(self.fips_data_3, "fips", "hrr", dropna=False).isna().any().any() | ||
|
||
# fips -> hrr (date_col=None check) | ||
new_data = gmpr.replace_geocode(self.fips_data_5.drop(columns=["date"]), "fips", "hrr", date_col=None) | ||
assert new_data.equals( | ||
pd.DataFrame().from_dict( | ||
{ | ||
'hrr': {0: '1', 1: '183', 2: '184', 3: '382', 4: '7'}, | ||
'count': {0: 1.772347174163783, 1: 7157.392403522299, 2: 2863.607596477701, 3: 1.0, 4: 0.22765282583621685}, | ||
'total': {0: 3.544694348327566, 1: 71424.64801363471, 2: 28576.35198636529, 3: 1.0, 4: 0.4553056516724337} | ||
} | ||
) | ||
) |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,11 +12,10 @@ | |
import numpy as np | ||
import pandas as pd | ||
|
||
from delphi_utils import read_params | ||
from delphi_utils import read_params, GeoMapper | ||
import covidcast | ||
from .api_config import APIConfig | ||
from .covidnet import CovidNet | ||
from .geo_maps import GeoMaps | ||
from .constants import SIGNALS | ||
|
||
def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str): | ||
|
@@ -49,17 +48,18 @@ def write_to_csv(data: pd.DataFrame, out_name: str, output_path: str): | |
|
||
|
||
def update_sensor( | ||
state_files: List[str], mmwr_info: pd.DataFrame, | ||
output_path: str, static_path: str, | ||
start_date: datetime, end_date: datetime) -> pd.DataFrame: | ||
state_files: List[str], | ||
mmwr_info: pd.DataFrame, | ||
output_path: str, | ||
start_date: datetime, | ||
end_date: datetime) -> pd.DataFrame: | ||
""" | ||
Generate sensor values, and write to csv format. | ||
|
||
Args: | ||
state_files: List of JSON files representing COVID-NET hospitalization data for each state | ||
mmwr_info: Mappings from MMWR week to actual dates, as a pd.DataFrame | ||
output_path: Path to write the csvs to | ||
static_path: Path for the static geographic files | ||
start_date: First sensor date (datetime.datetime) | ||
end_date: Last sensor date (datetime.datetime) | ||
|
||
|
@@ -85,9 +85,15 @@ def update_sensor( | |
] | ||
|
||
# Set state id to two-letter abbreviation | ||
geo_map = GeoMaps(static_path) | ||
hosp_df = geo_map.state_name_to_abbr(hosp_df) | ||
|
||
gmpr = GeoMapper() | ||
hosp_df = gmpr.add_geocode(hosp_df, | ||
from_col=APIConfig.STATE_COL, | ||
from_code="state_name", | ||
new_code="state_id", | ||
dropna=False) | ||
# To use the original column name, reassign original column and drop new one | ||
hosp_df[APIConfig.STATE_COL] = hosp_df["state_id"].str.upper() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chinandrew btw, this may be cdc_covidnet specific, but the state abbreviation in other indicators (like JHU) is assumed to be lower case. Are we sure this is what we want? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For some reason cdc covidnet was uppercase, so i kept it consistent. if it's something we can change, would definitely recommend we standardize, but haven't looked into it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @krivard thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Slacked Katie on this while discussing a similar issue, apparently downstream ingestion will standardize everything and accepts lower or uppercase, so this can be removed and we can just move to lowercase for the indicator code. |
||
hosp_df.drop("state_id", axis=1, inplace=True) | ||
assert not hosp_df.duplicated(["date", "geo_id"]).any(), "Non-unique (date, geo_id) pairs" | ||
hosp_df.set_index(["date", "geo_id"], inplace=True) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any reason not to keep both of these checks?
Also, if it's 5x5 output it may be simpler just to do a direct df comparison on values
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an old test I don't see anymore. Pull again?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was just asking why it was deleted
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah. I decided to move away from tests based on data-derived population counts. I figured the tests should catch whether the underlying logic or arithmetic breaks, not whether the data file changed. Am open to reasons for keeping though.