
Update all indicators for pandas v2 #1825

Merged · 15 commits · Apr 11, 2023
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/flash_eval/eval_day.py
@@ -147,7 +147,7 @@ def output(evd_ranking, day, lag, signal, logger):
"""
starter_link = f"{HTML_LINK}{(day+pd.Timedelta(f'{lag}d')).strftime('%Y-%m_%d')}"
p_text = ""
- for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).iteritems()):
+ for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
if j < 30:
start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"
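
Note: `Series.iteritems()` was deprecated in pandas 1.5 and removed in 2.0; `Series.items()` is the drop-in replacement and yields the same `(index, value)` pairs. A minimal sketch of the pattern above, with an illustrative ranking series:

```python
import pandas as pd

evd_ranking = pd.Series({"sig_a": 0.91, "sig_b": 0.42, "sig_c": 0.77})

# items() yields (index, value) pairs, exactly as iteritems() did
for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
    print(j, index, "{:.2f}".format(value))
```
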
6 changes: 3 additions & 3 deletions _delphi_utils_python/delphi_utils/geomap.py
@@ -401,9 +401,9 @@ def replace_geocode(
df.drop("weight", axis=1, inplace=True)

if not date_col is None:
- df = df.groupby([date_col, new_col]).sum().reset_index()
+ df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index()
else:
- df = df.groupby([new_col]).sum().reset_index()
+ df = df.groupby([new_col]).sum(numeric_only=True).reset_index()
return df

def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True):
@@ -501,7 +501,7 @@ def fips_to_megacounty(
)
data.set_index([fips_col, date_col], inplace=True)
data = data.join(mega_data)
- data = data.reset_index().groupby([date_col, mega_col]).sum()
+ data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True)
return data.reset_index()

def as_mapper_name(self, geo_type, state="state_id"):
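
Note: pandas 2.0 stopped silently dropping non-numeric "nuisance" columns in `groupby(...).sum()`; depending on dtype the columns are now aggregated anyway (string columns get concatenated) or the operation raises a `TypeError`. Passing `numeric_only=True` restores the old drop-what-cannot-be-summed behavior, which is why it is threaded through every aggregation in this PR. A minimal sketch with made-up columns:

```python
import pandas as pd

df = pd.DataFrame({
    "fips":  ["01001", "01001", "01003"],
    "state": ["al", "al", "al"],   # non-numeric column
    "count": [10, 5, 7],
})

# Without numeric_only=True, pandas 2.0 would "sum" the string column
# (producing "alal" for fips 01001) instead of dropping it.
out = df.groupby("fips").sum(numeric_only=True)
print(out)  # count only: 15 and 7
```
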
2 changes: 1 addition & 1 deletion _delphi_utils_python/delphi_utils/validator/dynamic.py
@@ -195,7 +195,7 @@ def replace_first_six(df, start_date):
start_date = self.params.time_window.start_date)

if not error_df.empty:
- for index, value in error_df.iteritems():
+ for index, value in error_df.items():
report.add_raised_error(
ValidationFailure("check_val_missing",
geo_type=geo_type,
2 changes: 1 addition & 1 deletion _delphi_utils_python/setup.py
@@ -14,7 +14,7 @@
"mock",
"moto",
"numpy",
"pandas>=1.1.0,<2",
"pandas>=1.1.0",
"pydocstyle",
"pylint==2.8.3",
"pytest",
16 changes: 8 additions & 8 deletions _delphi_utils_python/tests/test_export.py
@@ -250,15 +250,15 @@ def test_export_with_null_removal(self):
"""Test that `remove_null_samples = True` removes entries with null samples."""
_clean_directory(self.TEST_DIR)

- df_with_nulls = self.DF.copy().append(
- {
+ df_with_nulls = pd.concat(
+ [self.DF.copy(),
+ pd.DataFrame({
"geo_id": "66666",
"timestamp": datetime(2020, 6, 6),
"val": 10,
"se": 0.2,
"sample_size": pd.NA,
- },
- ignore_index=True,
+ }, index = [0])]
)

create_export_csv(
@@ -283,15 +283,15 @@ def test_export_without_null_removal(self):
"""Test that `remove_null_samples = False` does not remove entries with null samples."""
_clean_directory(self.TEST_DIR)

- df_with_nulls = self.DF.copy().append(
- {
+ df_with_nulls = pd.concat(
+ [self.DF.copy(),
+ pd.DataFrame({
"geo_id": "66666",
"timestamp": datetime(2020, 6, 6),
"val": 10,
"se": 0.2,
"sample_size": pd.NA,
- },
- ignore_index=True,
+ }, index = [0])]
)

create_export_csv(
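
Note: `DataFrame.append()` was deprecated in pandas 1.4 and removed in 2.0. The replacement wraps the new row in a one-row `DataFrame` (`index=[0]` lets the scalar values broadcast) and concatenates. A minimal sketch with an illustrative frame:

```python
from datetime import datetime

import pandas as pd

df = pd.DataFrame({
    "geo_id": ["01000"],
    "timestamp": [datetime(2020, 6, 5)],
    "val": [1.0],
    "se": [0.1],
    "sample_size": [10.0],
})

# Old (removed in pandas 2.0): df.append({...}, ignore_index=True)
new_row = pd.DataFrame({
    "geo_id": "66666",
    "timestamp": datetime(2020, 6, 6),
    "val": 10,
    "se": 0.2,
    "sample_size": pd.NA,
}, index=[0])

df_with_nulls = pd.concat([df, new_row])
```
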
2 changes: 1 addition & 1 deletion _delphi_utils_python/tests/test_geomap.py
@@ -196,7 +196,7 @@ def test_load_fips_chngfips_table(self, geomapper):

def test_load_jhu_uid_fips_table(self, geomapper):
jhu_data = geomapper.get_crosswalk(from_code="jhu_uid", to_code="fips")
- assert np.allclose(jhu_data.groupby("jhu_uid").sum(), 1.0)
+ assert np.allclose(jhu_data.groupby("jhu_uid").sum(numeric_only=True), 1.0)

def test_load_zip_hrr_table(self, geomapper):
zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr")
4 changes: 2 additions & 2 deletions _delphi_utils_python/tests/validator/test_dynamic.py
@@ -48,7 +48,7 @@ def test_half_padding(self):
ref_df, test_df, ref_date, ref_date)

# Check it only takes missing dates - so the last 5 dates
- assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11",
+ assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-11",
"%Y-%m-%d").date()
assert new_ref_df.shape[0] == 11
assert new_ref_df["val"].iloc[5] == 2
@@ -71,7 +71,7 @@ def test_full_padding(self):
ref_df, test_df, ref_date, ref_date)

# Check it only takes missing dates up to the day before the reference
- assert new_ref_df.time_value.max() == datetime.strptime("2021-01-15",
+ assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-15",
"%Y-%m-%d").date()
assert new_ref_df.shape[0] == 15
assert new_ref_df["val"].iloc[5] == 2
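
Note: pandas 2.0 changed `Timestamp` comparisons against `datetime.date` objects to match the stdlib: they now compare as unequal, where older pandas coerced the `date` to a midnight `Timestamp` and could return `True`. The tests therefore convert the `Timestamp` with `.date()` before comparing. A minimal sketch:

```python
from datetime import datetime

import pandas as pd

ts = pd.Timestamp("2021-01-11")
d = datetime.strptime("2021-01-11", "%Y-%m-%d").date()

# pandas < 2.0: ts == d evaluated True; pandas >= 2.0: False.
# Comparing date to date is unambiguous on both:
assert ts.date() == d
```
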
2 changes: 1 addition & 1 deletion changehc/delphi_changehc/load_data.py
@@ -71,7 +71,7 @@ def load_chng_data(filepath, dropdate, base_geo,
), "Counts must be nonnegative"

# aggregate age groups (so data is unique by date and base geography)
- data = data.groupby([base_geo, Config.DATE_COL]).sum()
+ data = data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
data.dropna(inplace=True) # drop rows with any missing entries

return data
2 changes: 1 addition & 1 deletion changehc/tests/test_update_sensor.py
@@ -91,7 +91,7 @@ def test_geo_reindex(self):
"timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
data_frame = su_inst.geo_reindex(test_data)
assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
- assert (data_frame.sum() == (4200,19000)).all()
+ assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()

def test_update_sensor(self):
"""Tests that the sensors are properly updated."""
2 changes: 1 addition & 1 deletion claims_hosp/delphi_claims_hosp/load_data.py
@@ -47,7 +47,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
), "Claims counts must be nonnegative"

# aggregate age groups (so data is unique by date and base geography)
- claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
+ claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
claims_data.dropna(inplace=True) # drop rows with any missing entries

return claims_data
8 changes: 4 additions & 4 deletions doctor_visits/delphi_doctor_visits/geo_maps.py
@@ -49,7 +49,7 @@ def county_to_msa(self, data):
from_col="PatCountyFIPS",
new_col="cbsa_id")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
data = data.groupby(["ServiceDate", "cbsa_id"]).sum(numeric_only=True).reset_index()

return data.groupby("cbsa_id"), "cbsa_id"

@@ -66,7 +66,7 @@ def county_to_state(self, data):
"state_id",
from_col="PatCountyFIPS")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
data = data.groupby(["ServiceDate", "state_id"]).sum(numeric_only=True).reset_index()

return data.groupby("state_id"), "state_id"

@@ -83,7 +83,7 @@ def county_to_hhs(self, data):
"hhs",
from_col="PatCountyFIPS")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
data = data.groupby(["ServiceDate", "hhs"]).sum(numeric_only=True).reset_index()

return data.groupby("hhs"), "hhs"

@@ -100,7 +100,7 @@ def county_to_nation(self, data):
"nation",
from_col="PatCountyFIPS")
data.drop(columns="PatCountyFIPS", inplace=True)
data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
data = data.groupby(["ServiceDate", "nation"]).sum(numeric_only=True).reset_index()

return data.groupby("nation"), "nation"

7 changes: 4 additions & 3 deletions doctor_visits/delphi_doctor_visits/sensor.py
@@ -60,16 +60,17 @@ def fill_dates(y_data, dates):
last_date = dates[-1]
cols = y_data.columns

+ df_list = [y_data]
if first_date not in y_data.index:
- y_data = y_data.append(
+ df_list.append(
pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date])
)
if last_date not in y_data.index:
- y_data = y_data.append(
+ df_list.append(
pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date])
)

- y_data.sort_index(inplace=True)
+ y_data = pd.concat(df_list).sort_index()
y_data = y_data.asfreq("D", fill_value=0)
return y_data

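
Note: with `append()` gone, the rewritten `fill_dates` collects the padding rows in a list and concatenates once, which is also cheaper than growing a DataFrame incrementally. A self-contained sketch of the new pattern (toy data, same logic as the diff above):

```python
import pandas as pd

def fill_dates(y_data: pd.DataFrame, dates) -> pd.DataFrame:
    """Ensure y_data's index spans dates[0]..dates[-1] at daily frequency."""
    first_date, last_date = dates[0], dates[-1]
    cols = y_data.columns

    df_list = [y_data]
    if first_date not in y_data.index:
        df_list.append(pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date]))
    if last_date not in y_data.index:
        df_list.append(pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date]))

    # one concat at the end instead of repeated append() calls
    return pd.concat(df_list).sort_index().asfreq("D", fill_value=0)

y = pd.DataFrame({"visits": [5.0]}, index=pd.to_datetime(["2020-03-02"]))
print(fill_dates(y, pd.date_range("2020-03-01", "2020-03-04")))
```
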
2 changes: 1 addition & 1 deletion doctor_visits/delphi_doctor_visits/update_sensor.py
@@ -101,7 +101,7 @@ def update_sensor(
data.dropna(inplace=True) # drop rows with any missing entries

# aggregate age groups (so data is unique by service date and FIPS)
- data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum().reset_index()
+ data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"

@@ -701,6 +701,7 @@ def generate_prop_signal(df, geo, geo_mapper):
).groupby(
geo
).sum(
+ numeric_only=True
).reset_index(
)
df = pd.merge(df, map_df, left_on="geo_id", right_on=geo, how="inner")
21 changes: 14 additions & 7 deletions dsew_community_profile/tests/test_pull.py
@@ -240,8 +240,8 @@ def test_nation_from_state(self):
'sample_size': [None, None],
'publish_date': [datetime(year=2020, month=1, day=1)]*2,})

- pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
- wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+ pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
+ wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"].iloc[0])
Comment on lines +243 to +244 (Contributor): wacky. good catch!

tot_pop = pa_pop + wv_pop

assert True, nation_from_state(
@@ -285,7 +285,14 @@ def test_generate_prop_signal_msa(self):
geomapper = GeoMapper()
county_pop = geomapper.get_crosswalk("fips", "pop")
county_msa = geomapper.get_crosswalk("fips", "msa")
- msa_pop = county_pop.merge(county_msa, on="fips", how="inner").groupby("msa").sum().reset_index()
+ msa_pop = county_pop.merge(
+     county_msa, on="fips", how="inner"
+ ).groupby(
+     "msa"
+ ).sum(
+     numeric_only=True
+ ).reset_index(
+ )

test_df = pd.DataFrame({
'geo_id': ['35620', '31080'],
Expand All @@ -294,8 +301,8 @@ def test_generate_prop_signal_msa(self):
'se': [None, None],
'sample_size': [None, None],})

- nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"])
- la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"])
+ nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"].iloc[0])
+ la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"].iloc[0])

expected_df = pd.DataFrame({
'geo_id': ['35620', '31080'],
@@ -342,8 +349,8 @@ def test_generate_prop_signal_non_msa(self):
'se': [None, None],
'sample_size': [None, None],})

- pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"])
- pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"])
+ pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"].iloc[0])
+ pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"].iloc[0])

expected_df = pd.DataFrame({
'geo_id': settings["geo_names"],
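
Note: boolean-mask `.loc` selection returns a one-element `Series`, not a scalar, and relying on `int(<Series>)` to unwrap it depends on behavior deprecated in recent pandas releases; selecting the scalar explicitly with `.iloc[0]` works the same on every version. A minimal sketch (the populations are illustrative):

```python
import pandas as pd

state_pop = pd.DataFrame({"state_id": ["pa", "wv"], "pop": [13_000_000, 1_790_000]})

# .loc[mask, "pop"] is a length-1 Series; unwrap it before casting
pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"].iloc[0])
print(pa_pop + wv_pop)
```
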
4 changes: 2 additions & 2 deletions hhs_hosp/tests/test_run.py
@@ -100,8 +100,8 @@ def test_transform_signal_pop():
'timestamp': [datetime(year=2020, month=1, day=1)]*2,
'val': [15., 150.],})

- pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
- wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+ pa_pop = int(state_pop[state_pop.state_id == "pa"]["pop"].iloc[0])
+ wv_pop = int(state_pop[state_pop.state_id == "wv"]["pop"].iloc[0])
pd.testing.assert_frame_equal(
transform_signal(
CONFIRMED_PROP,
12 changes: 10 additions & 2 deletions nchs_mortality/delphi_nchs_mortality/pull.py
@@ -108,11 +108,19 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
# Get mask df to ignore cells where both of them have NAN values
mask = (df_ny[keep_columns].isnull().values \
& df_nyc[keep_columns].isnull().values)
- df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+ df_ny = pd.concat(
+     [df_ny, df_nyc]
+ ).groupby(
+     "timestamp"
+ ).sum(
+     numeric_only=True
+ ).where(
+     ~mask, np.nan
+ )
df_ny["state"] = "New York"
# Drop NYC and NY in the full dataset
df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
- df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])
+ df = pd.concat([df, df_ny]).reset_index().sort_values(["state", "timestamp"])
# Add population info
keep_columns.extend(["timestamp", "geo_id", "population"])
gmpr = GeoMapper()
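
Note: the NY/NYC merge above keeps a cell as NaN only when both sources are missing: the mask is computed first, the two frames are concatenated and summed by date (`sum()` treats NaN as 0), and `.where(~mask, np.nan)` reinstates the genuinely missing cells. A minimal sketch with toy counts (the real code groups on a "timestamp" column rather than the index):

```python
import numpy as np
import pandas as pd

dates = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
df_ny = pd.DataFrame({"deaths": [3.0, np.nan, np.nan]}, index=dates)
df_nyc = pd.DataFrame({"deaths": [5.0, 2.0, np.nan]}, index=dates)

# NaN only where *both* sources are missing
mask = df_ny.isnull().values & df_nyc.isnull().values

df_combined = (
    pd.concat([df_ny, df_nyc])
    .groupby(level=0)
    .sum(numeric_only=True)   # NaN treated as 0 here...
    .where(~mask, np.nan)     # ...so put the all-missing NaNs back
)
print(df_combined)  # 8.0, 2.0, NaN
```
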
14 changes: 9 additions & 5 deletions quidel_covidtest/delphi_quidel_covidtest/data_tools.py
@@ -30,14 +30,18 @@ def fill_dates(y_data, first_date, last_date):
Returns: dataframe containing all dates given
"""
cols = y_data.columns

+ df_list = [y_data]
if first_date not in y_data.index:
- y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-     columns=cols, index=[first_date]))
+ df_list.append(
+     pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[first_date])
+ )
if last_date not in y_data.index:
- y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-     columns=cols, index=[last_date]))
+ df_list.append(
+     pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[last_date])
+ )

- y_data.sort_index(inplace=True)
+ y_data = pd.concat(df_list).sort_index()
y_data = y_data.asfreq('D', fill_value=0)
y_data.fillna(0, inplace=True)
return y_data
34 changes: 20 additions & 14 deletions quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
@@ -27,8 +27,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
Returns:
df: pd.DataFrame
"""
- state_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size", "timestamp"])
state_list = list(state_groups.groups.keys())
+ df_list = []
for state in state_list:
state_group = state_groups.get_group(state)
state_group = state_group.drop(columns=[res_key])
@@ -63,12 +63,15 @@
stat = stat * 100

se = se * 100
- state_df = state_df.append(pd.DataFrame({"geo_id": state,
-     "timestamp": state_group.index,
-     "val": stat,
-     "se": se,
-     "sample_size": sample_size}))
- return remove_null_samples(state_df)
+ df_list.append(
+     pd.DataFrame({"geo_id": state,
+                   "timestamp": state_group.index,
+                   "val": stat,
+                   "se": se,
+                   "sample_size": sample_size})
+ )

+ return remove_null_samples(pd.concat(df_list))

def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
device, first_date, last_date, suffix):
@@ -88,9 +91,9 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
df: pd.DataFrame
"""
has_parent = True
- res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
if res_key == "fips": # Add rest-of-state report for county level
data = add_megacounties(data, smooth)
+ df_list = []
for loc, res_group in data.groupby(res_key):
parent_state = res_group['state_id'].values[0]
try:
@@ -147,9 +150,12 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
stat = stat * 100

se = se * 100
- res_df = res_df.append(pd.DataFrame({"geo_id": loc,
-     "timestamp": res_group.index,
-     "val": stat,
-     "se": se,
-     "sample_size": sample_size}))
- return remove_null_samples(res_df)
+ df_list.append(
+     pd.DataFrame({"geo_id": loc,
+                   "timestamp": res_group.index,
+                   "val": stat,
+                   "se": se,
+                   "sample_size": sample_size})
+ )

+ return remove_null_samples(pd.concat(df_list))
2 changes: 1 addition & 1 deletion quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
@@ -88,5 +88,5 @@ def add_parent_state(data, geo_res, geo_key):
# Merge the info of parent state to the data
data = data.merge(mix_map, how="left", on=geo_key).drop(
columns=["population"]).dropna()
data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
data = data.groupby(["timestamp", geo_key, "state_id"]).sum(numeric_only=True).reset_index()
return data