
Commit fb446a6

Merge pull request #1825 from cmu-delphi/ndefries/pandasv2-fix-tests
Update all indicators for pandas v2
2 parents 1647ea2 + 513a39b commit fb446a6

21 files changed: +104 -66 lines changed

_delphi_utils_python/delphi_utils/flash_eval/eval_day.py

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ def output(evd_ranking, day, lag, signal, logger):
     """
     starter_link = f"{HTML_LINK}{(day+pd.Timedelta(f'{lag}d')).strftime('%Y-%m_%d')}"
     p_text = ""
-    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).iteritems()):
+    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
         if j < 30:
             start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
             p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 3 additions & 3 deletions
@@ -401,9 +401,9 @@ def replace_geocode(
         df.drop("weight", axis=1, inplace=True)

         if not date_col is None:
-            df = df.groupby([date_col, new_col]).sum().reset_index()
+            df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index()
         else:
-            df = df.groupby([new_col]).sum().reset_index()
+            df = df.groupby([new_col]).sum(numeric_only=True).reset_index()
         return df

     def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True):
@@ -501,7 +501,7 @@ def fips_to_megacounty(
         )
         data.set_index([fips_col, date_col], inplace=True)
         data = data.join(mega_data)
-        data = data.reset_index().groupby([date_col, mega_col]).sum()
+        data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True)
         return data.reset_index()

     def as_mapper_name(self, geo_type, state="state_id"):
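In pandas 2.0, groupby(...).sum() no longer silently drops non-numeric columns: the old implicit numeric_only fallback is gone, so string columns get concatenated and non-summable dtypes raise a TypeError. Passing numeric_only=True, as above, restores the previous result. A small sketch with a hypothetical frame:

import pandas as pd

df = pd.DataFrame({
    "geo": ["pa", "pa", "wv"],   # grouping key
    "name": ["x", "y", "z"],     # non-numeric nuisance column
    "count": [1, 2, 3],
})

# pandas < 2.0: the "name" column was silently dropped from the sum.
# pandas >= 2.0: aggregate only the numeric columns explicitly.
print(df.groupby("geo").sum(numeric_only=True))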

_delphi_utils_python/delphi_utils/validator/dynamic.py

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ def replace_first_six(df, start_date):
             start_date = self.params.time_window.start_date)

         if not error_df.empty:
-            for index, value in error_df.iteritems():
+            for index, value in error_df.items():
                 report.add_raised_error(
                     ValidationFailure("check_val_missing",
                                       geo_type=geo_type,

_delphi_utils_python/setup.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
     "mock",
     "moto",
     "numpy",
-    "pandas>=1.1.0,<2",
+    "pandas>=1.1.0",
     "pydocstyle",
     "pylint==2.8.3",
     "pytest",

_delphi_utils_python/tests/test_export.py

Lines changed: 8 additions & 8 deletions
@@ -250,15 +250,15 @@ def test_export_with_null_removal(self):
         """Test that `remove_null_samples = True` removes entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+             pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+             }, index = [0])]
         )

         create_export_csv(
@@ -283,15 +283,15 @@ def test_export_without_null_removal(self):
         """Test that `remove_null_samples = False` does not remove entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+             pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+             }, index = [0])]
         )

         create_export_csv(
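DataFrame.append() is removed in pandas 2.0 (deprecated since 1.4); the replacement is pd.concat over a list of frames, with dict-style rows first wrapped in a one-row DataFrame. A minimal sketch of the pattern, using a hypothetical base frame:

from datetime import datetime
import pandas as pd

base = pd.DataFrame({"geo_id": ["11111"],
                     "timestamp": [datetime(2020, 6, 5)],
                     "val": [5]})

# pandas < 2.0: base.append({...}, ignore_index=True)
# pandas >= 2.0: wrap the row in a DataFrame, then concat.
new_row = pd.DataFrame({"geo_id": "66666",
                        "timestamp": datetime(2020, 6, 6),
                        "val": 10}, index=[0])
df_with_extra = pd.concat([base, new_row], ignore_index=True)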

_delphi_utils_python/tests/test_geomap.py

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ def test_load_fips_chngfips_table(self, geomapper):

     def test_load_jhu_uid_fips_table(self, geomapper):
         jhu_data = geomapper.get_crosswalk(from_code="jhu_uid", to_code="fips")
-        assert np.allclose(jhu_data.groupby("jhu_uid").sum(), 1.0)
+        assert np.allclose(jhu_data.groupby("jhu_uid").sum(numeric_only=True), 1.0)

     def test_load_zip_hrr_table(self, geomapper):
         zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr")

_delphi_utils_python/tests/validator/test_dynamic.py

Lines changed: 2 additions & 2 deletions
@@ -48,7 +48,7 @@ def test_half_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates - so the last 5 dates
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-11",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 11
         assert new_ref_df["val"].iloc[5] == 2
@@ -71,7 +71,7 @@ def test_full_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates up to the day before the reference
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-15",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-15",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 15
         assert new_ref_df["val"].iloc[5] == 2

changehc/delphi_changehc/load_data.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ def load_chng_data(filepath, dropdate, base_geo,
     ), "Counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    data = data.groupby([base_geo, Config.DATE_COL]).sum()
+    data = data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     data.dropna(inplace=True) # drop rows with any missing entries

     return data

changehc/tests/test_update_sensor.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def test_geo_reindex(self):
             "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
         data_frame = su_inst.geo_reindex(test_data)
         assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
-        assert (data_frame.sum() == (4200,19000)).all()
+        assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()

     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""

claims_hosp/delphi_claims_hosp/load_data.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
     ), "Claims counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
+    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     claims_data.dropna(inplace=True) # drop rows with any missing entries

     return claims_data

doctor_visits/delphi_doctor_visits/geo_maps.py

Lines changed: 4 additions & 4 deletions
@@ -49,7 +49,7 @@ def county_to_msa(self, data):
                                        from_col="PatCountyFIPS",
                                        new_col="cbsa_id")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "cbsa_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("cbsa_id"), "cbsa_id"

@@ -66,7 +66,7 @@ def county_to_state(self, data):
                                        "state_id",
                                        from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "state_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("state_id"), "state_id"

@@ -83,7 +83,7 @@ def county_to_hhs(self, data):
                                        "hhs",
                                        from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "hhs"]).sum(numeric_only=True).reset_index()

         return data.groupby("hhs"), "hhs"

@@ -100,7 +100,7 @@ def county_to_nation(self, data):
                                        "nation",
                                        from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "nation"]).sum(numeric_only=True).reset_index()

         return data.groupby("nation"), "nation"

doctor_visits/delphi_doctor_visits/sensor.py

Lines changed: 4 additions & 3 deletions
@@ -60,16 +60,17 @@ def fill_dates(y_data, dates):
     last_date = dates[-1]
     cols = y_data.columns

+    df_list = [y_data]
     if first_date not in y_data.index:
-        y_data = y_data.append(
+        df_list.append(
             pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date])
         )
     if last_date not in y_data.index:
-        y_data = y_data.append(
+        df_list.append(
             pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date])
         )

-    y_data.sort_index(inplace=True)
+    y_data = pd.concat(df_list).sort_index()
     y_data = y_data.asfreq("D", fill_value=0)
     return y_data
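With append gone, the loop-and-append idiom becomes collect-then-concat: gather the pieces in a Python list and call pd.concat once at the end, which also avoids copying the accumulated frame on every iteration. A small sketch of the idiom with hypothetical pieces:

import pandas as pd

pieces = [pd.DataFrame({"val": [float(i)]}, index=[i]) for i in (2, 0, 1)]

# pandas < 2.0 anti-pattern: out = out.append(piece) inside a loop.
# pandas >= 2.0: one concat at the end, then sort the index.
out = pd.concat(pieces).sort_index()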

doctor_visits/delphi_doctor_visits/update_sensor.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def update_sensor(
     data.dropna(inplace=True) # drop rows with any missing entries

     # aggregate age groups (so data is unique by service date and FIPS)
-    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum().reset_index()
+    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
     assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
     assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"

dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 1 addition & 0 deletions
@@ -701,6 +701,7 @@ def generate_prop_signal(df, geo, geo_mapper):
     ).groupby(
         geo
     ).sum(
+        numeric_only=True
     ).reset_index(
     )
     df = pd.merge(df, map_df, left_on="geo_id", right_on=geo, how="inner")

dsew_community_profile/tests/test_pull.py

Lines changed: 14 additions & 7 deletions
@@ -240,8 +240,8 @@ def test_nation_from_state(self):
             'sample_size': [None, None],
             'publish_date': [datetime(year=2020, month=1, day=1)]*2,})

-        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
+        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"].iloc[0])
         tot_pop = pa_pop + wv_pop

         assert True, nation_from_state(
@@ -285,7 +285,14 @@ def test_generate_prop_signal_msa(self):
         geomapper = GeoMapper()
         county_pop = geomapper.get_crosswalk("fips", "pop")
         county_msa = geomapper.get_crosswalk("fips", "msa")
-        msa_pop = county_pop.merge(county_msa, on="fips", how="inner").groupby("msa").sum().reset_index()
+        msa_pop = county_pop.merge(
+            county_msa, on="fips", how="inner"
+        ).groupby(
+            "msa"
+        ).sum(
+            numeric_only=True
+        ).reset_index(
+        )

         test_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -294,8 +301,8 @@ def test_generate_prop_signal_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})

-        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"])
-        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"])
+        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"].iloc[0])
+        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"].iloc[0])

         expected_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -342,8 +349,8 @@ def test_generate_prop_signal_non_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})

-        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"])
-        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"])
+        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"].iloc[0])
+        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"].iloc[0])

         expected_df = pd.DataFrame({
             'geo_id': settings["geo_names"],
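Converting a single-element Series to a scalar with int(...) is deprecated in pandas 2.x; selecting the scalar first with .iloc[0] (or .item()) is the forward-compatible spelling used throughout this commit. A minimal sketch with made-up population values:

import pandas as pd

state_pop = pd.DataFrame({"state_id": ["pa", "wv"],
                          "pop": [13000000, 1800000]})   # hypothetical values

# Deprecated: int(state_pop.loc[state_pop.state_id == "pa", "pop"])
# Select the single element explicitly before converting:
pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])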

hhs_hosp/tests/test_run.py

Lines changed: 2 additions & 2 deletions
@@ -100,8 +100,8 @@ def test_transform_signal_pop():
         'timestamp': [datetime(year=2020, month=1, day=1)]*2,
         'val': [15., 150.],})

-    pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-    wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+    pa_pop = int(state_pop[state_pop.state_id == "pa"]["pop"].iloc[0])
+    wv_pop = int(state_pop[state_pop.state_id == "wv"]["pop"].iloc[0])
     pd.testing.assert_frame_equal(
         transform_signal(
             CONFIRMED_PROP,

nchs_mortality/delphi_nchs_mortality/pull.py

Lines changed: 10 additions & 2 deletions
@@ -108,11 +108,19 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     # Get mask df to ignore cells where both of them have NAN values
     mask = (df_ny[keep_columns].isnull().values \
             & df_nyc[keep_columns].isnull().values)
-    df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+    df_ny = pd.concat(
+        [df_ny, df_nyc]
+    ).groupby(
+        "timestamp"
+    ).sum(
+        numeric_only=True
+    ).where(
+        ~mask, np.nan
+    )
     df_ny["state"] = "New York"
     # Drop NYC and NY in the full dataset
     df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
-    df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])
+    df = pd.concat([df, df_ny]).reset_index().sort_values(["state", "timestamp"])
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
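The NY/NYC merge combines three migration idioms: pd.concat in place of the removed append, an explicit numeric_only=True, and a where mask that restores NaN wherever both inputs were missing (so NaN + NaN does not become 0). A small sketch with hypothetical frames, grouping on the index rather than a "timestamp" column:

import numpy as np
import pandas as pd

dates = pd.to_datetime(["2020-01-01", "2020-01-02"])
df_ny = pd.DataFrame({"deaths": [1.0, np.nan]}, index=dates)
df_nyc = pd.DataFrame({"deaths": [2.0, np.nan]}, index=dates)

# Cells where BOTH inputs are NaN should stay NaN instead of summing to 0.
mask = df_ny.isnull().values & df_nyc.isnull().values
combined = (pd.concat([df_ny, df_nyc])
            .groupby(level=0)
            .sum(numeric_only=True)
            .where(~mask, np.nan))
print(combined)   # 2020-01-01 -> 3.0, 2020-01-02 -> NaN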

quidel_covidtest/delphi_quidel_covidtest/data_tools.py

Lines changed: 9 additions & 5 deletions
@@ -30,14 +30,18 @@ def fill_dates(y_data, first_date, last_date):
     Returns: dataframe containing all dates given
     """
     cols = y_data.columns
+
+    df_list = [y_data]
     if first_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                               columns=cols, index=[first_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[first_date])
+        )
     if last_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                               columns=cols, index=[last_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[last_date])
+        )

-    y_data.sort_index(inplace=True)
+    y_data = pd.concat(df_list).sort_index()
     y_data = y_data.asfreq('D', fill_value=0)
     y_data.fillna(0, inplace=True)
     return y_data

quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py

Lines changed: 20 additions & 14 deletions
@@ -27,8 +27,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
     Returns:
         df: pd.DataFrame
     """
-    state_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size", "timestamp"])
     state_list = list(state_groups.groups.keys())
+    df_list = []
     for state in state_list:
         state_group = state_groups.get_group(state)
         state_group = state_group.drop(columns=[res_key])
@@ -63,12 +63,15 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
             stat = stat * 100

         se = se * 100
-        state_df = state_df.append(pd.DataFrame({"geo_id": state,
-                                                 "timestamp": state_group.index,
-                                                 "val": stat,
-                                                 "se": se,
-                                                 "sample_size": sample_size}))
-    return remove_null_samples(state_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": state,
+                          "timestamp": state_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))

 def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
                                    device, first_date, last_date, suffix):
@@ -88,9 +91,9 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
         df: pd.DataFrame
     """
     has_parent = True
-    res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
     if res_key == "fips": # Add rest-of-state report for county level
         data = add_megacounties(data, smooth)
+    df_list = []
     for loc, res_group in data.groupby(res_key):
         parent_state = res_group['state_id'].values[0]
         try:
@@ -147,9 +150,12 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
             stat = stat * 100

         se = se * 100
-        res_df = res_df.append(pd.DataFrame({"geo_id": loc,
-                                             "timestamp": res_group.index,
-                                             "val": stat,
-                                             "se": se,
-                                             "sample_size": sample_size}))
-    return remove_null_samples(res_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": loc,
+                          "timestamp": res_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))

quidel_covidtest/delphi_quidel_covidtest/geo_maps.py

Lines changed: 1 addition & 1 deletion
@@ -88,5 +88,5 @@ def add_parent_state(data, geo_res, geo_key):
     # Merge the info of parent state to the data
     data = data.merge(mix_map, how="left", on=geo_key).drop(
         columns=["population"]).dropna()
-    data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
+    data = data.groupby(["timestamp", geo_key, "state_id"]).sum(numeric_only=True).reset_index()
     return data
