diff --git a/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py b/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
index 373b486d6..660fca042 100644
--- a/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
+++ b/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
@@ -147,7 +147,7 @@ def output(evd_ranking, day, lag, signal, logger):
     """
     starter_link = f"{HTML_LINK}{(day+pd.Timedelta(f'{lag}d')).strftime('%Y-%m_%d')}"
     p_text = ""
-    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).iteritems()):
+    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
         if j < 30:
             start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
             p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"
diff --git a/_delphi_utils_python/delphi_utils/geomap.py b/_delphi_utils_python/delphi_utils/geomap.py
index d5446d1ea..4782798a0 100644
--- a/_delphi_utils_python/delphi_utils/geomap.py
+++ b/_delphi_utils_python/delphi_utils/geomap.py
@@ -401,9 +401,9 @@ def replace_geocode(
             df.drop("weight", axis=1, inplace=True)

         if not date_col is None:
-            df = df.groupby([date_col, new_col]).sum().reset_index()
+            df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index()
         else:
-            df = df.groupby([new_col]).sum().reset_index()
+            df = df.groupby([new_col]).sum(numeric_only=True).reset_index()
         return df

     def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True):
@@ -501,7 +501,7 @@ def fips_to_megacounty(
         )
         data.set_index([fips_col, date_col], inplace=True)
         data = data.join(mega_data)
-        data = data.reset_index().groupby([date_col, mega_col]).sum()
+        data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True)
         return data.reset_index()

     def as_mapper_name(self, geo_type, state="state_id"):
diff --git a/_delphi_utils_python/delphi_utils/validator/dynamic.py b/_delphi_utils_python/delphi_utils/validator/dynamic.py
index 7e3524779..6beb5712b 100644
--- a/_delphi_utils_python/delphi_utils/validator/dynamic.py
+++ b/_delphi_utils_python/delphi_utils/validator/dynamic.py
@@ -195,7 +195,7 @@ def replace_first_six(df, start_date):
                                       start_date = self.params.time_window.start_date)

         if not error_df.empty:
-            for index, value in error_df.iteritems():
+            for index, value in error_df.items():
                 report.add_raised_error(
                     ValidationFailure("check_val_missing",
                                       geo_type=geo_type,
diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py
index 2cc6a2cb0..0c8c0951a 100644
--- a/_delphi_utils_python/setup.py
+++ b/_delphi_utils_python/setup.py
@@ -14,7 +14,7 @@
     "mock",
    "moto",
     "numpy",
-    "pandas>=1.1.0,<2",
+    "pandas>=1.1.0",
     "pydocstyle",
     "pylint==2.8.3",
     "pytest",
diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py
index 6076f362d..04cd2f6ce 100644
--- a/_delphi_utils_python/tests/test_export.py
+++ b/_delphi_utils_python/tests/test_export.py
@@ -250,15 +250,15 @@ def test_export_with_null_removal(self):
         """Test that `remove_null_samples = True` removes entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+            pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+            }, index = [0])]
         )

         create_export_csv(
@@ -283,15 +283,15 @@ def test_export_without_null_removal(self):
         """Test that `remove_null_samples = False` does not remove entries with null samples."""
         _clean_directory(self.TEST_DIR)

-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+            pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+            }, index = [0])]
         )

         create_export_csv(
diff --git a/_delphi_utils_python/tests/test_geomap.py b/_delphi_utils_python/tests/test_geomap.py
index d10b4d493..78fccca77 100644
--- a/_delphi_utils_python/tests/test_geomap.py
+++ b/_delphi_utils_python/tests/test_geomap.py
@@ -196,7 +196,7 @@ def test_load_fips_chngfips_table(self, geomapper):

     def test_load_jhu_uid_fips_table(self, geomapper):
         jhu_data = geomapper.get_crosswalk(from_code="jhu_uid", to_code="fips")
-        assert np.allclose(jhu_data.groupby("jhu_uid").sum(), 1.0)
+        assert np.allclose(jhu_data.groupby("jhu_uid").sum(numeric_only=True), 1.0)

     def test_load_zip_hrr_table(self, geomapper):
         zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr")
diff --git a/_delphi_utils_python/tests/validator/test_dynamic.py b/_delphi_utils_python/tests/validator/test_dynamic.py
index 248e1b2aa..45d5a15d0 100644
--- a/_delphi_utils_python/tests/validator/test_dynamic.py
+++ b/_delphi_utils_python/tests/validator/test_dynamic.py
@@ -48,7 +48,7 @@ def test_half_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates - so the last 5 dates
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-11",
             "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 11
         assert new_ref_df["val"].iloc[5] == 2
@@ -71,7 +71,7 @@ def test_full_padding(self):
             ref_df, test_df, ref_date, ref_date)

         # Check it only takes missing dates up to the day before the reference
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-15",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-15",
             "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 15
         assert new_ref_df["val"].iloc[5] == 2
diff --git a/changehc/delphi_changehc/load_data.py b/changehc/delphi_changehc/load_data.py
index c22c2483a..c4a5f1e9c 100644
--- a/changehc/delphi_changehc/load_data.py
+++ b/changehc/delphi_changehc/load_data.py
@@ -71,7 +71,7 @@ def load_chng_data(filepath, dropdate, base_geo,
     ), "Counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    data = data.groupby([base_geo, Config.DATE_COL]).sum()
+    data = data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     data.dropna(inplace=True)  # drop rows with any missing entries

     return data
diff --git a/changehc/tests/test_update_sensor.py b/changehc/tests/test_update_sensor.py
index 318437b92..999fed7e8 100644
--- a/changehc/tests/test_update_sensor.py
+++ b/changehc/tests/test_update_sensor.py
@@ -91,7 +91,7 @@ def test_geo_reindex(self):
             "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
         data_frame = su_inst.geo_reindex(test_data)
         assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
-        assert (data_frame.sum() == (4200,19000)).all()
+        assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()

     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""
diff --git a/claims_hosp/delphi_claims_hosp/load_data.py b/claims_hosp/delphi_claims_hosp/load_data.py
index c2ee07e74..010d9d61b 100644
--- a/claims_hosp/delphi_claims_hosp/load_data.py
+++ b/claims_hosp/delphi_claims_hosp/load_data.py
@@ -47,7 +47,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
     ), "Claims counts must be nonnegative"

     # aggregate age groups (so data is unique by date and base geography)
-    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
+    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     claims_data.dropna(inplace=True)  # drop rows with any missing entries

     return claims_data
diff --git a/doctor_visits/delphi_doctor_visits/geo_maps.py b/doctor_visits/delphi_doctor_visits/geo_maps.py
index 716e8899d..8c72f6bc4 100644
--- a/doctor_visits/delphi_doctor_visits/geo_maps.py
+++ b/doctor_visits/delphi_doctor_visits/geo_maps.py
@@ -49,7 +49,7 @@ def county_to_msa(self, data):
                                     from_col="PatCountyFIPS",
                                     new_col="cbsa_id")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "cbsa_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "cbsa_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("cbsa_id"), "cbsa_id"

@@ -66,7 +66,7 @@ def county_to_state(self, data):
                                     "state_id",
                                     from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "state_id"]).sum(numeric_only=True).reset_index()

         return data.groupby("state_id"), "state_id"

@@ -83,7 +83,7 @@ def county_to_hhs(self, data):
                                     "hhs",
                                     from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "hhs"]).sum(numeric_only=True).reset_index()

         return data.groupby("hhs"), "hhs"

@@ -100,7 +100,7 @@ def county_to_nation(self, data):
                                     "nation",
                                     from_col="PatCountyFIPS")
         data.drop(columns="PatCountyFIPS", inplace=True)
-        data = data.groupby(["ServiceDate", "nation"]).sum().reset_index()
+        data = data.groupby(["ServiceDate", "nation"]).sum(numeric_only=True).reset_index()

         return data.groupby("nation"), "nation"

diff --git a/doctor_visits/delphi_doctor_visits/sensor.py b/doctor_visits/delphi_doctor_visits/sensor.py
index e96c8bfe0..b5a645ea8 100644
--- a/doctor_visits/delphi_doctor_visits/sensor.py
+++ b/doctor_visits/delphi_doctor_visits/sensor.py
@@ -60,16 +60,17 @@ def fill_dates(y_data, dates):
         last_date = dates[-1]
         cols = y_data.columns

+        df_list = [y_data]
         if first_date not in y_data.index:
-            y_data = y_data.append(
+            df_list.append(
                 pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date])
             )

         if last_date not in y_data.index:
-            y_data = y_data.append(
+            df_list.append(
                 pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date])
             )

-        y_data.sort_index(inplace=True)
+        y_data = pd.concat(df_list).sort_index()
         y_data = y_data.asfreq("D", fill_value=0)
         return y_data

diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py
index 1bb068059..019c3f9d5 100644
--- a/doctor_visits/delphi_doctor_visits/update_sensor.py
+++ b/doctor_visits/delphi_doctor_visits/update_sensor.py
@@ -101,7 +101,7 @@ def update_sensor(
     data.dropna(inplace=True)  # drop rows with any missing entries

     # aggregate age groups (so data is unique by service date and FIPS)
-    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum().reset_index()
+    data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
     assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
     assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"

diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py
index 1ba5281cf..e9b8d24a1 100644
--- a/dsew_community_profile/delphi_dsew_community_profile/pull.py
+++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py
@@ -701,6 +701,7 @@ def generate_prop_signal(df, geo, geo_mapper):
         ).groupby(
             geo
         ).sum(
+            numeric_only=True
         ).reset_index(
         )
         df = pd.merge(df, map_df, left_on="geo_id", right_on=geo, how="inner")
diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py
index fab6bca76..e968df4d7 100644
--- a/dsew_community_profile/tests/test_pull.py
+++ b/dsew_community_profile/tests/test_pull.py
@@ -240,8 +240,8 @@ def test_nation_from_state(self):
             'sample_size': [None, None],
             'publish_date': [datetime(year=2020, month=1, day=1)]*2,})

-        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
+        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"].iloc[0])
         tot_pop = pa_pop + wv_pop

         assert True, nation_from_state(
@@ -285,7 +285,14 @@ def test_generate_prop_signal_msa(self):
         geomapper = GeoMapper()
         county_pop = geomapper.get_crosswalk("fips", "pop")
         county_msa = geomapper.get_crosswalk("fips", "msa")
-        msa_pop = county_pop.merge(county_msa, on="fips", how="inner").groupby("msa").sum().reset_index()
+        msa_pop = county_pop.merge(
+            county_msa, on="fips", how="inner"
+        ).groupby(
+            "msa"
+        ).sum(
+            numeric_only=True
+        ).reset_index(
+        )

         test_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -294,8 +301,8 @@ def test_generate_prop_signal_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})

-        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"])
-        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"])
+        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"].iloc[0])
+        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"].iloc[0])

         expected_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -342,8 +349,8 @@ def test_generate_prop_signal_non_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})

-        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"])
-        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"])
+        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"].iloc[0])
+        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"].iloc[0])

         expected_df = pd.DataFrame({
             'geo_id': settings["geo_names"],
diff --git a/hhs_hosp/tests/test_run.py b/hhs_hosp/tests/test_run.py
index 4719ce8db..a88b240be 100644
--- a/hhs_hosp/tests/test_run.py
+++ b/hhs_hosp/tests/test_run.py
@@ -100,8 +100,8 @@ def test_transform_signal_pop():
         'timestamp': [datetime(year=2020, month=1, day=1)]*2,
         'val': [15., 150.],})

-    pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-    wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+    pa_pop = int(state_pop[state_pop.state_id == "pa"]["pop"].iloc[0])
+    wv_pop = int(state_pop[state_pop.state_id == "wv"]["pop"].iloc[0])
     pd.testing.assert_frame_equal(
         transform_signal(
             CONFIRMED_PROP,
diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 55358eef2..45887041e 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -108,11 +108,19 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
         # Get mask df to ignore cells where both of them have NAN values
         mask = (df_ny[keep_columns].isnull().values \
                 & df_nyc[keep_columns].isnull().values)
-        df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+        df_ny = pd.concat(
+            [df_ny, df_nyc]
+        ).groupby(
+            "timestamp"
+        ).sum(
+            numeric_only=True
+        ).where(
+            ~mask, np.nan
+        )
         df_ny["state"] = "New York"
         # Drop NYC and NY in the full dataset
         df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
-        df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])
+        df = pd.concat([df, df_ny]).reset_index().sort_values(["state", "timestamp"])
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
index e8958ecfd..98c9f8502 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
@@ -30,14 +30,18 @@ def fill_dates(y_data, first_date, last_date):
     Returns:
         dataframe containing all dates given
     """
     cols = y_data.columns
+
+    df_list = [y_data]
     if first_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                                            columns=cols, index=[first_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[first_date])
+        )
     if last_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                                            columns=cols, index=[last_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[last_date])
+        )
-    y_data.sort_index(inplace=True)
+    y_data = pd.concat(df_list).sort_index()
     y_data = y_data.asfreq('D', fill_value=0)
     y_data.fillna(0, inplace=True)
     return y_data
diff --git a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
index 5f44a519b..757175874 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
@@ -27,8 +27,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
     Returns:
         df: pd.DataFrame
     """
-    state_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size", "timestamp"])
     state_list = list(state_groups.groups.keys())
+    df_list = []
     for state in state_list:
         state_group = state_groups.get_group(state)
         state_group = state_group.drop(columns=[res_key])
@@ -63,12 +63,15 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
             stat = stat * 100
         se = se * 100

-        state_df = state_df.append(pd.DataFrame({"geo_id": state,
-                                                 "timestamp": state_group.index,
-                                                 "val": stat,
-                                                 "se": se,
-                                                 "sample_size": sample_size}))
-    return remove_null_samples(state_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": state,
+                          "timestamp": state_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))

 def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
                                    device, first_date, last_date, suffix):
@@ -88,9 +91,9 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
         df: pd.DataFrame
     """
     has_parent = True
-    res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
     if res_key == "fips":  # Add rest-of-state report for county level
         data = add_megacounties(data, smooth)
+    df_list = []
     for loc, res_group in data.groupby(res_key):
         parent_state = res_group['state_id'].values[0]
         try:
@@ -147,9 +150,12 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
             stat = stat * 100
         se = se * 100

-        res_df = res_df.append(pd.DataFrame({"geo_id": loc,
-                                             "timestamp": res_group.index,
-                                             "val": stat,
-                                             "se": se,
-                                             "sample_size": sample_size}))
-    return remove_null_samples(res_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": loc,
+                          "timestamp": res_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))
diff --git a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
index 9cefc0f9e..d59dab692 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
@@ -88,5 +88,5 @@ def add_parent_state(data, geo_res, geo_key):
     # Merge the info of parent state to the data
     data = data.merge(mix_map, how="left", on=geo_key).drop(
         columns=["population"]).dropna()
-    data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
+    data = data.groupby(["timestamp", geo_key, "state_id"]).sum(numeric_only=True).reset_index()
     return data
diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py
index 2ac5b958f..84ae8742e 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/pull.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py
@@ -34,7 +34,6 @@ def get_from_s3(start_date, end_date, bucket, logger):
                         'State', 'Zip', 'PatientAge', 'Result1',
                         'Result2', 'OverallResult', 'StorageDate',
                         'fname']
-    df = pd.DataFrame(columns=selected_columns)
     s3_files = {}
     for obj in bucket.objects.all():
         if "-sars" in obj.key:
@@ -52,25 +51,30 @@ def get_from_s3(start_date, end_date, bucket, logger):
             s3_files[received_date].append(obj.key)

     n_days = (end_date - start_date).days + 1
+    df_list = []
+    seen_files = set()
     for search_date in [start_date + timedelta(days=x) for x in range(n_days)]:
         if search_date in s3_files.keys():
-            # Avoid appending duplicate datasets
             logger.info(f"Pulling data received on {search_date.date()}")

             # Fetch data received on the same day
             for fn in s3_files[search_date]:
+                # Skip non-CSV files, such as directories
                 if ".csv" not in fn:
-                    continue #Add to avoid that the folder name was readed as a fn.
-                if fn in set(df["fname"].values):
+                    continue
+                # Avoid appending duplicate datasets
+                if fn in seen_files:
                     continue
                 obj = bucket.Object(key=fn)
                 newdf = pd.read_csv(obj.get()["Body"],
                                     parse_dates=["StorageDate", "TestDate"],
                                     low_memory=False)
+                seen_files.add(fn)
                 newdf["fname"] = fn
-                df = df.append(newdf[selected_columns])
+                df_list.append(newdf[selected_columns])
                 time_flag = search_date
-    return df, time_flag
+
+    return pd.concat(df_list), time_flag

 def fix_zipcode(df):
     """Fix zipcode that is 9 digit instead of 5 digit."""
@@ -297,7 +301,14 @@ def pull_quidel_covidtest(params, logger):

     # Utilize previously stored data
     if previous_df is not None:
-        df = previous_df.append(df).groupby(["timestamp", "zip"]).sum().reset_index()
+        df = pd.concat(
+            [previous_df, df]
+        ).groupby(
+            ["timestamp", "zip"]
+        ).sum(
+            numeric_only=True
+        ).reset_index(
+        )
     return df, _end_date

 def check_export_end_date(input_export_end_date, _end_date,