diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index a35f8c142..270d7d5ba 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.35
+current_version = 0.3.36
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg
index 4377a24ba..001ce163e 100644
--- a/_delphi_utils_python/.bumpversion.cfg
+++ b/_delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.12
+current_version = 0.3.13
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False
diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py
index e1433582d..d9ef74919 100644
--- a/_delphi_utils_python/delphi_utils/__init__.py
+++ b/_delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday
 
-__version__ = "0.3.12"
+__version__ = "0.3.13"
diff --git a/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py b/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
index 373b486d6..660fca042 100644
--- a/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
+++ b/_delphi_utils_python/delphi_utils/flash_eval/eval_day.py
@@ -147,7 +147,7 @@ def output(evd_ranking, day, lag, signal, logger):
     """
     starter_link = f"{HTML_LINK}{(day+pd.Timedelta(f'{lag}d')).strftime('%Y-%m_%d')}"
     p_text = ""
-    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).iteritems()):
+    for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
         if j < 30:
             start_link = f"{starter_link},{day.strftime('%Y-%m_%d')},{index}"
             p_text += f"\t{start_link}|*{index}*, {'{:.2f}'.format(value)}>\n"
diff --git a/_delphi_utils_python/delphi_utils/geomap.py b/_delphi_utils_python/delphi_utils/geomap.py
index d5446d1ea..4782798a0 100644
--- a/_delphi_utils_python/delphi_utils/geomap.py
+++ b/_delphi_utils_python/delphi_utils/geomap.py
@@ -401,9 +401,9 @@ def replace_geocode(
             df.drop("weight", axis=1, inplace=True)
 
         if not date_col is None:
-            df = df.groupby([date_col, new_col]).sum().reset_index()
+            df = df.groupby([date_col, new_col]).sum(numeric_only=True).reset_index()
         else:
-            df = df.groupby([new_col]).sum().reset_index()
+            df = df.groupby([new_col]).sum(numeric_only=True).reset_index()
         return df
 
     def add_population_column(self, data, geocode_type, geocode_col=None, dropna=True):
@@ -501,7 +501,7 @@ def fips_to_megacounty(
         )
         data.set_index([fips_col, date_col], inplace=True)
         data = data.join(mega_data)
-        data = data.reset_index().groupby([date_col, mega_col]).sum()
+        data = data.reset_index().groupby([date_col, mega_col]).sum(numeric_only=True)
         return data.reset_index()
 
     def as_mapper_name(self, geo_type, state="state_id"):
diff --git a/_delphi_utils_python/delphi_utils/validator/dynamic.py b/_delphi_utils_python/delphi_utils/validator/dynamic.py
index 7e3524779..6beb5712b 100644
--- a/_delphi_utils_python/delphi_utils/validator/dynamic.py
+++ b/_delphi_utils_python/delphi_utils/validator/dynamic.py
@@ -195,7 +195,7 @@ def replace_first_six(df, start_date):
                 start_date = self.params.time_window.start_date)
 
             if not error_df.empty:
-                for index, value in error_df.iteritems():
+                for index, value in error_df.items():
                     report.add_raised_error(
                         ValidationFailure("check_val_missing",
                                           geo_type=geo_type,
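The `iteritems()` → `items()` renames above are forced by the pandas unpinning later in this patch: `Series.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0, and `Series.items()` yields the same `(index, value)` pairs. A minimal sketch of the replacement, using a made-up series:

import pandas as pd

evd_ranking = pd.Series({"sig_a": 3.2, "sig_b": 1.7})  # toy stand-in

# Series.iteritems() is gone in pandas 2.0; Series.items() is the
# drop-in replacement and yields the same (index, value) pairs.
for j, (index, value) in enumerate(evd_ranking.sort_values(ascending=False).items()):
    print(j, index, value)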
diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py
index 2cc6a2cb0..9aaa1f6d6 100644
--- a/_delphi_utils_python/setup.py
+++ b/_delphi_utils_python/setup.py
@@ -14,7 +14,7 @@
     "mock",
     "moto",
     "numpy",
-    "pandas>=1.1.0,<2",
+    "pandas>=1.1.0",
     "pydocstyle",
     "pylint==2.8.3",
     "pytest",
@@ -26,7 +26,7 @@
 
 setup(
     name="delphi_utils",
-    version="0.3.12",
+    version="0.3.13",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py
index 6076f362d..04cd2f6ce 100644
--- a/_delphi_utils_python/tests/test_export.py
+++ b/_delphi_utils_python/tests/test_export.py
@@ -250,15 +250,15 @@ def test_export_with_null_removal(self):
         """Test that `remove_null_samples = True` removes entries with null samples."""
         _clean_directory(self.TEST_DIR)
 
-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+            pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+            }, index = [0])]
         )
 
         create_export_csv(
@@ -283,15 +283,15 @@ def test_export_without_null_removal(self):
         """Test that `remove_null_samples = False` does not remove entries with null samples."""
         _clean_directory(self.TEST_DIR)
 
-        df_with_nulls = self.DF.copy().append(
-            {
+        df_with_nulls = pd.concat(
+            [self.DF.copy(),
+            pd.DataFrame({
                 "geo_id": "66666",
                 "timestamp": datetime(2020, 6, 6),
                 "val": 10,
                 "se": 0.2,
                 "sample_size": pd.NA,
-            },
-            ignore_index=True,
+            }, index = [0])]
         )
 
         create_export_csv(
diff --git a/_delphi_utils_python/tests/test_geomap.py b/_delphi_utils_python/tests/test_geomap.py
index d10b4d493..78fccca77 100644
--- a/_delphi_utils_python/tests/test_geomap.py
+++ b/_delphi_utils_python/tests/test_geomap.py
@@ -196,7 +196,7 @@ def test_load_fips_chngfips_table(self, geomapper):
 
     def test_load_jhu_uid_fips_table(self, geomapper):
         jhu_data = geomapper.get_crosswalk(from_code="jhu_uid", to_code="fips")
-        assert np.allclose(jhu_data.groupby("jhu_uid").sum(), 1.0)
+        assert np.allclose(jhu_data.groupby("jhu_uid").sum(numeric_only=True), 1.0)
 
     def test_load_zip_hrr_table(self, geomapper):
         zip_data = geomapper.get_crosswalk(from_code="zip", to_code="hrr")
diff --git a/_delphi_utils_python/tests/validator/test_dynamic.py b/_delphi_utils_python/tests/validator/test_dynamic.py
index 248e1b2aa..45d5a15d0 100644
--- a/_delphi_utils_python/tests/validator/test_dynamic.py
+++ b/_delphi_utils_python/tests/validator/test_dynamic.py
@@ -48,7 +48,7 @@ def test_half_padding(self):
             ref_df, test_df, ref_date, ref_date)
 
         # Check it only takes missing dates - so the last 5 dates
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-11",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-11",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 11
         assert new_ref_df["val"].iloc[5] == 2
@@ -71,7 +71,7 @@ def test_full_padding(self):
             ref_df, test_df, ref_date, ref_date)
 
         # Check it only takes missing dates up to the day before the reference
-        assert new_ref_df.time_value.max() == datetime.strptime("2021-01-15",
+        assert new_ref_df.time_value.max().date() == datetime.strptime("2021-01-15",
                                                                 "%Y-%m-%d").date()
         assert new_ref_df.shape[0] == 15
         assert new_ref_df["val"].iloc[5] == 2
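`DataFrame.append` was likewise deprecated in pandas 1.4 and removed in 2.0, which is why the tests above now wrap the new row in a one-row `DataFrame` and use `pd.concat`. A minimal sketch of the pattern, with toy data:

import pandas as pd

df = pd.DataFrame({"geo_id": ["11111"], "val": [1.0]})

# Before (removed in pandas 2.0):
#   df = df.append({"geo_id": "66666", "val": 10.0}, ignore_index=True)
# After: wrap the new row in a one-row DataFrame and concatenate.
new_row = pd.DataFrame({"geo_id": "66666", "val": 10.0}, index=[0])
df = pd.concat([df, new_row], ignore_index=True)
print(df)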
diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2
index 9aa6506a6..65171a230 100644
--- a/ansible/templates/sir_complainsalot-params-prod.json.j2
+++ b/ansible/templates/sir_complainsalot-params-prod.json.j2
@@ -32,11 +32,14 @@
       "max_age":6,
       "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
       "retired-signals": [
-        "raw_pct_negative","smoothed_pct_negative","raw_tests_per_device","smoothed_tests_per_device",
-        ["covid_ag_raw_pct_positive_age_0_4", "hrr"], ["covid_ag_raw_pct_positive_age_0_4", "msa"],
-        ["covid_ag_raw_pct_positive_age_5_17", "hrr"], ["covid_ag_raw_pct_positive_age_5_17", "msa"],
-        ["covid_ag_raw_pct_positive_age_50_64", "hrr"], ["covid_ag_raw_pct_positive_age_50_64", "msa"],
-        ["covid_ag_raw_pct_positive_age_65plus", "hrr"], ["covid_ag_raw_pct_positive_age_65plus", "msa"]
+        "raw_pct_negative", "smoothed_pct_negative",
+        "raw_tests_per_device", "smoothed_tests_per_device",
+        "covid_ag_raw_pct_positive_age_0_4", "covid_ag_smoothed_pct_positive_age_0_4",
+        "covid_ag_raw_pct_positive_age_5_17", "covid_ag_smoothed_pct_positive_age_5_17",
+        "covid_ag_raw_pct_positive_age_18_49", "covid_ag_smoothed_pct_positive_age_18_49",
+        "covid_ag_raw_pct_positive_age_50_64", "covid_ag_smoothed_pct_positive_age_50_64",
+        "covid_ag_raw_pct_positive_age_65plus", "covid_ag_smoothed_pct_positive_age_65plus",
+        "covid_ag_raw_pct_positive_age_0_17", "covid_ag_smoothed_pct_positive_age_0_17"
       ]
     },
     "nchs-mortality": {
diff --git a/backfill_corrections/Dockerfile b/backfill_corrections/Dockerfile
index e88267b27..76f115c46 100644
--- a/backfill_corrections/Dockerfile
+++ b/backfill_corrections/Dockerfile
@@ -26,6 +26,7 @@ RUN install2.r --error \
 
 RUN --mount=type=secret,id=GITHUB_TOKEN \
     export GITHUB_PAT="$(cat /run/secrets/GITHUB_TOKEN)" && \
+    R -e 'devtools::install_version("bettermc", version = "1.1.2")' && \
    R -e 'devtools::install_github("cmu-delphi/covidcast", ref = "evalcast", subdir = "R-packages/evalcast")' && \
    R -e 'devtools::install_github(repo="ryantibs/quantgen", subdir="quantgen")' && \
    R -e 'install.packages(list.files(path="/opt/gurobi/linux64/R/", pattern="^gurobi_.*[.]tar[.]gz$", full.names = TRUE), repos=NULL)'
diff --git a/backfill_corrections/Makefile b/backfill_corrections/Makefile
index e58418dbc..68abff228 100644
--- a/backfill_corrections/Makefile
+++ b/backfill_corrections/Makefile
@@ -67,6 +67,8 @@ run-local: setup-dirs
 	grep "backfill correction completed successfully" $(LOG_FILE)
 	grep "scheduled core" $(LOG_FILE) ; \
 	[ "$$?" -eq 1 ]
+	grep "SIGBUS" $(LOG_FILE) ; \
+	[ "$$?" -eq 1 ]
 
 gurobi.lic:
 	@echo WLSACCESSID=$(GRB_WLSACCESSID) >> $(GRB_LICENSE_FILE)
@@ -81,6 +83,7 @@ run:
 		-v "`realpath $(USR_CACHE_DIR)`:/backfill_corrections/${CACHE_DIR}" \
 		-v "${PWD}"/params.json:/backfill_corrections/params.host.json \
 		--env GRB_LICENSE_FILE=$(GRB_LICENSE_FILE) \
+		--shm-size=2gb \
 		-it "${DOCKER_IMAGE}:${DOCKER_TAG}" \
 		/bin/bash -c "cp params.host.json params.json && make gurobi.lic && make standardize-dirs && make run-local OPTIONS=\"${OPTIONS}\" LOG_FILE=${LOG_FILE}"
@@ -124,7 +127,7 @@ standardize-dirs:
 	$(PYTHON) -m delphi_utils set export_dir $(EXPORT_DIR)
 
 clean:
-	rm -f $(USR_EXPORT_DIR)/*.csv.gz
+	rm -rf $(USR_EXPORT_DIR)/*
 
 coverage:
 	Rscript -e 'covr::package_coverage("delphiBackfillCorrection")'
diff --git a/backfill_corrections/delphiBackfillCorrection/NAMESPACE b/backfill_corrections/delphiBackfillCorrection/NAMESPACE
index 665c41e49..4c5b24a18 100644
--- a/backfill_corrections/delphiBackfillCorrection/NAMESPACE
+++ b/backfill_corrections/delphiBackfillCorrection/NAMESPACE
@@ -30,6 +30,7 @@ importFrom(dplyr,arrange)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,desc)
+importFrom(dplyr,distinct)
 importFrom(dplyr,everything)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
diff --git a/backfill_corrections/delphiBackfillCorrection/R/main.R b/backfill_corrections/delphiBackfillCorrection/R/main.R
index b9b0fc892..69d37fa67 100644
--- a/backfill_corrections/delphiBackfillCorrection/R/main.R
+++ b/backfill_corrections/delphiBackfillCorrection/R/main.R
@@ -18,8 +18,6 @@ run_backfill <- function(df, params,
                          refd_col = "time_value", lag_col = "lag", issued_col = "issue_date",
                          signal_suffixes = c(""), indicator = "", signal = "") {
-  df <- filter(df, lag < params$ref_lag + 30) # a rough filtration to save memory
-
   geo_levels <- params$geo_levels
   if ("state" %in% geo_levels) {
     # If state included, do it last since state processing modifies the
@@ -62,6 +60,7 @@ run_backfill <- function(df, params,
 
   msg_ts("Splitting data into geo groups")
   group_dfs <- group_split(df, geo_value)
+  msg_ts("Beginning training and/or testing...")
 
   # Build model for each location
   apply_fn <- ifelse(params$parallel, mclapply, lapply)
   result <- apply_fn(group_dfs, function(subdf) {
@@ -317,14 +316,12 @@ main <- function(params,
       msg_ts("Reading in and combining associated files")
       input_data <- lapply(
-        files_list,
-        function(file) {
-          # refd_col and issued_col read in as strings
-          read_data(file) %>%
-            fips_to_geovalue()
-        }
+        files_list, read_data # refd_col and issued_col read in as strings
       ) %>%
-        bind_rows()
+        bind_rows() %>%
+        fips_to_geovalue() %>%
+        # a rough filter to save memory
+        filter(lag < params$ref_lag + 30)
 
       if (nrow(input_data) == 0) {
         warning("No data available for indicator ", input_group$indicator,
diff --git a/backfill_corrections/delphiBackfillCorrection/R/utils.R b/backfill_corrections/delphiBackfillCorrection/R/utils.R
index 637bd9413..c50a6337c 100644
--- a/backfill_corrections/delphiBackfillCorrection/R/utils.R
+++ b/backfill_corrections/delphiBackfillCorrection/R/utils.R
@@ -169,6 +169,8 @@ create_dir_not_exist <- function(path)
 #' @return list of input dataframe augmented with lag column, if it
 #'   didn't already exist, and character vector of one or two value
 #'   column names, depending on requested `value_type`
+#'
+#' @importFrom dplyr distinct across
 validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes,
                             refd_col = "time_value", lag_col = "lag", issued_col = "issue_date") {
   if (!missing(signal_suffixes) && !is.na(signal_suffixes) && !all(signal_suffixes == "") &&
       !all(is.na(signal_suffixes))) {
@@ -205,13 +207,16 @@ validity_checks <- function(df, value_types, num_col, denom_col, signal_suffixes
   }
 
   # Drop duplicate rows.
-  duplicate_i <- duplicated(df)
-  if (any(duplicate_i)) {
+  raw_df_rows <- nrow(df)
+  df <- distinct(df)
+  new_df_rows <- nrow(df)
+  if (raw_df_rows != new_df_rows) {
     warning("Data contains duplicate rows, dropping")
-    df <- df[!duplicate_i,]
   }
 
-  if (anyDuplicated(df[, c(refd_col, issued_col, "geo_value", "state_id")])) {
+  if (new_df_rows != nrow(
+    distinct(df, across(c(refd_col, issued_col, "geo_value", "state_id")))
+  )) {
     stop("Data contains multiple entries with differing values for at",
          " least one reference date-issue date-location combination")
   }
diff --git a/changehc/delphi_changehc/load_data.py b/changehc/delphi_changehc/load_data.py
index c22c2483a..c4a5f1e9c 100644
--- a/changehc/delphi_changehc/load_data.py
+++ b/changehc/delphi_changehc/load_data.py
@@ -71,7 +71,7 @@ def load_chng_data(filepath, dropdate, base_geo,
     ), "Counts must be nonnegative"
 
     # aggregate age groups (so data is unique by date and base geography)
-    data = data.groupby([base_geo, Config.DATE_COL]).sum()
+    data = data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     data.dropna(inplace=True)  # drop rows with any missing entries
 
     return data
diff --git a/changehc/tests/test_update_sensor.py b/changehc/tests/test_update_sensor.py
index 318437b92..999fed7e8 100644
--- a/changehc/tests/test_update_sensor.py
+++ b/changehc/tests/test_update_sensor.py
@@ -91,7 +91,7 @@ def test_geo_reindex(self):
             "timestamp": [pd.Timestamp(f'03-{i}-2020') for i in range(1, 14)]})
         data_frame = su_inst.geo_reindex(test_data)
         assert data_frame.shape[0] == multiple*len(su_inst.fit_dates)
-        assert (data_frame.sum() == (4200,19000)).all()
+        assert (data_frame.sum(numeric_only=True) == (4200,19000)).all()
 
     def test_update_sensor(self):
         """Tests that the sensors are properly updated."""
diff --git a/changehc/version.cfg b/changehc/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/changehc/version.cfg
+++ b/changehc/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/claims_hosp/delphi_claims_hosp/load_data.py b/claims_hosp/delphi_claims_hosp/load_data.py
index c2ee07e74..010d9d61b 100644
--- a/claims_hosp/delphi_claims_hosp/load_data.py
+++ b/claims_hosp/delphi_claims_hosp/load_data.py
@@ -47,7 +47,7 @@ def load_claims_data(claims_filepath, dropdate, base_geo):
     ), "Claims counts must be nonnegative"
 
     # aggregate age groups (so data is unique by date and base geography)
-    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum()
+    claims_data = claims_data.groupby([base_geo, Config.DATE_COL]).sum(numeric_only=True)
     claims_data.dropna(inplace=True)  # drop rows with any missing entries
 
     return claims_data
diff --git a/claims_hosp/version.cfg b/claims_hosp/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/claims_hosp/version.cfg
+++ b/claims_hosp/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
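The recurring `sum(numeric_only=True)` edits (here and in `geomap.py` earlier) pin down a pandas 2.0 behavior change: `GroupBy.sum()` no longer silently drops non-numeric columns, it now tries to aggregate them too, concatenating string columns and raising for dtypes without a sum. Passing `numeric_only=True` keeps the old result explicitly. A toy illustration:

import pandas as pd

data = pd.DataFrame({
    "geo_id": ["pa", "pa", "wv"],
    "age_group": ["0-17", "18+", "0-17"],  # non-numeric column
    "count": [1, 2, 3],
})

# pandas < 2.0 silently dropped "age_group" here; pandas >= 2.0 would
# "sum" it as well (string concatenation), changing the result schema.
# numeric_only=True restores the old behavior explicitly.
print(data.groupby("geo_id").sum(numeric_only=True))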
"cbsa_id"]).sum().reset_index() + data = data.groupby(["ServiceDate", "cbsa_id"]).sum(numeric_only=True).reset_index() return data.groupby("cbsa_id"), "cbsa_id" @@ -66,7 +66,7 @@ def county_to_state(self, data): "state_id", from_col="PatCountyFIPS") data.drop(columns="PatCountyFIPS", inplace=True) - data = data.groupby(["ServiceDate", "state_id"]).sum().reset_index() + data = data.groupby(["ServiceDate", "state_id"]).sum(numeric_only=True).reset_index() return data.groupby("state_id"), "state_id" @@ -83,7 +83,7 @@ def county_to_hhs(self, data): "hhs", from_col="PatCountyFIPS") data.drop(columns="PatCountyFIPS", inplace=True) - data = data.groupby(["ServiceDate", "hhs"]).sum().reset_index() + data = data.groupby(["ServiceDate", "hhs"]).sum(numeric_only=True).reset_index() return data.groupby("hhs"), "hhs" @@ -100,7 +100,7 @@ def county_to_nation(self, data): "nation", from_col="PatCountyFIPS") data.drop(columns="PatCountyFIPS", inplace=True) - data = data.groupby(["ServiceDate", "nation"]).sum().reset_index() + data = data.groupby(["ServiceDate", "nation"]).sum(numeric_only=True).reset_index() return data.groupby("nation"), "nation" diff --git a/doctor_visits/delphi_doctor_visits/sensor.py b/doctor_visits/delphi_doctor_visits/sensor.py index e96c8bfe0..b5a645ea8 100644 --- a/doctor_visits/delphi_doctor_visits/sensor.py +++ b/doctor_visits/delphi_doctor_visits/sensor.py @@ -60,16 +60,17 @@ def fill_dates(y_data, dates): last_date = dates[-1] cols = y_data.columns + df_list = [y_data] if first_date not in y_data.index: - y_data = y_data.append( + df_list.append( pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[first_date]) ) if last_date not in y_data.index: - y_data = y_data.append( + df_list.append( pd.DataFrame(dict.fromkeys(cols, 0.0), columns=cols, index=[last_date]) ) - y_data.sort_index(inplace=True) + y_data = pd.concat(df_list).sort_index() y_data = y_data.asfreq("D", fill_value=0) return y_data diff --git a/doctor_visits/delphi_doctor_visits/update_sensor.py b/doctor_visits/delphi_doctor_visits/update_sensor.py index 1bb068059..019c3f9d5 100644 --- a/doctor_visits/delphi_doctor_visits/update_sensor.py +++ b/doctor_visits/delphi_doctor_visits/update_sensor.py @@ -101,7 +101,7 @@ def update_sensor( data.dropna(inplace=True) # drop rows with any missing entries # aggregate age groups (so data is unique by service date and FIPS) - data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum().reset_index() + data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index() assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation" assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative" diff --git a/doctor_visits/version.cfg b/doctor_visits/version.cfg index 9a65f3745..c771cf15a 100644 --- a/doctor_visits/version.cfg +++ b/doctor_visits/version.cfg @@ -1 +1 @@ -current_version = 0.3.35 +current_version = 0.3.36 diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 1ba5281cf..e9b8d24a1 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -701,6 +701,7 @@ def generate_prop_signal(df, geo, geo_mapper): ).groupby( geo ).sum( + numeric_only=True ).reset_index( ) df = pd.merge(df, map_df, left_on="geo_id", right_on=geo, how="inner") diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 
diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py
index fab6bca76..e968df4d7 100644
--- a/dsew_community_profile/tests/test_pull.py
+++ b/dsew_community_profile/tests/test_pull.py
@@ -240,8 +240,8 @@ def test_nation_from_state(self):
             'sample_size': [None, None],
             'publish_date': [datetime(year=2020, month=1, day=1)]*2,})
 
-        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+        pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
+        wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"].iloc[0])
         tot_pop = pa_pop + wv_pop
 
         assert True, nation_from_state(
@@ -285,7 +285,14 @@ def test_generate_prop_signal_msa(self):
         geomapper = GeoMapper()
         county_pop = geomapper.get_crosswalk("fips", "pop")
         county_msa = geomapper.get_crosswalk("fips", "msa")
-        msa_pop = county_pop.merge(county_msa, on="fips", how="inner").groupby("msa").sum().reset_index()
+        msa_pop = county_pop.merge(
+            county_msa, on="fips", how="inner"
+        ).groupby(
+            "msa"
+        ).sum(
+            numeric_only=True
+        ).reset_index(
+        )
 
         test_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -294,8 +301,8 @@ def test_generate_prop_signal_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})
 
-        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"])
-        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"])
+        nyc_pop = int(msa_pop.loc[msa_pop.msa == "35620", "pop"].iloc[0])
+        la_pop = int(msa_pop.loc[msa_pop.msa == "31080", "pop"].iloc[0])
 
         expected_df = pd.DataFrame({
             'geo_id': ['35620', '31080'],
@@ -342,8 +349,8 @@ def test_generate_prop_signal_non_msa(self):
             'se': [None, None],
             'sample_size': [None, None],})
 
-        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"])
-        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"])
+        pop1 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][0], "pop"].iloc[0])
+        pop2 = int(geo_pop.loc[geo_pop[settings["code_name"]] == settings["geo_names"][1], "pop"].iloc[0])
 
         expected_df = pd.DataFrame({
             'geo_id': settings["geo_names"],
diff --git a/dsew_community_profile/version.cfg b/dsew_community_profile/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/dsew_community_profile/version.cfg
+++ b/dsew_community_profile/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/google_symptoms/version.cfg b/google_symptoms/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/google_symptoms/version.cfg
+++ b/google_symptoms/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/hhs_hosp/tests/test_run.py b/hhs_hosp/tests/test_run.py
index 4719ce8db..a88b240be 100644
--- a/hhs_hosp/tests/test_run.py
+++ b/hhs_hosp/tests/test_run.py
@@ -100,8 +100,8 @@ def test_transform_signal_pop():
         'timestamp': [datetime(year=2020, month=1, day=1)]*2,
         'val': [15., 150.],})
 
-    pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"])
-    wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"])
+    pa_pop = int(state_pop[state_pop.state_id == "pa"]["pop"].iloc[0])
+    wv_pop = int(state_pop[state_pop.state_id == "wv"]["pop"].iloc[0])
     pd.testing.assert_frame_equal(
         transform_signal(
             CONFIRMED_PROP,
diff --git a/hhs_hosp/version.cfg b/hhs_hosp/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/hhs_hosp/version.cfg
+++ b/hhs_hosp/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
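The `.iloc[0]` additions above stop relying on `int()` to unwrap a one-element `Series`, an implicit conversion that newer pandas deprecates; selecting the scalar first makes the intent explicit. A sketch with made-up population numbers:

import pandas as pd

# Toy populations, for illustration only.
state_pop = pd.DataFrame({"state_id": ["pa", "wv"], "pop": [13000000, 1800000]})

# int(one_element_series) depends on implicit unwrapping; select the
# scalar explicitly with .iloc[0], then convert.
pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"].iloc[0])
print(pa_pop)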
diff --git a/nchs_mortality/delphi_nchs_mortality/pull.py b/nchs_mortality/delphi_nchs_mortality/pull.py
index 55358eef2..45887041e 100644
--- a/nchs_mortality/delphi_nchs_mortality/pull.py
+++ b/nchs_mortality/delphi_nchs_mortality/pull.py
@@ -108,11 +108,19 @@ def pull_nchs_mortality_data(token: str, test_file: Optional[str]=None):
     # Get mask df to ignore cells where both of them have NAN values
     mask = (df_ny[keep_columns].isnull().values \
             & df_nyc[keep_columns].isnull().values)
-    df_ny = df_ny.append(df_nyc).groupby("timestamp").sum().where(~mask, np.nan)
+    df_ny = pd.concat(
+        [df_ny, df_nyc]
+    ).groupby(
+        "timestamp"
+    ).sum(
+        numeric_only=True
+    ).where(
+        ~mask, np.nan
+    )
     df_ny["state"] = "New York"
     # Drop NYC and NY in the full dataset
     df = df.loc[~df["state"].isin(["New York", "New York City"]), :]
-    df = df.append(df_ny).reset_index().sort_values(["state", "timestamp"])
+    df = pd.concat([df, df_ny]).reset_index().sort_values(["state", "timestamp"])
     # Add population info
     keep_columns.extend(["timestamp", "geo_id", "population"])
     gmpr = GeoMapper()
diff --git a/nchs_mortality/version.cfg b/nchs_mortality/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/nchs_mortality/version.cfg
+++ b/nchs_mortality/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/nowcast/version.cfg b/nowcast/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/nowcast/version.cfg
+++ b/nowcast/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
index e8958ecfd..98c9f8502 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/data_tools.py
@@ -30,14 +30,18 @@ def fill_dates(y_data, first_date, last_date):
     Returns: dataframe containing all dates given
     """
     cols = y_data.columns
+
+    df_list = [y_data]
     if first_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                               columns=cols, index=[first_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[first_date])
+        )
     if last_date not in y_data.index:
-        y_data = y_data.append(pd.DataFrame(dict.fromkeys(cols, 0.),
-                               columns=cols, index=[last_date]))
+        df_list.append(
+            pd.DataFrame(dict.fromkeys(cols, 0.), columns=cols, index=[last_date])
+        )
 
-    y_data.sort_index(inplace=True)
+    y_data = pd.concat(df_list).sort_index()
     y_data = y_data.asfreq('D', fill_value=0)
     y_data.fillna(0, inplace=True)
     return y_data
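One non-mechanical change above, in `nchs_mortality/pull.py`: summing the NY and NYC frames would turn cells that are NaN in *both* inputs into 0, so the code keeps a mask of those cells and restores NaN afterwards with `.where(~mask, np.nan)`. A toy sketch of the trick (made-up column and dates):

import numpy as np
import pandas as pd

df_ny = pd.DataFrame({"deaths": [1.0, np.nan]}, index=["2020-01", "2020-02"])
df_nyc = pd.DataFrame({"deaths": [2.0, np.nan]}, index=["2020-01", "2020-02"])

# sum() treats NaN as 0, so a cell that is NaN in both inputs would
# come out as 0; remember those cells and put NaN back afterwards.
mask = df_ny.isnull().values & df_nyc.isnull().values
total = pd.concat([df_ny, df_nyc]).groupby(level=0).sum().where(~mask, np.nan)
print(total)  # 2020-01 -> 3.0, 2020-02 -> NaN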
diff --git a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
index 5f44a519b..757175874 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/generate_sensor.py
@@ -27,8 +27,8 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
     Returns:
         df: pd.DataFrame
     """
-    state_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size", "timestamp"])
     state_list = list(state_groups.groups.keys())
+    df_list = []
     for state in state_list:
         state_group = state_groups.get_group(state)
         state_group = state_group.drop(columns=[res_key])
@@ -63,12 +63,15 @@ def generate_sensor_for_nonparent_geo(state_groups, res_key, smooth, device,
             stat = stat * 100
             se = se * 100
 
-        state_df = state_df.append(pd.DataFrame({"geo_id": state,
-                                                 "timestamp": state_group.index,
-                                                 "val": stat,
-                                                 "se": se,
-                                                 "sample_size": sample_size}))
-    return remove_null_samples(state_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": state,
+                          "timestamp": state_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))
 
 def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
                                    device, first_date, last_date, suffix):
@@ -88,9 +91,9 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
         df: pd.DataFrame
     """
     has_parent = True
-    res_df = pd.DataFrame(columns=["geo_id", "val", "se", "sample_size"])
     if res_key == "fips":  # Add rest-of-state report for county level
         data = add_megacounties(data, smooth)
+    df_list = []
     for loc, res_group in data.groupby(res_key):
         parent_state = res_group['state_id'].values[0]
         try:
@@ -147,9 +150,12 @@ def generate_sensor_for_parent_geo(state_groups, data, res_key, smooth,
             stat = stat * 100
             se = se * 100
 
-        res_df = res_df.append(pd.DataFrame({"geo_id": loc,
-                                             "timestamp": res_group.index,
-                                             "val": stat,
-                                             "se": se,
-                                             "sample_size": sample_size}))
-    return remove_null_samples(res_df)
+        df_list.append(
+            pd.DataFrame({"geo_id": loc,
+                          "timestamp": res_group.index,
+                          "val": stat,
+                          "se": se,
+                          "sample_size": sample_size})
+        )
+
+    return remove_null_samples(pd.concat(df_list))
diff --git a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
index 9cefc0f9e..d59dab692 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/geo_maps.py
@@ -88,5 +88,5 @@ def add_parent_state(data, geo_res, geo_key):
     # Merge the info of parent state to the data
     data = data.merge(mix_map, how="left", on=geo_key).drop(
         columns=["population"]).dropna()
-    data = data.groupby(["timestamp", geo_key, "state_id"]).sum().reset_index()
+    data = data.groupby(["timestamp", geo_key, "state_id"]).sum(numeric_only=True).reset_index()
     return data
diff --git a/quidel_covidtest/delphi_quidel_covidtest/pull.py b/quidel_covidtest/delphi_quidel_covidtest/pull.py
index 2ac5b958f..84ae8742e 100644
--- a/quidel_covidtest/delphi_quidel_covidtest/pull.py
+++ b/quidel_covidtest/delphi_quidel_covidtest/pull.py
@@ -34,7 +34,6 @@ def get_from_s3(start_date, end_date, bucket, logger):
         'State', 'Zip', 'PatientAge', 'Result1',
         'Result2', 'OverallResult', 'StorageDate',
         'fname']
-    df = pd.DataFrame(columns=selected_columns)
     s3_files = {}
     for obj in bucket.objects.all():
         if "-sars" in obj.key:
@@ -52,25 +51,30 @@ def get_from_s3(start_date, end_date, bucket, logger):
             s3_files[received_date].append(obj.key)
 
     n_days = (end_date - start_date).days + 1
+    df_list = []
+    seen_files = set()
     for search_date in [start_date + timedelta(days=x) for x in range(n_days)]:
         if search_date in s3_files.keys():
-            # Avoid appending duplicate datasets
             logger.info(f"Pulling data received on {search_date.date()}")
 
             # Fetch data received on the same day
             for fn in s3_files[search_date]:
+                # Skip non-CSV files, such as directories
                 if ".csv" not in fn:
-                    continue #Add to avoid that the folder name was readed as a fn.
-                if fn in set(df["fname"].values):
+                    continue
+                # Avoid appending duplicate datasets
+                if fn in seen_files:
                     continue
                 obj = bucket.Object(key=fn)
                 newdf = pd.read_csv(obj.get()["Body"],
                                     parse_dates=["StorageDate", "TestDate"],
                                     low_memory=False)
+                seen_files.add(fn)
                 newdf["fname"] = fn
-                df = df.append(newdf[selected_columns])
+                df_list.append(newdf[selected_columns])
                 time_flag = search_date
-    return df, time_flag
+
+    return pd.concat(df_list), time_flag
 
 def fix_zipcode(df):
     """Fix zipcode that is 9 digit instead of 5 digit."""
@@ -297,7 +301,14 @@ def pull_quidel_covidtest(params, logger):
 
     # Utilize previously stored data
     if previous_df is not None:
-        df = previous_df.append(df).groupby(["timestamp", "zip"]).sum().reset_index()
+        df = pd.concat(
+            [previous_df, df]
+        ).groupby(
+            ["timestamp", "zip"]
+        ).sum(
+            numeric_only=True
+        ).reset_index(
+        )
 
     return df, _end_date
 
 def check_export_end_date(input_export_end_date, _end_date,
diff --git a/quidel_covidtest/version.cfg b/quidel_covidtest/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/quidel_covidtest/version.cfg
+++ b/quidel_covidtest/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template
index 27604806d..b6c7f885f 100644
--- a/sir_complainsalot/params.json.template
+++ b/sir_complainsalot/params.json.template
@@ -32,11 +32,14 @@
       "max_age":6,
       "maintainers": ["U01AP8GSWG3","U01069KCRS7"],
      "retired-signals": [
-        "raw_pct_negative","smoothed_pct_negative","raw_tests_per_device","smoothed_tests_per_device",
-        ["covid_ag_raw_pct_positive_age_0_4", "hrr"], ["covid_ag_raw_pct_positive_age_0_4", "msa"],
-        ["covid_ag_raw_pct_positive_age_5_17", "hrr"], ["covid_ag_raw_pct_positive_age_5_17", "msa"],
-        ["covid_ag_raw_pct_positive_age_50_64", "hrr"], ["covid_ag_raw_pct_positive_age_50_64", "msa"],
-        ["covid_ag_raw_pct_positive_age_65plus", "hrr"], ["covid_ag_raw_pct_positive_age_65plus", "msa"]
+        "raw_pct_negative", "smoothed_pct_negative",
+        "raw_tests_per_device", "smoothed_tests_per_device",
+        "covid_ag_raw_pct_positive_age_0_4", "covid_ag_smoothed_pct_positive_age_0_4",
+        "covid_ag_raw_pct_positive_age_5_17", "covid_ag_smoothed_pct_positive_age_5_17",
+        "covid_ag_raw_pct_positive_age_18_49", "covid_ag_smoothed_pct_positive_age_18_49",
+        "covid_ag_raw_pct_positive_age_50_64", "covid_ag_smoothed_pct_positive_age_50_64",
+        "covid_ag_raw_pct_positive_age_65plus", "covid_ag_smoothed_pct_positive_age_65plus",
+        "covid_ag_raw_pct_positive_age_0_17", "covid_ag_smoothed_pct_positive_age_0_17"
       ]
     },
     "nchs-mortality": {
diff --git a/sir_complainsalot/version.cfg b/sir_complainsalot/version.cfg
index 9a65f3745..c771cf15a 100644
--- a/sir_complainsalot/version.cfg
+++ b/sir_complainsalot/version.cfg
@@ -1 +1 @@
-current_version = 0.3.35
+current_version = 0.3.36
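A closing note on the quidel refactors: rather than growing a `DataFrame` with `append` inside a loop, which copies the whole frame on every iteration and is gone in pandas 2.0, the code now collects the per-group frames in a plain list and concatenates once. The pattern in miniature, with placeholder data:

import pandas as pd

df_list = []
for state in ["pa", "wv"]:
    # Each iteration produces one small frame; collecting them in a
    # list and concatenating once avoids repeated whole-frame copies.
    df_list.append(pd.DataFrame({"geo_id": state, "val": [1.0, 2.0]}))

result = pd.concat(df_list, ignore_index=True)
print(result)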