diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 9c897f41f..a0ea8cd83 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.6
+current_version = 0.3.7
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg
index b0c60af24..8061dd5cf 100644
--- a/_delphi_utils_python/.bumpversion.cfg
+++ b/_delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2
+current_version = 0.3.3
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False
diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py
index 35d552179..7374a0605 100644
--- a/_delphi_utils_python/delphi_utils/__init__.py
+++ b/_delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday
 
-__version__ = "0.3.2"
+__version__ = "0.3.3"
diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py
index 782b49ad5..2e495d231 100644
--- a/_delphi_utils_python/setup.py
+++ b/_delphi_utils_python/setup.py
@@ -26,7 +26,7 @@
 setup(
     name="delphi_utils",
-    version="0.3.2",
+    version="0.3.3",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py
index 68f05454d..0694bfdfa 100644
--- a/_delphi_utils_python/tests/test_archive.py
+++ b/_delphi_utils_python/tests/test_archive.py
@@ -1,4 +1,4 @@
-
+from dataclasses import dataclass, field
 from io import StringIO, BytesIO
 from os import listdir, mkdir
 from os.path import join
@@ -22,125 +22,197 @@
     "missing_val": "Int64", "missing_se": "Int64", "missing_sample_size": "Int64"
 }
 
-CSVS_BEFORE = {
-    # All rows unchanged
-    "csv0": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.000000001, 2.00000002, 3.00000003],
-        "se": [0.1, 0.2, 0.3],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.NOT_MISSING] * 3,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # One row deleted and one row added
-    "csv1": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # File deleted
-    "csv2": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # All rows common, but missing columns added
-    "csv4": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0]
-    }),
-
-    # Same as 1, but no missing columns ("old-style" file)
-    "csv5": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]
-    })
-}
-
-CSVS_AFTER = {
-    # All rows unchanged
-    "csv0": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [0.10000001, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.NOT_MISSING] * 3,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # One row deleted and one row added
-    "csv1": pd.DataFrame({
-        "geo_id": ["1", "2", "4"],
-        "val": [1.0, 2.1, 4.0],
-        "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # File added
-    "csv3": pd.DataFrame({
-        "geo_id": ["2"],
-        "val": [2.0000002],
-        "se": [0.2],
-        "sample_size": [20.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # All rows common, but missing columns added
-    "csv4": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
-    # and present again as nan, row 5 deleted previously and absent from file
-    # (no missing columns)
-    "csv5": pd.DataFrame({
-        "geo_id": ["1", "2", "4"],
-        "val": [1.0, 2.1, 4.0],
-        "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0]
-    })
+class Example:
+    def __init__(self, before, after, diff):
+        def fix_df(df):
+            if isinstance(df, pd.DataFrame):
+                return Example._set_df_datatypes(df, CSV_DTYPES)
+            return df
+        self.before = fix_df(before)
+        self.after = fix_df(after)
+        self.diff = fix_df(diff)
+    @staticmethod
+    def _set_df_datatypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
+        df = df.copy()
+        for k, v in dtypes.items():
+            if k in df.columns:
+                df[k] = df[k].astype(v)
+        return df
+
+@dataclass
+class Expecteds:
+    deleted: List[str]
+    common_diffs: Dict[str, str]
+    new: List[str]
+    raw_exports: List[str] = field(init=False)
+    diffed_exports: List[str] = field(init=False)
+    filtered_exports: List[str] = field(init=False)
+
+    def __post_init__(self):
+        self.raw_exports = list(self.common_diffs.keys()) + self.new
+        self.diffed_exports = self.raw_exports + [diff_name for diff_name in self.common_diffs.values() if diff_name is not None]
+        self.filtered_exports = [f.replace(".diff", "") for f in self.diffed_exports if f.endswith(".diff")] + self.new
+EMPTY = "empty"
+CSVS = {
+    "unchanged": Example(  # was: csv0
+        before=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.000000001, 2.00000002, 3.00000003],
+            "se": [0.1, 0.2, 0.3],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        after=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.0, 2.0, 3.0],  # ignore changes beyond 7 decimal points
+            "se": [0.10000001, 0.20000002, 0.30000003],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        diff=EMPTY  # unchanged names are listed in common files but have no diff file
+    ),
+    "mod_2_del_3_add_4": Example(  # was: csv1
+        before=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.0, 2.0, 3.0],
+            "se": [np.nan, 0.20000002, 0.30000003],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        after=pd.DataFrame({
+            "geo_id": ["1", "2", "4"],
+            "val": [1.0, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [10.0, 21.0, 40.0],
"missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), + diff=pd.DataFrame({ + "geo_id": ["2", "3", "4"], + "val": [2.1, np.nan, 4.0], + "se": [0.21, np.nan, np.nan], + "sample_size": [21.0, np.nan, 40.0], + "missing_val": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + }) + ), + "delete_file": Example( # was: csv2 + before=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + after=None, + diff=None + ), + "add_file": Example( # was: csv3 + before=None, + after=pd.DataFrame({ + "geo_id": ["2"], + "val": [2.0000002], + "se": [0.2], + "sample_size": [20.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + diff=None + ), + "unchanged_old_new": Example( # was: csv4 + before=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0] + }), + after=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + diff=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }) + ), + "mod_2_del_3_add_4_old_old": Example( # was: csv5 + before=pd.DataFrame({ + "geo_id": ["1", "2", "3"], + "val": [1.0, 2.0, 3.0], + "se": [np.nan, 0.20000002, 0.30000003], + "sample_size": [10.0, 20.0, 30.0] + }), + after=pd.DataFrame({ + "geo_id": ["1", "2", "4"], + "val": [1.0, 2.1, 4.0], + "se": [np.nan, 0.21, np.nan], + "sample_size": [10.0, 21.0, 40.0] + }), + diff=pd.DataFrame({ + "geo_id": ["2", "3", "4"], + "val": [2.1, np.nan, 4.0], + "se": [0.21, np.nan, np.nan], + "sample_size": [21.0, np.nan, 40.0], + "missing_val": [np.nan, Nans.DELETED, np.nan], + "missing_se": [np.nan, Nans.DELETED, np.nan], + "missing_sample_size": [np.nan, Nans.DELETED, np.nan], + }) + ) } +EXPECTEDS = Expecteds( + deleted=["delete_file.csv"], + common_diffs=dict( + (f"{csv_name}.csv", None if dfs.diff is EMPTY else f"{csv_name}.csv.diff") + for csv_name, dfs in CSVS.items() if dfs.diff is not None + ), + new=["add_file.csv"], +) +# check for incomplete modifications to tests +assert set(EXPECTEDS.new) == set(f"{csv_name}.csv" for csv_name, dfs in CSVS.items() if dfs.before is None), \ + "Bad programmer: added more new files to CSVS.after without updating EXPECTEDS.new" def _assert_frames_equal_ignore_row_order(df1, df2, index_cols: List[str] = None): return assert_frame_equal(df1.set_index(index_cols).sort_index(), df2.set_index(index_cols).sort_index()) -def _set_df_datatypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame: - df = df.copy() - for k, v in dtypes.items(): - if k in df.columns: - df[k] = df[k].astype(v) - return df - - -class TestArchiveDiffer: +class ArchiveDifferTestlike: + def set_up(self, tmp_path): + cache_dir = join(str(tmp_path), "cache") + export_dir = join(str(tmp_path), "export") + mkdir(cache_dir) + mkdir(export_dir) + return cache_dir, export_dir + def check_filtered_exports(self, 
+    def check_filtered_exports(self, export_dir):
+        assert set(listdir(export_dir)) == set(EXPECTEDS.filtered_exports)
+        for f in EXPECTEDS.filtered_exports:
+            example = CSVS[f.replace(".csv", "")]
+            _assert_frames_equal_ignore_row_order(
+                pd.read_csv(join(export_dir, f), dtype=CSV_DTYPES),
+                example.after if example.diff is None else example.diff,
+                index_cols=["geo_id"]
+            )
+
+class TestArchiveDiffer(ArchiveDifferTestlike):
 
     def test_stubs(self):
         arch_diff = ArchiveDiffer("cache", "export")
@@ -152,30 +224,7 @@ def test_stubs(self):
         arch_diff.archive_exports(None)
 
     def test_diff_and_filter_exports(self, tmp_path):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
-
-        expected_csv1_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["2", "3", "4"],
-            "val": [2.1, np.nan, 4.0],
-            "se": [0.21, np.nan, np.nan],
-            "sample_size": [21.0, np.nan, 40.0],
-            "missing_val": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-            "missing_se": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-            "missing_sample_size": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-        }), dtypes=CSV_DTYPES)
-        expected_csv4_diff = _set_df_datatypes(CSVS_AFTER["csv4"], dtypes=CSV_DTYPES)
-        expected_csv5_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["2", "3", "4"],
-            "val": [2.1, np.nan, 4.0],
-            "se": [0.21, np.nan, np.nan],
-            "sample_size": [21.0, np.nan, 40.0],
-            "missing_val": [np.nan, Nans.DELETED, np.nan],
-            "missing_se": [np.nan, Nans.DELETED, np.nan],
-            "missing_sample_size": [np.nan, Nans.DELETED, np.nan],
-        }), dtypes=CSV_DTYPES)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         arch_diff = ArchiveDiffer(cache_dir, export_dir)
 
@@ -187,47 +236,38 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.diff_exports()
 
         # Simulate cache updated, and signal ran finish
-        for csv_name, df in CSVS_BEFORE.items():
-            df.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            if dfs.before is not None:
+                dfs.before.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
         arch_diff._cache_updated = True
 
         deleted_files, common_diffs, new_files = arch_diff.diff_exports()
 
-        # Check return values
-        assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
+        # Check deleted, common, diffed, and new file names match expected
+        assert set(deleted_files) == {join(cache_dir, f) for f in EXPECTEDS.deleted}
         assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
-        assert set(new_files) == {join(export_dir, "csv3.csv")}
-        assert common_diffs[join(export_dir, "csv0.csv")] is None
-        assert common_diffs[join(export_dir, "csv1.csv")] == join(
-            export_dir, "csv1.csv.diff")
+            join(export_dir, csv_name) for csv_name in EXPECTEDS.common_diffs}
+        assert set(new_files) == {join(export_dir, f) for f in EXPECTEDS.new}
+        # Check that diff file names match the expected names
+        assert all(
+            common_diffs[join(export_dir, csv_name)] ==
+            (None if diff_name is None else join(export_dir, diff_name))
+            for csv_name, diff_name in EXPECTEDS.common_diffs.items()
+        )
 
         # Check filesystem for actual files
-        assert set(listdir(export_dir)) == {
-            "csv0.csv",
-            "csv1.csv", "csv1.csv.diff",
-            "csv3.csv",
-            "csv4.csv", "csv4.csv.diff",
-            "csv5.csv", "csv5.csv.diff",
-        }
-
-        # Check that the files look as expected
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv1_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv4.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv4_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv5.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv5_diff,
-            index_cols=["geo_id"]
-        )
+        assert set(listdir(export_dir)) == set(EXPECTEDS.diffed_exports)
+
+        # Check that the diff files look as expected
+        for key, diff_name in EXPECTEDS.common_diffs.items():
+            if diff_name is None: continue
+            _assert_frames_equal_ignore_row_order(
+                pd.read_csv(join(export_dir, diff_name), dtype=CSV_DTYPES),
+                CSVS[key.replace(".csv", "")].diff,
+                index_cols=["geo_id"]
+            )
 
         # Test filter_exports
 
@@ -243,49 +283,24 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.filter_exports(common_diffs)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            expected_csv1_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv4.csv"), dtype=CSV_DTYPES),
-            expected_csv4_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv5.csv"), dtype=CSV_DTYPES),
-            expected_csv5_diff,
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
 AWS_CREDENTIALS = {
     "aws_access_key_id": "FAKE_TEST_ACCESS_KEY_ID",
     "aws_secret_access_key": "FAKE_TEST_SECRET_ACCESS_KEY",
 }
 
-
-@pytest.fixture(scope="function")
-def s3_client():
-    with mock_s3():
-        yield Session(**AWS_CREDENTIALS).client("s3")
-
-
-class TestS3ArchiveDiffer:
+class TestS3ArchiveDiffer(ArchiveDifferTestlike):
 
     bucket_name = "test-bucket"
     indicator_prefix = "test"
 
     @mock_s3
-    def test_update_cache(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_update_cache(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        csv1 = CSVS_BEFORE["csv1"]
-        csv2 = CSVS_AFTER["csv1"]
+        csv1 = CSVS["mod_2_del_3_add_4"].before
+        csv2 = CSVS["mod_2_del_3_add_4"].after
        csv1_buf = StringIO()
         csv2_buf = StringIO()
         csv1.to_csv(csv1_buf, index=False)
@@ -316,13 +331,11 @@ def test_update_cache(self, tmp_path):
         assert set(listdir(cache_dir)) == {"csv1.csv", "csv2.csv"}
 
     @mock_s3
-    def test_archive_exports(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_archive_exports(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        csv1 = _set_df_datatypes(CSVS_BEFORE["csv1"], dtypes=CSV_DTYPES)
+        csv1 = CSVS["mod_2_del_3_add_4"].before
         csv1.to_csv(join(export_dir, "csv1.csv"), index=False)
 
         s3_client.create_bucket(Bucket=self.bucket_name)
@@ -347,25 +360,23 @@ def test_archive_exports(self, tmp_path):
         assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), csv1)
 
     @mock_s3
-    def test_run(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_run(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        # Set up current buckets to be `CSVS_BEFORE`.
         s3_client.create_bucket(Bucket=self.bucket_name)
-        for csv_name, df in CSVS_BEFORE.items():
-            csv_buf = StringIO()
-            df.to_csv(csv_buf, index=False)
-            s3_client.put_object(
-                Bucket=self.bucket_name,
-                Key=f"{self.indicator_prefix}/{csv_name}.csv",
-                Body=BytesIO(csv_buf.getvalue().encode()))
-
-        # Set up the exported files to be `CSVS_AFTER`.
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            # Set up current buckets to be the 'before' files.
+            if dfs.before is not None:
+                csv_buf = StringIO()
+                dfs.before.to_csv(csv_buf, index=False)
+                s3_client.put_object(
+                    Bucket=self.bucket_name,
+                    Key=f"{self.indicator_prefix}/{csv_name}.csv",
+                    Body=BytesIO(csv_buf.getvalue().encode()))
+            # Set up the exported files to be the 'after' files.
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
 
         # Create and run differ.
         arch_diff = S3ArchiveDiffer(
@@ -375,35 +386,19 @@ def test_run(self, tmp_path):
         arch_diff.run()
 
         # Check that the buckets now contain the exported files.
-        for csv_name, df in CSVS_AFTER.items():
+        for csv_name, dfs in CSVS.items():
+            if dfs.after is None:
+                continue
             body = s3_client.get_object(Bucket=self.bucket_name, Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
-            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), _set_df_datatypes(df, dtypes=CSV_DTYPES))
+            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), dfs.after)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        csv1_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["3", "2", "4"],
-            "val": [np.nan, 2.1, 4.0],
-            "se": [np.nan, 0.21, np.nan],
-            "sample_size": [np.nan, 21.0, 40.0],
-            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-        }), dtypes=CSV_DTYPES)
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            csv1_diff,
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
-class TestGitArchiveDiffer:
+class TestGitArchiveDiffer(ArchiveDifferTestlike):
 
     def test_init_args(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         with pytest.raises(AssertionError):
             GitArchiveDiffer(cache_dir, export_dir,
@@ -419,10 +414,7 @@ def test_init_args(self, tmp_path):
         assert arch_diff.branch == arch_diff.repo.active_branch
 
     def test_update_cache(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         Repo.init(cache_dir)
 
@@ -441,10 +433,7 @@ def test_update_cache(self, tmp_path):
         assert arch_diff2._cache_updated
 
     def test_diff_exports(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         branch_name = "test-branch"
 
@@ -501,10 +490,7 @@ def test_diff_exports(self, tmp_path):
         assert set(new_files) == set()
 
     def test_archive_exports(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         repo = Repo.init(cache_dir)
         repo.index.commit(message="Initial commit")
 
@@ -561,10 +547,7 @@ def test_archive_exports(self, tmp_path):
         assert repo.active_branch.set_commit("HEAD~1").commit == orig_commit
 
     def test_run(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         branch_name = "test-branch"
 
@@ -572,13 +555,13 @@ def test_run(self, tmp_path):
         repo.index.commit(message="Initial commit")
 
         original_branch = repo.active_branch
-        # Set up the current cache to contain `CSVS_BEFORE`.
-        for csv_name, df in CSVS_BEFORE.items():
-            df.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
-
-        # Set up the current cache to contain `CSVS_AFTER`.
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            # Set up the current cache to contain 'before' files
+            if dfs.before is not None:
+                dfs.before.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
+            # Set up the current export to contain 'after' files
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
 
         # Create and run differ.
         arch_diff = GitArchiveDiffer(
@@ -586,29 +569,15 @@ def test_run(self, tmp_path):
             branch_name=branch_name, override_dirty=True)
         arch_diff.run()
 
-        # Check that the archive branch contains `CSVS_AFTER`.
+        # Check that the archive branch contains 'after' files.
         arch_diff.get_branch(branch_name).checkout()
-        for csv_name, df in CSVS_AFTER.items():
-            assert_frame_equal(pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), _set_df_datatypes(df, dtypes=CSV_DTYPES))
+        for csv_name, dfs in CSVS.items():
+            if dfs.after is None: continue
+            assert_frame_equal(pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), dfs.after)
 
         original_branch.checkout()
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        csv1_diff = pd.DataFrame({
-            "geo_id": ["3", "2", "4"],
-            "val": [np.nan, 2.1, 4.0],
-            "se": [np.nan, 0.21, np.nan],
-            "sample_size": [np.nan, 21.0, 40.0],
-            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-        })
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            _set_df_datatypes(csv1_diff, dtypes=CSV_DTYPES),
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
 class TestFromParams:
     """Tests for creating archive differs from params."""
diff --git a/facebook/delphiFacebook/R/binary.R b/facebook/delphiFacebook/R/binary.R
index 08305d375..fddc68537 100644
--- a/facebook/delphiFacebook/R/binary.R
+++ b/facebook/delphiFacebook/R/binary.R
@@ -135,6 +135,26 @@ get_binary_indicators <- function() {
     "smoothed_try_vaccinate_1m", "weight_unif", "v_try_vaccinate_1m", 6, compute_binary_response, jeffreys_binary,
     "smoothed_wtry_vaccinate_1m", "weight", "v_try_vaccinate_1m", 6, compute_binary_response, jeffreys_binary,
 
+    "smoothed_winitial_dose_one_of_one", "weight", "v_initial_dose_one_of_one", 6, compute_binary_response, jeffreys_multinomial_factory(4),
"smoothed_winitial_dose_one_of_two", "weight", "v_initial_dose_one_of_two", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_winitial_dose_two_of_two", "weight", "v_initial_dose_two_of_two", 6, compute_binary_response, jeffreys_multinomial_factory(4), + + "smoothed_wvaccinated_one_booster", "weight", "v_vaccinated_one_booster", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_two_or_more_boosters", "weight", "v_vaccinated_two_or_more_boosters", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_no_booster", "weight", "v_vaccinated_no_booster", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_at_least_one_booster", "weight", "v_vaccinated_at_least_one_booster", 6, compute_binary_response, jeffreys_binary, + + "smoothed_wvaccinated_booster_accept", "weight", "v_vaccinated_booster_accept", 6, compute_binary_response, jeffreys_binary, + "smoothed_wvaccinated_booster_hesitant", "weight", "v_vaccinated_booster_hesitant", 6, compute_binary_response, jeffreys_binary, + + "smoothed_wvaccinated_booster_defyes", "weight", "v_vaccinated_booster_defyes", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_probyes", "weight", "v_vaccinated_booster_probyes", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_probno", "weight", "v_vaccinated_booster_probno", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_defno", "weight", "v_vaccinated_booster_defno", 6, compute_binary_response, jeffreys_multinomial_factory(4), + + "smoothed_wflu_vaccinated_2021", "weight", "v_flu_vaccinated_2021", 6, compute_binary_response, jeffreys_binary, + + # who would make more likely to accept vaccine "smoothed_vaccine_likely_friends", "weight_unif", "v_vaccine_likely_friends", 6, compute_binary_response, jeffreys_binary, "smoothed_wvaccine_likely_friends", "weight", "v_vaccine_likely_friends", 6, compute_binary_response, jeffreys_binary, diff --git a/facebook/delphiFacebook/R/contingency_indicators.R b/facebook/delphiFacebook/R/contingency_indicators.R index 426b362d0..205b06588 100644 --- a/facebook/delphiFacebook/R/contingency_indicators.R +++ b/facebook/delphiFacebook/R/contingency_indicators.R @@ -150,6 +150,26 @@ get_aggs <- function() { "pct_accept_vaccine_no_appointment_probno", "v_accept_vaccine_no_appointment_probno", compute_binary, jeffreys_multinomial_factory(4), "pct_accept_vaccine_no_appointment_defno", "v_accept_vaccine_no_appointment_defno", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_one_of_one", "v_initial_dose_one_of_one", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_one_of_two", "v_initial_dose_one_of_two", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_two_of_two", "v_initial_dose_two_of_two", compute_binary, jeffreys_multinomial_factory(4), + + "pct_vaccinated_one_booster", "v_vaccinated_one_booster", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_two_or_more_boosters", "v_vaccinated_two_or_more_boosters", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_no_booster", "v_vaccinated_no_booster", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_at_least_one_booster", "v_vaccinated_at_least_one_booster", compute_binary, jeffreys_binary, + + "pct_vaccinated_booster_accept", "v_vaccinated_booster_accept", compute_binary, 
+    "pct_vaccinated_booster_hesitant", "v_vaccinated_booster_hesitant", compute_binary, jeffreys_binary,
+
+    "pct_vaccinated_booster_defyes", "v_vaccinated_booster_defyes", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_probyes", "v_vaccinated_booster_probyes", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_probno", "v_vaccinated_booster_probno", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_defno", "v_vaccinated_booster_defno", compute_binary, jeffreys_multinomial_factory(4),
+
+    "pct_flu_vaccinated_2021", "v_flu_vaccinated_2021", compute_binary, jeffreys_binary,
+
     # vaccine timing
     "pct_vaccine_timing_weeks", "v_vaccine_timing_weeks", compute_binary, jeffreys_multinomial_factory(7),
     "pct_vaccine_timing_onemonth", "v_vaccine_timing_onemonth", compute_binary, jeffreys_multinomial_factory(7),
diff --git a/facebook/delphiFacebook/R/variables.R b/facebook/delphiFacebook/R/variables.R
index b4461776d..bfef39030 100644
--- a/facebook/delphiFacebook/R/variables.R
+++ b/facebook/delphiFacebook/R/variables.R
@@ -859,6 +859,50 @@ code_vaccines <- function(input_data, wave) {
     input_data$v_covid_vaccinated_friends <- NA
   }
 
+  if ("V2d" %in% names(input_data)) {
+    input_data$v_initial_dose_one_of_one <- input_data$V2d == 1
+    input_data$v_initial_dose_one_of_two <- input_data$V2d == 2
+    input_data$v_initial_dose_two_of_two <- input_data$V2d == 3
+  } else {
+    input_data$v_initial_dose_one_of_one <- NA
+    input_data$v_initial_dose_one_of_two <- NA
+    input_data$v_initial_dose_two_of_two <- NA
+  }
+
+  if ("V2b" %in% names(input_data)) {
+    input_data$v_vaccinated_one_booster <- input_data$V2b == 1
+    input_data$v_vaccinated_two_or_more_boosters <- input_data$V2b == 2
+    input_data$v_vaccinated_at_least_one_booster <- input_data$V2b == 1 | input_data$V2b == 2
+    input_data$v_vaccinated_no_booster <- input_data$V2b == 3
+  } else {
+    input_data$v_vaccinated_one_booster <- NA
+    input_data$v_vaccinated_two_or_more_boosters <- NA
+    input_data$v_vaccinated_at_least_one_booster <- NA
+    input_data$v_vaccinated_no_booster <- NA
+  }
+
+  if ("V2c" %in% names(input_data)) {
+    input_data$v_vaccinated_booster_accept <- input_data$V2c == 1 | input_data$V2c == 2
+    input_data$v_vaccinated_booster_hesitant <- input_data$V2c == 3 | input_data$V2c == 4
+    input_data$v_vaccinated_booster_defyes <- input_data$V2c == 1
+    input_data$v_vaccinated_booster_probyes <- input_data$V2c == 2
+    input_data$v_vaccinated_booster_probno <- input_data$V2c == 3
+    input_data$v_vaccinated_booster_defno <- input_data$V2c == 4
+  } else {
+    input_data$v_vaccinated_booster_accept <- NA
+    input_data$v_vaccinated_booster_hesitant <- NA
+    input_data$v_vaccinated_booster_defyes <- NA
+    input_data$v_vaccinated_booster_probyes <- NA
+    input_data$v_vaccinated_booster_probno <- NA
+    input_data$v_vaccinated_booster_defno <- NA
+  }
+
+  if ("C17b" %in% names(input_data)) {
+    input_data$v_flu_vaccinated_2021 <- input_data$C17b == 1
+  } else {
+    input_data$v_flu_vaccinated_2021 <- NA
+  }
+
   return(input_data)
 }
diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R
index e61a9d05d..05453cadd 100644
--- a/facebook/qsf-tools/generate-codebook.R
+++ b/facebook/qsf-tools/generate-codebook.R
@@ -76,9 +76,7 @@ process_qsf <- function(path_to_qsf,
 
   # get the text of the question:
   questions <- displayed_questions %>%
-    map_chr(~ .x$Payload$QuestionText) %>%
-    str_remove_all("<[^<>]+>") %>%
-    str_replace_all("&nbsp;", " ")
+    map_chr(~ .x$Payload$QuestionText)
 
   # get the type of question:
   type_map <- c(MC = "Multiple choice", TE = "Text", Matrix = "Matrix")
diff --git a/facebook/qsf-tools/qsf-differ.R b/facebook/qsf-tools/qsf-differ.R
index 4ac05c937..33ad22c03 100644
--- a/facebook/qsf-tools/qsf-differ.R
+++ b/facebook/qsf-tools/qsf-differ.R
@@ -14,6 +14,8 @@ options(warn = 1)
 suppressPackageStartupMessages({
   library(jsonlite)
   library(stringr)
+  library(dplyr)
+  library(readr)
   source("qsf-utils.R")
 })
 
@@ -25,7 +27,18 @@ diff_qsf_files <- function(old_qsf_path, new_qsf_path) {
   old_qsf <- get_qsf_file(old_qsf_path)
   new_qsf <- get_qsf_file(new_qsf_path)
 
-  diff_surveys(old_qsf, new_qsf)
+  old_wave <- get_wave(old_qsf_path)
+  new_wave <- get_wave(new_qsf_path)
+
+  out <- diff_surveys(old_qsf, new_qsf) %>%
+    mutate(
+      old_wave = old_wave, new_wave = new_wave
+    ) %>%
+    select(new_wave, old_wave, everything())
+  write_csv(
+    out,
+    paste0("diff_", old_wave, "-", new_wave, ".csv", collapse="")
+  )
 
   return(NULL)
 }
@@ -126,19 +139,22 @@ diff_surveys <- function(old_qsf, new_qsf) {
   added <- setdiff(new_shown_items, old_shown_items)
   removed <- setdiff(old_shown_items, new_shown_items)
 
-  print_questions(added, "Added", new_questions)
-  print_questions(removed, "Removed", old_questions)
+  added_df <- create_diff_df(added, "Added", new_questions)
+  removed_df <- create_diff_df(removed, "Removed", old_questions)
 
   ## For questions that appear in both surveys, check for changes in wording,
   ## display logic, and answer options.
   shared <- intersect(old_shown_items, new_shown_items)
 
-  diff_question(shared, "QuestionText", old_questions, new_questions)
-  diff_question(shared, "DisplayLogic", old_questions, new_questions)
-  diff_question(shared, "Choices", old_questions, new_questions)
-  diff_question(shared, "Subquestions", old_questions, new_questions)
+  text_df <- diff_question(shared, "QuestionText", old_questions, new_questions)
+  logic_df <- diff_question(shared, "DisplayLogic", old_questions, new_questions)
+  choice_df <- diff_question(shared, "Choices", old_questions, new_questions)
+  subq_df <- diff_question(shared, "Subquestions", old_questions, new_questions)
 
-  return(NULL)
+  out <- bind_rows(
+    added_df, removed_df, text_df, logic_df, choice_df, subq_df
+  )
+  return(out)
 }
 
 #' Compare a single question field in the two surveys.
@@ -159,9 +175,9 @@ diff_question <- function(names, change_type=c("Choices", "QuestionText", "Displ
       changed <- append(changed, question)
     }
   }
-  print_questions(changed, change_type, new_qsf)
+  out <- create_diff_df(changed, change_type, new_qsf)
 
-  return(NULL)
+  return(out)
 }
 
-#' Print results with custom message for each possible change type.
+#' Build a data frame of changes, with a description for each possible change type.
 #'
@@ -172,26 +188,27 @@ diff_question <- function(names, change_type=c("Choices", "QuestionText", "Displ
 #' @param reference_qsf named list of trimmed output from `get_qsf_file` for survey that
 #'   contains descriptive info about a particular type of change. For "removed"
 #'   questions, should be older survey, else newer survey.
-print_questions <- function(questions, change_type=c("Added", "Removed", "Choices", "QuestionText", "DisplayLogic", "Subquestions"), reference_qsf) {
+create_diff_df <- function(questions, change_type=c("Added", "Removed", "Choices", "QuestionText", "DisplayLogic", "Subquestions"), reference_qsf) {
+  out <- data.frame()
+
   if ( length(questions) > 0 ) {
     change_type <- match.arg(change_type)
 
-    text_options <- list(
-      Added = "Added: item %s (%s)\n",
-      Removed = "Removed: item %s (%s)\n",
-      QuestionText = "Question wording changed: item %s (%s)\n",
-      DisplayLogic = "Display logic changed: item %s (%s)\n",
-      Choices = "Answer choices changed: item %s (%s)\n",
-      Subquestions = "Matrix subquestions changed: item %s (%s)\n"
-    )
-
+    change_descriptions <- list(
+      Added = "Item added",
+      Removed = "Item removed",
+      QuestionText = "Question wording changed",
+      DisplayLogic = "Display logic changed",
+      Choices = "Answer choices changed",
+      Subquestions = "Matrix subquestions changed"
+    )
     questions <- sort(questions)
     qids <- sapply(questions, function(question) {
       reference_qsf[[question]]$QuestionID
     })
 
-    cat("\n  ")
-    cat(sprintf(text_options[[change_type]], questions, qids))
+    out <- data.frame(change_type=change_descriptions[[change_type]], item=questions, qid=qids)
   }
-  return(NULL)
+
+  return(out)
 }