diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 9c897f41f..a0ea8cd83 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.6
+current_version = 0.3.7
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg
index b0c60af24..8061dd5cf 100644
--- a/_delphi_utils_python/.bumpversion.cfg
+++ b/_delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.2
+current_version = 0.3.3
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False
diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py
index 35d552179..7374a0605 100644
--- a/_delphi_utils_python/delphi_utils/__init__.py
+++ b/_delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday
 
-__version__ = "0.3.2"
+__version__ = "0.3.3"
diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py
index 782b49ad5..2e495d231 100644
--- a/_delphi_utils_python/setup.py
+++ b/_delphi_utils_python/setup.py
@@ -26,7 +26,7 @@
 setup(
     name="delphi_utils",
-    version="0.3.2",
+    version="0.3.3",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/_delphi_utils_python/tests/test_archive.py b/_delphi_utils_python/tests/test_archive.py
index 68f05454d..0694bfdfa 100644
--- a/_delphi_utils_python/tests/test_archive.py
+++ b/_delphi_utils_python/tests/test_archive.py
@@ -1,4 +1,4 @@
-
+from dataclasses import dataclass, field
 from io import StringIO, BytesIO
 from os import listdir, mkdir
 from os.path import join
@@ -22,125 +22,197 @@
     "missing_val": "Int64", "missing_se": "Int64", "missing_sample_size": "Int64"
 }
 
-CSVS_BEFORE = {
-    # All rows unchanged
-    "csv0": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.000000001, 2.00000002, 3.00000003],
-        "se": [0.1, 0.2, 0.3],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.NOT_MISSING] * 3,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # One row deleted and one row added
-    "csv1": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # File deleted
-    "csv2": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # All rows common, but missing columns added
-    "csv4": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0]
-    }),
-
-    # Same as 1, but no missing columns ("old-style" file)
-    "csv5": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [np.nan, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0]
-    })
-}
-
-CSVS_AFTER = {
-    # All rows unchanged
-    "csv0": pd.DataFrame({
-        "geo_id": ["1", "2", "3"],
-        "val": [1.0, 2.0, 3.0],
-        "se": [0.10000001, 0.20000002, 0.30000003],
-        "sample_size": [10.0, 20.0, 30.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.NOT_MISSING] * 3,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # One row deleted and one row added
-    "csv1": pd.DataFrame({
-        "geo_id": ["1", "2", "4"],
-        "val": [1.0, 2.1, 4.0],
-        "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0],
-        "missing_val": [Nans.NOT_MISSING] * 3,
-        "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
-        "missing_sample_size": [Nans.NOT_MISSING] * 3,
-    }),
-
-    # File added
-    "csv3": pd.DataFrame({
-        "geo_id": ["2"],
-        "val": [2.0000002],
-        "se": [0.2],
-        "sample_size": [20.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # All rows common, but missing columns added
-    "csv4": pd.DataFrame({
-        "geo_id": ["1"],
-        "val": [1.0],
-        "se": [0.1],
-        "sample_size": [10.0],
-        "missing_val": [Nans.NOT_MISSING],
-        "missing_se": [Nans.NOT_MISSING],
-        "missing_sample_size": [Nans.NOT_MISSING],
-    }),
-
-    # Row 1 same, row 2 deleted, row 3 added, row 4 deleted previously
-    # and present again as nan, row 5 deleted previously and absent from file
-    # (no missing columns)
-    "csv5": pd.DataFrame({
-        "geo_id": ["1", "2", "4"],
-        "val": [1.0, 2.1, 4.0],
-        "se": [np.nan, 0.21, np.nan],
-        "sample_size": [10.0, 21.0, 40.0]
-    })
+class Example:
+    def __init__(self, before, after, diff):
+        def fix_df(df):
+            if isinstance(df, pd.DataFrame):
+                return Example._set_df_datatypes(df, CSV_DTYPES)
+            return df
+        self.before = fix_df(before)
+        self.after = fix_df(after)
+        self.diff = fix_df(diff)
+    @staticmethod
+    def _set_df_datatypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
+        df = df.copy()
+        for k, v in dtypes.items():
+            if k in df.columns:
+                df[k] = df[k].astype(v)
+        return df
+
+@dataclass
+class Expecteds:
+    deleted: List[str]
+    common_diffs: Dict[str, str]
+    new: List[str]
+    raw_exports: List[str] = field(init=False)
+    diffed_exports: List[str] = field(init=False)
+    filtered_exports: List[str] = field(init=False)
+
+    def __post_init__(self):
+        self.raw_exports = list(self.common_diffs.keys()) + self.new
+        self.diffed_exports = self.raw_exports + [diff_name for diff_name in self.common_diffs.values() if diff_name is not None]
+        self.filtered_exports = [f.replace(".diff", "") for f in self.diffed_exports if f.endswith(".diff")] + self.new
+EMPTY = "empty"
+CSVS = {
+    "unchanged": Example(  # was: csv0
+        before=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.000000001, 2.00000002, 3.00000003],
+            "se": [0.1, 0.2, 0.3],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        after=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.0, 2.0, 3.0],  # ignore changes beyond 7 decimal points
+            "se": [0.10000001, 0.20000002, 0.30000003],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.NOT_MISSING] * 3,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        diff=EMPTY  # unchanged names are listed in common files but have no diff file
+    ),
+    "mod_2_del_3_add_4": Example(  # was: csv1
+        before=pd.DataFrame({
+            "geo_id": ["1", "2", "3"],
+            "val": [1.0, 2.0, 3.0],
+            "se": [np.nan, 0.20000002, 0.30000003],
+            "sample_size": [10.0, 20.0, 30.0],
+            "missing_val": [Nans.NOT_MISSING] * 3,
+            "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2,
+            "missing_sample_size": [Nans.NOT_MISSING] * 3,
+        }),
+        after=pd.DataFrame({
+            "geo_id": ["1", "2", "4"],
+            "val": [1.0, 2.1, 4.0],
+            "se": [np.nan, 0.21, np.nan],
+            "sample_size": [10.0, 21.0, 40.0],
"missing_val": [Nans.NOT_MISSING] * 3, + "missing_se": [Nans.CENSORED] + [Nans.NOT_MISSING] * 2, + "missing_sample_size": [Nans.NOT_MISSING] * 3, + }), + diff=pd.DataFrame({ + "geo_id": ["2", "3", "4"], + "val": [2.1, np.nan, 4.0], + "se": [0.21, np.nan, np.nan], + "sample_size": [21.0, np.nan, 40.0], + "missing_val": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING], + }) + ), + "delete_file": Example( # was: csv2 + before=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + after=None, + diff=None + ), + "add_file": Example( # was: csv3 + before=None, + after=pd.DataFrame({ + "geo_id": ["2"], + "val": [2.0000002], + "se": [0.2], + "sample_size": [20.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + diff=None + ), + "unchanged_old_new": Example( # was: csv4 + before=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0] + }), + after=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }), + diff=pd.DataFrame({ + "geo_id": ["1"], + "val": [1.0], + "se": [0.1], + "sample_size": [10.0], + "missing_val": [Nans.NOT_MISSING], + "missing_se": [Nans.NOT_MISSING], + "missing_sample_size": [Nans.NOT_MISSING], + }) + ), + "mod_2_del_3_add_4_old_old": Example( # was: csv5 + before=pd.DataFrame({ + "geo_id": ["1", "2", "3"], + "val": [1.0, 2.0, 3.0], + "se": [np.nan, 0.20000002, 0.30000003], + "sample_size": [10.0, 20.0, 30.0] + }), + after=pd.DataFrame({ + "geo_id": ["1", "2", "4"], + "val": [1.0, 2.1, 4.0], + "se": [np.nan, 0.21, np.nan], + "sample_size": [10.0, 21.0, 40.0] + }), + diff=pd.DataFrame({ + "geo_id": ["2", "3", "4"], + "val": [2.1, np.nan, 4.0], + "se": [0.21, np.nan, np.nan], + "sample_size": [21.0, np.nan, 40.0], + "missing_val": [np.nan, Nans.DELETED, np.nan], + "missing_se": [np.nan, Nans.DELETED, np.nan], + "missing_sample_size": [np.nan, Nans.DELETED, np.nan], + }) + ) } +EXPECTEDS = Expecteds( + deleted=["delete_file.csv"], + common_diffs=dict( + (f"{csv_name}.csv", None if dfs.diff is EMPTY else f"{csv_name}.csv.diff") + for csv_name, dfs in CSVS.items() if dfs.diff is not None + ), + new=["add_file.csv"], +) +# check for incomplete modifications to tests +assert set(EXPECTEDS.new) == set(f"{csv_name}.csv" for csv_name, dfs in CSVS.items() if dfs.before is None), \ + "Bad programmer: added more new files to CSVS.after without updating EXPECTEDS.new" def _assert_frames_equal_ignore_row_order(df1, df2, index_cols: List[str] = None): return assert_frame_equal(df1.set_index(index_cols).sort_index(), df2.set_index(index_cols).sort_index()) -def _set_df_datatypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame: - df = df.copy() - for k, v in dtypes.items(): - if k in df.columns: - df[k] = df[k].astype(v) - return df - - -class TestArchiveDiffer: +class ArchiveDifferTestlike: + def set_up(self, tmp_path): + cache_dir = join(str(tmp_path), "cache") + export_dir = join(str(tmp_path), "export") + mkdir(cache_dir) + mkdir(export_dir) + return cache_dir, export_dir + def check_filtered_exports(self, 
+    def check_filtered_exports(self, export_dir):
+        assert set(listdir(export_dir)) == set(EXPECTEDS.filtered_exports)
+        for f in EXPECTEDS.filtered_exports:
+            example = CSVS[f.replace(".csv", "")]
+            _assert_frames_equal_ignore_row_order(
+                pd.read_csv(join(export_dir, f), dtype=CSV_DTYPES),
+                example.after if example.diff is None else example.diff,
+                index_cols=["geo_id"]
+            )
+
+class TestArchiveDiffer(ArchiveDifferTestlike):
 
     def test_stubs(self):
         arch_diff = ArchiveDiffer("cache", "export")
@@ -152,30 +224,7 @@ def test_stubs(self):
         arch_diff.archive_exports(None)
 
     def test_diff_and_filter_exports(self, tmp_path):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
-
-        expected_csv1_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["2", "3", "4"],
-            "val": [2.1, np.nan, 4.0],
-            "se": [0.21, np.nan, np.nan],
-            "sample_size": [21.0, np.nan, 40.0],
-            "missing_val": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-            "missing_se": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-            "missing_sample_size": [Nans.NOT_MISSING, Nans.DELETED, Nans.NOT_MISSING],
-        }), dtypes=CSV_DTYPES)
-        expected_csv4_diff = _set_df_datatypes(CSVS_AFTER["csv4"], dtypes=CSV_DTYPES)
-        expected_csv5_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["2", "3", "4"],
-            "val": [2.1, np.nan, 4.0],
-            "se": [0.21, np.nan, np.nan],
-            "sample_size": [21.0, np.nan, 40.0],
-            "missing_val": [np.nan, Nans.DELETED, np.nan],
-            "missing_se": [np.nan, Nans.DELETED, np.nan],
-            "missing_sample_size": [np.nan, Nans.DELETED, np.nan],
-        }), dtypes=CSV_DTYPES)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         arch_diff = ArchiveDiffer(cache_dir, export_dir)
 
@@ -187,47 +236,38 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.diff_exports()
 
         # Simulate cache updated, and signal ran finish
-        for csv_name, df in CSVS_BEFORE.items():
-            df.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            if dfs.before is not None:
+                dfs.before.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
         arch_diff._cache_updated = True
 
         deleted_files, common_diffs, new_files = arch_diff.diff_exports()
 
-        # Check return values
-        assert set(deleted_files) == {join(cache_dir, "csv2.csv")}
+        # Check deleted, common, diffed, and new file names match expected
+        assert set(deleted_files) == {join(cache_dir, f) for f in EXPECTEDS.deleted}
         assert set(common_diffs.keys()) == {
-            join(export_dir, f) for f in ["csv0.csv", "csv1.csv", "csv4.csv", "csv5.csv"]}
-        assert set(new_files) == {join(export_dir, "csv3.csv")}
-        assert common_diffs[join(export_dir, "csv0.csv")] is None
-        assert common_diffs[join(export_dir, "csv1.csv")] == join(
-            export_dir, "csv1.csv.diff")
+            join(export_dir, csv_name) for csv_name in EXPECTEDS.common_diffs}
+        assert set(new_files) == {join(export_dir, f) for f in EXPECTEDS.new}
+        # Check that diff file names match the expected names
+        assert all(
+            common_diffs[join(export_dir, csv_name)] ==
+            (None if diff_name is None else join(export_dir, diff_name))
+            for csv_name, diff_name in EXPECTEDS.common_diffs.items()
+        )
 
         # Check filesystem for actual files
-        assert set(listdir(export_dir)) == {
-            "csv0.csv",
-            "csv1.csv", "csv1.csv.diff",
-            "csv3.csv",
-            "csv4.csv", "csv4.csv.diff",
-            "csv5.csv", "csv5.csv.diff",
-        }
-
-        # Check that the files look as expected
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv1_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv4.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv4_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv5.csv.diff"), dtype=CSV_DTYPES),
-            expected_csv5_diff,
-            index_cols=["geo_id"]
-        )
+        assert set(listdir(export_dir)) == set(EXPECTEDS.diffed_exports)
+
+        # Check that the diff files look as expected
+        for key, diff_name in EXPECTEDS.common_diffs.items():
+            if diff_name is None: continue
+            _assert_frames_equal_ignore_row_order(
+                pd.read_csv(join(export_dir, diff_name), dtype=CSV_DTYPES),
+                CSVS[key.replace(".csv", "")].diff,
+                index_cols=["geo_id"]
+            )
 
         # Test filter_exports
 
@@ -243,49 +283,24 @@ def test_diff_and_filter_exports(self, tmp_path):
         arch_diff.filter_exports(common_diffs)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            expected_csv1_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv4.csv"), dtype=CSV_DTYPES),
-            expected_csv4_diff,
-            index_cols=["geo_id"]
-        )
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv5.csv"), dtype=CSV_DTYPES),
-            expected_csv5_diff,
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
 AWS_CREDENTIALS = {
     "aws_access_key_id": "FAKE_TEST_ACCESS_KEY_ID",
     "aws_secret_access_key": "FAKE_TEST_SECRET_ACCESS_KEY",
 }
 
-
-@pytest.fixture(scope="function")
-def s3_client():
-    with mock_s3():
-        yield Session(**AWS_CREDENTIALS).client("s3")
-
-
-class TestS3ArchiveDiffer:
+class TestS3ArchiveDiffer(ArchiveDifferTestlike):
 
     bucket_name = "test-bucket"
     indicator_prefix = "test"
 
     @mock_s3
-    def test_update_cache(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_update_cache(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        csv1 = CSVS_BEFORE["csv1"]
-        csv2 = CSVS_AFTER["csv1"]
+        csv1 = CSVS["mod_2_del_3_add_4"].before
+        csv2 = CSVS["mod_2_del_3_add_4"].after
        csv1_buf = StringIO()
         csv2_buf = StringIO()
         csv1.to_csv(csv1_buf, index=False)
@@ -316,13 +331,11 @@ def test_update_cache(self, tmp_path):
         assert set(listdir(cache_dir)) == {"csv1.csv", "csv2.csv"}
 
     @mock_s3
-    def test_archive_exports(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_archive_exports(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        csv1 = _set_df_datatypes(CSVS_BEFORE["csv1"], dtypes=CSV_DTYPES)
+        csv1 = CSVS["mod_2_del_3_add_4"].before
         csv1.to_csv(join(export_dir, "csv1.csv"), index=False)
 
         s3_client.create_bucket(Bucket=self.bucket_name)
@@ -347,25 +360,23 @@ def test_archive_exports(self, tmp_path):
         assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), csv1)
 
     @mock_s3
-    def test_run(self, tmp_path, s3_client):
-        cache_dir = join(str(tmp_path), "cache")
-        export_dir = join(str(tmp_path), "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+    def test_run(self, tmp_path):
+        s3_client = Session(**AWS_CREDENTIALS).client("s3")
+        cache_dir, export_dir = self.set_up(tmp_path)
 
-        # Set up current buckets to be `CSVS_BEFORE`.
         s3_client.create_bucket(Bucket=self.bucket_name)
-        for csv_name, df in CSVS_BEFORE.items():
-            csv_buf = StringIO()
-            df.to_csv(csv_buf, index=False)
-            s3_client.put_object(
-                Bucket=self.bucket_name,
-                Key=f"{self.indicator_prefix}/{csv_name}.csv",
-                Body=BytesIO(csv_buf.getvalue().encode()))
-
-        # Set up the exported files to be `CSVS_AFTER`.
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            # Set up current buckets to be the 'before' files.
+            if dfs.before is not None:
+                csv_buf = StringIO()
+                dfs.before.to_csv(csv_buf, index=False)
+                s3_client.put_object(
+                    Bucket=self.bucket_name,
+                    Key=f"{self.indicator_prefix}/{csv_name}.csv",
+                    Body=BytesIO(csv_buf.getvalue().encode()))
+            # Set up the exported files to be the 'after' files.
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
 
         # Create and run differ.
         arch_diff = S3ArchiveDiffer(
@@ -375,35 +386,19 @@ def test_run(self, tmp_path):
         arch_diff.run()
 
         # Check that the buckets now contain the exported files.
-        for csv_name, df in CSVS_AFTER.items():
+        for csv_name, dfs in CSVS.items():
+            if dfs.after is None:
+                continue
             body = s3_client.get_object(Bucket=self.bucket_name, Key=f"{self.indicator_prefix}/{csv_name}.csv")["Body"]
-            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), _set_df_datatypes(df, dtypes=CSV_DTYPES))
+            assert_frame_equal(pd.read_csv(body, dtype=CSV_DTYPES), dfs.after)
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        csv1_diff = _set_df_datatypes(pd.DataFrame({
-            "geo_id": ["3", "2", "4"],
-            "val": [np.nan, 2.1, 4.0],
-            "se": [np.nan, 0.21, np.nan],
-            "sample_size": [np.nan, 21.0, 40.0],
-            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-        }), dtypes=CSV_DTYPES)
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            csv1_diff,
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
-class TestGitArchiveDiffer:
+class TestGitArchiveDiffer(ArchiveDifferTestlike):
 
     def test_init_args(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         with pytest.raises(AssertionError):
             GitArchiveDiffer(cache_dir, export_dir,
@@ -419,10 +414,7 @@ def test_init_args(self, tmp_path):
         assert arch_diff.branch == arch_diff.repo.active_branch
 
     def test_update_cache(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         Repo.init(cache_dir)
 
@@ -441,10 +433,7 @@ def test_update_cache(self, tmp_path):
         assert arch_diff2._cache_updated
 
     def test_diff_exports(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         branch_name = "test-branch"
 
@@ -501,10 +490,7 @@ def test_diff_exports(self, tmp_path):
         assert set(new_files) == set()
 
     def test_archive_exports(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         repo = Repo.init(cache_dir)
         repo.index.commit(message="Initial commit")
 
@@ -561,10 +547,7 @@ def test_archive_exports(self, tmp_path):
         assert repo.active_branch.set_commit("HEAD~1").commit == orig_commit
 
     def test_run(self, tmp_path):
-        cache_dir = str(tmp_path / "cache")
-        export_dir = str(tmp_path / "export")
-        mkdir(cache_dir)
-        mkdir(export_dir)
+        cache_dir, export_dir = self.set_up(tmp_path)
 
         branch_name = "test-branch"
 
@@ -572,13 +555,13 @@ def test_run(self, tmp_path):
         repo.index.commit(message="Initial commit")
 
         original_branch = repo.active_branch
-        # Set up the current cache to contain `CSVS_BEFORE`.
-        for csv_name, df in CSVS_BEFORE.items():
-            df.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
-
-        # Set up the current cache to contain `CSVS_AFTER`.
-        for csv_name, df in CSVS_AFTER.items():
-            df.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
+        for csv_name, dfs in CSVS.items():
+            # Set up the current cache to contain 'before' files
+            if dfs.before is not None:
+                dfs.before.to_csv(join(cache_dir, f"{csv_name}.csv"), index=False)
+            # Set up the current export to contain 'after' files
+            if dfs.after is not None:
+                dfs.after.to_csv(join(export_dir, f"{csv_name}.csv"), index=False)
 
         # Create and run differ.
         arch_diff = GitArchiveDiffer(
@@ -586,29 +569,15 @@ def test_run(self, tmp_path):
             branch_name=branch_name, override_dirty=True)
         arch_diff.run()
 
-        # Check that the archive branch contains `CSVS_AFTER`.
+        # Check that the archive branch contains 'after' files.
         arch_diff.get_branch(branch_name).checkout()
-        for csv_name, df in CSVS_AFTER.items():
-            assert_frame_equal(pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), _set_df_datatypes(df, dtypes=CSV_DTYPES))
+        for csv_name, dfs in CSVS.items():
+            if dfs.after is None: continue
+            assert_frame_equal(pd.read_csv(join(cache_dir, f"{csv_name}.csv"), dtype=CSV_DTYPES), dfs.after)
 
         original_branch.checkout()
 
         # Check exports directory just has incremental changes
-        assert set(listdir(export_dir)) == {"csv1.csv", "csv3.csv", "csv4.csv", "csv5.csv"}
-        csv1_diff = pd.DataFrame({
-            "geo_id": ["3", "2", "4"],
-            "val": [np.nan, 2.1, 4.0],
-            "se": [np.nan, 0.21, np.nan],
-            "sample_size": [np.nan, 21.0, 40.0],
-            "missing_val": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_se": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-            "missing_sample_size": [Nans.DELETED] + [Nans.NOT_MISSING] * 2,
-        })
-        _assert_frames_equal_ignore_row_order(
-            pd.read_csv(join(export_dir, "csv1.csv"), dtype=CSV_DTYPES),
-            _set_df_datatypes(csv1_diff, dtypes=CSV_DTYPES),
-            index_cols=["geo_id"]
-        )
-
+        self.check_filtered_exports(export_dir)
 
 class TestFromParams:
     """Tests for creating archive differs from params."""
diff --git a/facebook/delphiFacebook/R/binary.R b/facebook/delphiFacebook/R/binary.R
index 08305d375..fddc68537 100644
--- a/facebook/delphiFacebook/R/binary.R
+++ b/facebook/delphiFacebook/R/binary.R
@@ -135,6 +135,26 @@ get_binary_indicators <- function() {
     "smoothed_try_vaccinate_1m", "weight_unif", "v_try_vaccinate_1m", 6, compute_binary_response, jeffreys_binary,
     "smoothed_wtry_vaccinate_1m", "weight", "v_try_vaccinate_1m", 6, compute_binary_response, jeffreys_binary,
 
+    "smoothed_winitial_dose_one_of_one", "weight", "v_initial_dose_one_of_one", 6, compute_binary_response, jeffreys_multinomial_factory(4),
"smoothed_winitial_dose_one_of_two", "weight", "v_initial_dose_one_of_two", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_winitial_dose_two_of_two", "weight", "v_initial_dose_two_of_two", 6, compute_binary_response, jeffreys_multinomial_factory(4), + + "smoothed_wvaccinated_one_booster", "weight", "v_vaccinated_one_booster", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_two_or_more_boosters", "weight", "v_vaccinated_two_or_more_boosters", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_no_booster", "weight", "v_vaccinated_no_booster", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_at_least_one_booster", "weight", "v_vaccinated_at_least_one_booster", 6, compute_binary_response, jeffreys_binary, + + "smoothed_wvaccinated_booster_accept", "weight", "v_vaccinated_booster_accept", 6, compute_binary_response, jeffreys_binary, + "smoothed_wvaccinated_booster_hesitant", "weight", "v_vaccinated_booster_hesitant", 6, compute_binary_response, jeffreys_binary, + + "smoothed_wvaccinated_booster_defyes", "weight", "v_vaccinated_booster_defyes", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_probyes", "weight", "v_vaccinated_booster_probyes", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_probno", "weight", "v_vaccinated_booster_probno", 6, compute_binary_response, jeffreys_multinomial_factory(4), + "smoothed_wvaccinated_booster_defno", "weight", "v_vaccinated_booster_defno", 6, compute_binary_response, jeffreys_multinomial_factory(4), + + "smoothed_wflu_vaccinated_2021", "weight", "v_flu_vaccinated_2021", 6, compute_binary_response, jeffreys_binary, + + # who would make more likely to accept vaccine "smoothed_vaccine_likely_friends", "weight_unif", "v_vaccine_likely_friends", 6, compute_binary_response, jeffreys_binary, "smoothed_wvaccine_likely_friends", "weight", "v_vaccine_likely_friends", 6, compute_binary_response, jeffreys_binary, diff --git a/facebook/delphiFacebook/R/contingency_indicators.R b/facebook/delphiFacebook/R/contingency_indicators.R index 426b362d0..205b06588 100644 --- a/facebook/delphiFacebook/R/contingency_indicators.R +++ b/facebook/delphiFacebook/R/contingency_indicators.R @@ -150,6 +150,26 @@ get_aggs <- function() { "pct_accept_vaccine_no_appointment_probno", "v_accept_vaccine_no_appointment_probno", compute_binary, jeffreys_multinomial_factory(4), "pct_accept_vaccine_no_appointment_defno", "v_accept_vaccine_no_appointment_defno", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_one_of_one", "v_initial_dose_one_of_one", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_one_of_two", "v_initial_dose_one_of_two", compute_binary, jeffreys_multinomial_factory(4), + "pct_initial_dose_two_of_two", "v_initial_dose_two_of_two", compute_binary, jeffreys_multinomial_factory(4), + + "pct_vaccinated_one_booster", "v_vaccinated_one_booster", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_two_or_more_boosters", "v_vaccinated_two_or_more_boosters", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_no_booster", "v_vaccinated_no_booster", compute_binary, jeffreys_multinomial_factory(4), + "pct_vaccinated_at_least_one_booster", "v_vaccinated_at_least_one_booster", compute_binary, jeffreys_binary, + + "pct_vaccinated_booster_accept", "v_vaccinated_booster_accept", compute_binary, 
+    "pct_vaccinated_booster_hesitant", "v_vaccinated_booster_hesitant", compute_binary, jeffreys_binary,
+
+    "pct_vaccinated_booster_defyes", "v_vaccinated_booster_defyes", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_probyes", "v_vaccinated_booster_probyes", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_probno", "v_vaccinated_booster_probno", compute_binary, jeffreys_multinomial_factory(4),
+    "pct_vaccinated_booster_defno", "v_vaccinated_booster_defno", compute_binary, jeffreys_multinomial_factory(4),
+
+    "pct_flu_vaccinated_2021", "v_flu_vaccinated_2021", compute_binary, jeffreys_binary,
+
     # vaccine timing
     "pct_vaccine_timing_weeks", "v_vaccine_timing_weeks", compute_binary, jeffreys_multinomial_factory(7),
     "pct_vaccine_timing_onemonth", "v_vaccine_timing_onemonth", compute_binary, jeffreys_multinomial_factory(7),
diff --git a/facebook/delphiFacebook/R/variables.R b/facebook/delphiFacebook/R/variables.R
index b4461776d..bfef39030 100644
--- a/facebook/delphiFacebook/R/variables.R
+++ b/facebook/delphiFacebook/R/variables.R
@@ -859,6 +859,50 @@ code_vaccines <- function(input_data, wave) {
     input_data$v_covid_vaccinated_friends <- NA
   }
 
+  if ("V2d" %in% names(input_data)) {
+    input_data$v_initial_dose_one_of_one <- input_data$V2d == 1
+    input_data$v_initial_dose_one_of_two <- input_data$V2d == 2
+    input_data$v_initial_dose_two_of_two <- input_data$V2d == 3
+  } else {
+    input_data$v_initial_dose_one_of_one <- NA
+    input_data$v_initial_dose_one_of_two <- NA
+    input_data$v_initial_dose_two_of_two <- NA
+  }
+
+  if ("V2b" %in% names(input_data)) {
+    input_data$v_vaccinated_one_booster <- input_data$V2b == 1
+    input_data$v_vaccinated_two_or_more_boosters <- input_data$V2b == 2
+    input_data$v_vaccinated_at_least_one_booster <- input_data$V2b == 1 | input_data$V2b == 2
+    input_data$v_vaccinated_no_booster <- input_data$V2b == 3
+  } else {
+    input_data$v_vaccinated_one_booster <- NA
+    input_data$v_vaccinated_two_or_more_boosters <- NA
+    input_data$v_vaccinated_at_least_one_booster <- NA
+    input_data$v_vaccinated_no_booster <- NA
+  }
+
+  if ("V2c" %in% names(input_data)) {
+    input_data$v_vaccinated_booster_accept <- input_data$V2c == 1 | input_data$V2c == 2
+    input_data$v_vaccinated_booster_hesitant <- input_data$V2c == 3 | input_data$V2c == 4
+    input_data$v_vaccinated_booster_defyes <- input_data$V2c == 1
+    input_data$v_vaccinated_booster_probyes <- input_data$V2c == 2
+    input_data$v_vaccinated_booster_probno <- input_data$V2c == 3
+    input_data$v_vaccinated_booster_defno <- input_data$V2c == 4
+  } else {
+    input_data$v_vaccinated_booster_accept <- NA
+    input_data$v_vaccinated_booster_hesitant <- NA
+    input_data$v_vaccinated_booster_defyes <- NA
+    input_data$v_vaccinated_booster_probyes <- NA
+    input_data$v_vaccinated_booster_probno <- NA
+    input_data$v_vaccinated_booster_defno <- NA
+  }
+
+  if ("C17b" %in% names(input_data)) {
+    input_data$v_flu_vaccinated_2021 <- input_data$C17b == 1
+  } else {
+    input_data$v_flu_vaccinated_2021 <- NA
+  }
+
   return(input_data)
 }
diff --git a/facebook/qsf-tools/generate-codebook.R b/facebook/qsf-tools/generate-codebook.R
index e61a9d05d..05453cadd 100644
--- a/facebook/qsf-tools/generate-codebook.R
+++ b/facebook/qsf-tools/generate-codebook.R
@@ -76,9 +76,7 @@ process_qsf <- function(path_to_qsf,
 
   # get the text of the question:
   questions <- displayed_questions %>%
-    map_chr(~ .x$Payload$QuestionText) %>%
-    str_remove_all("<[^<>]+>") %>%
-    str_replace_all("&nbsp;", " ")
+    map_chr(~ .x$Payload$QuestionText)
 
   # get the type of question:
   type_map <- c(MC = "Multiple choice", TE = "Text", Matrix = "Matrix")
diff --git a/facebook/qsf-tools/qsf-differ.R b/facebook/qsf-tools/qsf-differ.R
index 4ac05c937..33ad22c03 100644
--- a/facebook/qsf-tools/qsf-differ.R
+++ b/facebook/qsf-tools/qsf-differ.R
@@ -14,6 +14,8 @@ options(warn = 1)
 suppressPackageStartupMessages({
   library(jsonlite)
   library(stringr)
+  library(dplyr)
+  library(readr)
   source("qsf-utils.R")
 })
 
@@ -25,7 +27,18 @@ diff_qsf_files <- function(old_qsf_path, new_qsf_path) {
   old_qsf <- get_qsf_file(old_qsf_path)
   new_qsf <- get_qsf_file(new_qsf_path)
 
-  diff_surveys(old_qsf, new_qsf)
+  old_wave <- get_wave(old_qsf_path)
+  new_wave <- get_wave(new_qsf_path)
+
+  out <- diff_surveys(old_qsf, new_qsf) %>%
+    mutate(
+      old_wave = old_wave, new_wave = new_wave
+    ) %>%
+    select(new_wave, old_wave, everything())
+  write_csv(
+    out,
+    paste0("diff_", old_wave, "-", new_wave, ".csv", collapse="")
+  )
 
   return(NULL)
 }
@@ -126,19 +139,22 @@ diff_surveys <- function(old_qsf, new_qsf) {
   added <- setdiff(new_shown_items, old_shown_items)
   removed <- setdiff(old_shown_items, new_shown_items)
 
-  print_questions(added, "Added", new_questions)
-  print_questions(removed, "Removed", old_questions)
+  added_df <- create_diff_df(added, "Added", new_questions)
+  removed_df <- create_diff_df(removed, "Removed", old_questions)
 
   ## For questions that appear in both surveys, check for changes in wording,
   ## display logic, and answer options.
   shared <- intersect(old_shown_items, new_shown_items)
 
-  diff_question(shared, "QuestionText", old_questions, new_questions)
-  diff_question(shared, "DisplayLogic", old_questions, new_questions)
-  diff_question(shared, "Choices", old_questions, new_questions)
-  diff_question(shared, "Subquestions", old_questions, new_questions)
+  text_df <- diff_question(shared, "QuestionText", old_questions, new_questions)
+  logic_df <- diff_question(shared, "DisplayLogic", old_questions, new_questions)
+  choice_df <- diff_question(shared, "Choices", old_questions, new_questions)
+  subq_df <- diff_question(shared, "Subquestions", old_questions, new_questions)
 
-  return(NULL)
+  out <- bind_rows(
+    added_df, removed_df, text_df, logic_df, choice_df, subq_df
+  )
+  return(out)
 }
 
 #' Compare a single question field in the two surveys.
@@ -159,9 +175,9 @@ diff_question <- function(names, change_type=c("Choices", "QuestionText", "Displ
       changed <- append(changed, question)
     }
   }
-  print_questions(changed, change_type, new_qsf)
+  out <- create_diff_df(changed, change_type, new_qsf)
 
-  return(NULL)
+  return(out)
 }
 
-#' Print results with custom message for each possible change type.
+#' Build a data frame of changes, with a description for each possible change type.
 #'
@@ -172,26 +188,27 @@ diff_question <- function(names, change_type=c("Choices", "QuestionText", "Displ
 #' @param reference_qsf named list of trimmed output from `get_qsf_file` for survey that
 #'   contains descriptive info about a particular type of change. For "removed"
 #'   questions, should be older survey, else newer survey.
-print_questions <- function(questions, change_type=c("Added", "Removed", "Choices", "QuestionText", "DisplayLogic", "Subquestions"), reference_qsf) {
+create_diff_df <- function(questions, change_type=c("Added", "Removed", "Choices", "QuestionText", "DisplayLogic", "Subquestions"), reference_qsf) {
+  out <- data.frame()
+
   if ( length(questions) > 0 ) {
     change_type <- match.arg(change_type)
 
-    text_options <- list(
-      Added = "Added: item %s (%s)\n",
-      Removed = "Removed: item %s (%s)\n",
-      QuestionText = "Question wording changed: item %s (%s)\n",
-      DisplayLogic = "Display logic changed: item %s (%s)\n",
-      Choices = "Answer choices changed: item %s (%s)\n",
-      Subquestions = "Matrix subquestions changed: item %s (%s)\n"
-    )
-
+    change_descriptions <- list(
+      Added = "Item added",
+      Removed = "Item removed",
+      QuestionText = "Question wording changed",
+      DisplayLogic = "Display logic changed",
+      Choices = "Answer choices changed",
+      Subquestions = "Matrix subquestions changed"
+    )
     questions <- sort(questions)
     qids <- sapply(questions, function(question) {
       reference_qsf[[question]]$QuestionID
     })
 
-    cat("\n  ")
-    cat(sprintf(text_options[[change_type]], questions, qids))
+    out <- data.frame(change_type=change_descriptions[[change_type]], item=questions, qid=qids)
   }
-  return(NULL)
+
+  return(out)
 }