
Commit fbed79b

Merge branch 'nancodes' into nans_quidel
2 parents 6213713 + 801f04c

109 files changed (+15847 / -796 lines)

.github/workflows/python-ci.yml

Lines changed: 0 additions & 1 deletion

@@ -34,7 +34,6 @@ jobs:
         run: |
           make install
       - name: Lint
-        if: ${{ matrix.packages != 'doctor_visits' }}
         run: |
           make lint
       - name: Test

_delphi_utils_python/Makefile

Lines changed: 3 additions & 5 deletions

@@ -1,7 +1,5 @@
 .PHONY = venv, lint, test, clean
 
-dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*')
-
 venv:
 	python3.8 -m venv env
 
@@ -11,12 +9,12 @@ install: venv
 	pip install -e .
 
 lint:
-	. env/bin/activate; pylint $(dir)
-	. env/bin/activate; pydocstyle $(dir)
+	. env/bin/activate; pylint delphi_utils
+	. env/bin/activate; pydocstyle delphi_utils
 
 test:
 	. env/bin/activate ;\
-	 (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
+	 (cd tests && ../env/bin/pytest --cov=delphi_utils --cov-report=term-missing)
 
 clean:
 	rm -rf env

_delphi_utils_python/delphi_utils/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -12,6 +12,6 @@
 from .geomap import GeoMapper
 from .smooth import Smoother
 from .signal import add_prefix
-from .nancodes import NAN_CODES
+from .nancodes import Nans
 
 __version__ = "0.1.0"

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 87 additions & 89 deletions

@@ -141,85 +141,77 @@ def _load_crosswalk(self, from_code, to_code):
         assert from_code in self.crosswalk_filepaths, \
             f"No crosswalk files for {from_code}; try {'; '.join(self.crosswalk_filepaths.keys())}"
         assert to_code in self.crosswalk_filepaths[from_code], \
-            f"No crosswalk file from {from_code} to {to_code}; try" \
+            f"No crosswalk file from {from_code} to {to_code}; try " \
             f"{'; '.join(self.crosswalk_filepaths[from_code].keys())}"
+
+        if self.crosswalks[from_code][to_code] is None:
+            self.crosswalks[from_code][to_code] = self._load_crosswalk_from_file(from_code, to_code)
+        return self.crosswalks[from_code][to_code]
+
+    def _load_crosswalk_from_file(self, from_code, to_code):
         stream = pkg_resources.resource_stream(
             __name__, self.crosswalk_filepaths[from_code][to_code]
         )
-        if self.crosswalks[from_code][to_code] is None:
-            # Weighted crosswalks
-            if (from_code, to_code) in [
-                ("zip", "fips"),
-                ("fips", "zip"),
-                ("jhu_uid", "fips"),
-                ("zip", "msa"),
-                ("fips", "hrr"),
-                ("zip", "hhs")
-            ]:
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        from_code: str,
-                        to_code: str,
-                        "weight": float,
-                    },
-                )
-            # Unweighted crosswalks
-            elif (from_code, to_code) in [
-                ("zip", "hrr"),
-                ("fips", "msa"),
-                ("fips", "hhs"),
-                ("state_code", "hhs")
-            ]:
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={from_code: str, to_code: str},
-                )
-            # Special table of state codes, state IDs, and state names
-            elif (from_code, to_code) == ("state", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            elif (from_code, to_code) == ("zip", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "zip": str,
-                        "weight": float,
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            elif (from_code, to_code) == ("fips", "state"):
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        "fips": str,
-                        "state_code": str,
-                        "state_id": str,
-                        "state_name": str,
-                    },
-                )
-            # Population tables
-            elif to_code == "pop":
-                self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream,
-                    dtype={
-                        from_code: str,
-                        "pop": int,
-                    },
-                    usecols=[
-                        from_code,
-                        "pop"
-                    ]
-                )
-        return self.crosswalks[from_code][to_code]
+        usecols = None
+        dtype = None
+        # Weighted crosswalks
+        if (from_code, to_code) in [
+            ("zip", "fips"),
+            ("fips", "zip"),
+            ("jhu_uid", "fips"),
+            ("zip", "msa"),
+            ("fips", "hrr"),
+            ("zip", "hhs")
+        ]:
+            dtype = {
+                from_code: str,
+                to_code: str,
+                "weight": float,
+            }
+
+        # Unweighted crosswalks
+        elif (from_code, to_code) in [
+            ("zip", "hrr"),
+            ("fips", "msa"),
+            ("fips", "hhs"),
+            ("state_code", "hhs")
+        ]:
+            dtype = {from_code: str, to_code: str}
+
+        # Special table of state codes, state IDs, and state names
+        elif (from_code, to_code) == ("state", "state"):
+            dtype = {
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+        elif (from_code, to_code) == ("zip", "state"):
+            dtype = {
+                "zip": str,
+                "weight": float,
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+        elif (from_code, to_code) == ("fips", "state"):
+            dtype = {
+                "fips": str,
+                "state_code": str,
+                "state_id": str,
+                "state_name": str,
+            }
+
+        # Population tables
+        elif to_code == "pop":
+            dtype = {
+                from_code: str,
+                "pop": int,
+            }
+            usecols = [
+                from_code,
+                "pop"
+            ]
+        return pd.read_csv(stream, dtype=dtype, usecols=usecols)
 
     @staticmethod
     def convert_fips_to_mega(data, fips_col="fips", mega_col="megafips"):

@@ -333,19 +325,8 @@ def add_geocode(
         else:
             df[from_col] = df[from_col].astype(str)
 
-        # Assuming that the passed-in records are all United States data, at the moment
-        if (from_code, new_code) in [("fips", "nation"),  # pylint: disable=no-else-return
-                                     ("zip", "nation"),
-                                     ("state_code", "nation"),
-                                     ("state_name", "nation"),
-                                     ("state_id", "nation")]:
-            df[new_col] = df[from_col].apply(lambda x: "us")
-            return df
-        elif new_code == "nation":
-            raise ValueError(
-                f"Conversion to the nation level is not supported "
-                f"from {from_code}; try fips, zip, or state_*"
-            )
+        if new_code == "nation":
+            return self._add_nation_geocode(df, from_code, from_col, new_col)
 
         # state codes are all stored in one table
         if from_code in state_codes and new_code in state_codes:

@@ -375,11 +356,28 @@ def add_geocode(
             df.drop(columns=state_codes, inplace=True)
         elif new_code in state_codes and from_code in state_codes:
             state_codes.remove(new_code)
-            state_codes.remove(from_code)
+            if from_code in state_codes:
+                state_codes.remove(from_code)
             df.drop(columns=state_codes, inplace=True)
 
         return df
 
+    def _add_nation_geocode(self, df, from_code, from_col, new_col):
+        """Add a nation geocode column to a dataframe.
+
+        See `add_geocode()` documentation for argument description.
+        """
+        valid_from_codes = ["fips", "zip", "state_code", "state_name", "state_id"]
+        # Assuming that the passed-in records are all United States data, at the moment
+        if from_code in valid_from_codes:
+            df[new_col] = df[from_col].apply(lambda x: "us")
+            return df
+
+        raise ValueError(
+            f"Conversion to the nation level is not supported "
+            f"from {from_code}; try {valid_from_codes}"
+        )
+
     def replace_geocode(
         self,
         df,
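For orientation, the nation-level conversion that this commit moves into `_add_nation_geocode()` is still reached through the public `add_geocode()` method. A minimal sketch of that call, assuming the usual `GeoMapper` defaults; the sample FIPS codes and column names below are illustrative, not taken from this commit:

```python
import pandas as pd
from delphi_utils import GeoMapper

# Illustrative county-level frame; "fips" and "val" are made-up example columns.
df = pd.DataFrame({"fips": ["01001", "06037"], "val": [10, 20]})

gmpr = GeoMapper()
# Adds a "nation" column filled with "us"; with this commit, an unsupported
# from_code now raises ValueError from the new _add_nation_geocode() helper.
df_nation = gmpr.add_geocode(df, "fips", "nation")
print(df_nation)
```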
_delphi_utils_python/delphi_utils/nancodes.py

Lines changed: 12 additions & 7 deletions

@@ -1,8 +1,13 @@
+"""Provides unified not-a-number codes for the indicators."""
 
-NAN_CODES = {
-    "Not Missing": 0,
-    "Not Applicable": 1,
-    "Region Exception": 2,
-    "Data Insufficient": 3,
-    "Unknown": 4
-}
+from enum import IntEnum
+
+class Nans(IntEnum):
+    """An enum of not-a-number codes for the indicators."""
+
+    NOT_MISSING = 0
+    NOT_APPLICABLE = 1
+    REGION_EXCEPTION = 2
+    DATA_INSUFFICIENT = 3
+    PRIVACY = 4
+    UNKNOWN = 5
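The dict of string keys becomes an `IntEnum`, so indicators can reference codes by name and still serialize them as integers. A small hedged sketch of how an indicator might tag missing values; the `missing_val` column name is illustrative, not defined by this commit:

```python
import pandas as pd
from delphi_utils import Nans

# Hypothetical indicator output; "missing_val" is an illustrative column name.
df = pd.DataFrame({"geo_id": ["01000", "02000"], "val": [1.5, None]})

# Record why a value is (or is not) missing, using the shared codes.
df["missing_val"] = df["val"].apply(
    lambda v: Nans.NOT_MISSING if pd.notna(v) else Nans.UNKNOWN
)
print(df)
print(int(Nans.PRIVACY))  # IntEnum members compare and serialize as plain ints
```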
Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+"""Indicator running utilities."""
+from typing import Any, Callable, Dict, Optional
+from .archive import ArchiveDiffer
+from .utils import read_params
+from .validator.validate import Validator
+
+Params = Dict[str, Any]
+
+# Trivial function to use as default value for validator and archive functions.
+NULL_FN = lambda x: None
+
+def run_indicator_pipeline(indicator_fn: Callable[[Params], None],
+                           validator_fn: Callable[[Params], Optional[Validator]] = NULL_FN,
+                           archiver_fn: Callable[[Params], Optional[ArchiveDiffer]] = NULL_FN):
+    """Run an indicator with its optional validation and archiving.
+
+    Arguments
+    ---------
+    indicator_fn: Callable[[Params], None]
+        function that takes a dictionary of parameters and produces indicator output
+    validator_fn: Callable[[Params], Optional[Validator]]
+        function that takes a dictionary of parameters and produces the associated Validator or
+        None if no validation should be performed.
+    archiver_fn: Callable[[Params], Optional[ArchiveDiffer]]
+        function that takes a dictionary of parameters and produces the associated ArchiveDiffer or
+        None if no archiving should be performed.
+    """
+    params = read_params()
+    indicator_fn(params)
+    validator = validator_fn(params)
+    archiver = archiver_fn(params)
+    if validator:
+        validation_report = validator.validate()
+    if archiver and (not validator or validation_report.success()):
+        archiver.archive()

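This new module chains an indicator's main function with optional validation and archiving, skipping the archive step when validation fails. A hedged usage sketch; the import path is an assumption, since the new file's name is hidden in this view, and `read_params()` expects a `params.json` in the working directory:

```python
from delphi_utils.runner import run_indicator_pipeline  # assumed module path

def run_module(params):
    """Hypothetical indicator entry point: read params, write CSV output."""
    print("producing indicator output with", params)

# Run the indicator alone; validation and archiving default to no-ops (NULL_FN).
run_indicator_pipeline(run_module)
```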
_delphi_utils_python/delphi_utils/smooth.py

Lines changed: 28 additions & 26 deletions

@@ -192,14 +192,7 @@ def smooth(
         signal = self.impute(signal, impute_order=impute_order)
 
         # Smooth
-        if self.smoother_name == "savgol":
-            signal_smoothed = self.savgol_smoother(signal)
-        elif self.smoother_name == "left_gauss_linear":
-            signal_smoothed = self.left_gauss_linear_smoother(signal)
-        elif self.smoother_name == "moving_average":
-            signal_smoothed = self.moving_average_smoother(signal)
-        elif self.smoother_name == "identity":
-            signal_smoothed = signal
+        signal_smoothed = self._select_smoother()(signal)
 
         # Append the nans back, since we want to preserve length
         signal_smoothed = np.hstack([np.nan * np.ones(ix), signal_smoothed])

@@ -209,6 +202,18 @@
             signal_smoothed.index = pandas_index
         return signal_smoothed
 
+    def _select_smoother(self):
+        """Select a smoothing method based on the smoother type."""
+        if self.smoother_name == "savgol":
+            return self.savgol_smoother
+        if self.smoother_name == "left_gauss_linear":
+            return self.left_gauss_linear_smoother
+        if self.smoother_name == "moving_average":
+            return self.moving_average_smoother
+        if self.smoother_name == "identity":
+            return lambda x: x
+        raise ValueError(f"invalid smoother {self.smoother_name}")
+
     def impute(self, signal, impute_order=2):
         """Impute the nan values in the signal.
 
@@ -421,26 +426,23 @@ def savgol_smoother(self, signal):  # pylint: disable=inconsistent-return-statements
         # - shortened_window (default) applies savgol with a smaller window to do the fit
         # - identity keeps the original signal (doesn't smooth)
         # - nan writes nans
-        if self.boundary_method == "shortened_window":  # pylint: disable=no-else-return
-            for ix in range(min(len(self.coeffs), len(signal))):
-                if ix == 0:
-                    signal_smoothed[ix] = signal[ix]
-                else:
-                    # At the very edge, the design matrix is often singular, in which case
-                    # we just fall back to the raw signal
-                    try:
-                        signal_smoothed[ix] = self.savgol_predict(
-                            signal[: ix + 1], self.poly_fit_degree, 0
-                        )
-                    except np.linalg.LinAlgError:  # for small ix, the design matrix is singular
-                        signal_smoothed[ix] = signal[ix]
+        if self.boundary_method == "nan":
             return signal_smoothed
-        elif self.boundary_method == "identity":
-            for ix in range(min(len(self.coeffs), len(signal))):
+
+        # boundary methods "identity" and "shortened window"
+        for ix in range(min(len(self.coeffs), len(signal))):
+            if ix == 0 or self.boundary_method == "identity":
                 signal_smoothed[ix] = signal[ix]
-            return signal_smoothed
-        elif self.boundary_method == "nan":
-            return signal_smoothed
+            else:
+                # At the very edge, the design matrix is often singular, in which case
+                # we just fall back to the raw signal
+                try:
+                    signal_smoothed[ix] = self.savgol_predict(
+                        signal[: ix + 1], self.poly_fit_degree, 0
+                    )
+                except np.linalg.LinAlgError:  # for small ix, the design matrix is singular
+                    signal_smoothed[ix] = signal[ix]
+        return signal_smoothed
 
     def savgol_impute(self, signal, impute_order):
         """Impute the nan values in signal using savgol.

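The smoothing behavior is unchanged by this refactor; only the dispatch moves into `_select_smoother()`. A quick sketch of the public calling convention, assuming the constructor keyword `smoother_name` and a signal longer than the default window; the sample data is illustrative:

```python
import numpy as np
from delphi_utils import Smoother

# Illustrative signal with a gap; Smoother imputes nans before smoothing.
signal = np.sqrt(np.arange(30, dtype=float))
signal[5] = np.nan

# "savgol" is one of the names dispatched by the new _select_smoother();
# unrecognized names now raise ValueError there.
smoother = Smoother(smoother_name="savgol")
print(smoother.smooth(signal))
```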
validator/README.md renamed to _delphi_utils_python/delphi_utils/validator/README.md

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ All of the user-changable parameters are stored in the `validation` field of the
 
 Please update the follow settings:
 
-* `global`: global validation settings
+* `common`: global validation settings
   * `data_source`: should match the [formatting](https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html) as used in COVIDcast API calls
   * `end_date`: specifies the last date to be checked; if set to "latest", `end_date` will always be the current date
   * `span_length`: specifies the number of days before the `end_date` to check. `span_length` should be long enough to contain all recent source data that is still in the process of being updated (i.e. in the backfill period), for example, if the data source of interest has a 2-week lag before all reports are in for a given date, `scan_length` should be 14 days

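Since the README excerpt renames the `global` block to `common`, here is a hedged illustration of the corresponding `validation` parameters, written as a Python dict; the `data_source` value and exact nesting are assumptions based only on the keys listed above:

```python
# Hedged sketch of the "validation" parameters described in the README excerpt.
validation_params = {
    "validation": {
        "common": {
            "data_source": "quidel",   # illustrative source name
            "end_date": "latest",      # "latest" means check up to the current date
            "span_length": 14,         # days before end_date to check
        }
    }
}
```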