Add population functions, zip to state, msa

dshemetov · dshemetov · commit 9e10f004852e · 2020-08-27T13:49:13.000-07:00
diff --git a/_delphi_utils_python/delphi_utils/geomap.py b/_delphi_utils_python/delphi_utils/geomap.py
@@ -64,7 +64,7 @@ class GeoMapper:
 
     The GeoMapper instance loads crosswalk tables from the package data_dir. The
     crosswalk tables are assumed to have been built using the geo_data_proc.py script
-    in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has 
+    in data_proc/geomap. If a mapping between codes is NOT one to many, then the table has
     just two colums. If the mapping IS one to many, then a third column, the weight column,
     exists (e.g. zip, fips, weight; satisfying (sum(weights) where zip==ZIP) == 1).
 
@@ -73,7 +73,7 @@ class GeoMapper:
     - load_* : load a crosswalk table into the instance (e.g. zip to fips).
     - convert_* : add a new column to a dataframe by joining with a crosswalk table
     - *_to_* : replace a geo code column with another, using weighted sum aggregation where
-               necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip 
+               necessary (e.g. (sum(weights*count_column) groupby fips) would convert zip
                level data to fips level data)
     """
 
@@ -111,26 +111,40 @@ def load_crosswalk(self, from_code, to_code):
                 ("fips", "hrr"),
             ]:
                 self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream, dtype={from_code: str, to_code: str, "weight": float,},
+                    stream,
+                    dtype={
+                        from_code: str,
+                        to_code: str,
+                        "weight": float,
+                    },
                 )
             # Unweighted crosswalks
             elif (from_code, to_code) in [
                 ("zip", "hrr"),
                 ("fips", "msa"),
             ]:
                 self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream, dtype={from_code: str, to_code: str},
+                    stream,
+                    dtype={from_code: str, to_code: str},
                 )
             # Special table of state codes, state IDs, and state names
             elif (from_code, to_code) == ("state", "state"):
                 self.crosswalks[from_code][to_code] = pd.read_csv(
                     stream,
-                    dtype={"state_code": str, "state_id": str, "state_name": str,},
+                    dtype={
+                        "state_code": str,
+                        "state_id": str,
+                        "state_name": str,
+                    },
                 )
             # Population tables
             elif (from_code, to_code) in [("fips", "pop"), ("zip", "pop")]:
                 self.crosswalks[from_code][to_code] = pd.read_csv(
-                    stream, dtype={from_code: str, "pop": int,},
+                    stream,
+                    dtype={
+                        from_code: str,
+                        "pop": int,
+                    },
                 )
         return self.crosswalks[from_code][to_code]
 
@@ -200,7 +214,7 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
 
         Parameters
         ---------
-        df: pd.DataFrame 
+        df: pd.DataFrame
             Input dataframe.
         from_code: {'fips', 'zip', 'jhu_uid', 'state_code', 'state_id', 'state_name'}
             Specifies the geocode type of the data in from_col.
@@ -231,7 +245,9 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
         crosswalk = self.load_crosswalk(from_code=from_code, to_code=new_code)
         crosswalk = crosswalk.rename(columns={from_code: from_col, new_code: new_col})
 
-        df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left").dropna(subset=[new_col])
+        df = df.merge(
+            crosswalk, left_on=from_col, right_on=from_col, how="left"
+        ).dropna(subset=[new_col])
 
         # Drop extra state columns
         state_codes = ["state_code", "state_id", "state_name"]
@@ -255,7 +271,7 @@ def convert_to_new_code(
 
         Parameters
         ---------
-        df: pd.DataFrame 
+        df: pd.DataFrame
             Input dataframe.
         from_col: str
             Name of the column in data to match and remove.
@@ -311,7 +327,7 @@ def convert_fips_to_state_code(
 
         Parameters
         ---------
-        data: pd.DataFrame 
+        data: pd.DataFrame
             Input dataframe.
         fips_col: str
             Name of FIPS column to convert in data.
@@ -361,13 +377,13 @@ def convert_fips_to_msa(
 
         Parameters
         ---------
-        data: pd.DataFrame 
+        data: pd.DataFrame
             Input data.
         fips_col: str
             Name of dataframe column containing fips codes.
         date_col: str
             Name of dataframe column containing the dates.
-        count_cols: str 
+        count_cols: str
             Name of dataframe column containing the data. If None (default) all non fips/date are used.
         msa_col: str
             Name of dataframe column to contain the msa codes.
@@ -387,8 +403,7 @@ def convert_fips_to_msa(
         data = data.merge(msa_table, left_on=fips_col, right_on="fips", how="left")
 
         # Megacounty codes are 1, followed by up to 4 leading zeros, and ending with
-        # two digits of the state's FIPS code.
-        # TODO: Does this need to be improved?
+        # two digits of the state's FIPS code.
         if create_mega:
             data_st = data.loc[data[msa_col].isna(), fips_col]
             data.loc[data[msa_col].isna(), msa_col] = "1" + data_st.astype(str).str[
@@ -404,7 +419,7 @@ def convert_fips_to_zip(
 
         Parameters
         ---------
-        data: pd.DataFrame 
+        data: pd.DataFrame
             Input data.
         fips_col: str
             Name of dataframe column containing fips codes.
@@ -424,7 +439,9 @@ def convert_fips_to_zip(
             data[fips_col] = data[fips_col].astype(str).str.zfill(5)
 
         cross = df.rename(columns={"zip": zip_col, "weight": weight_col})
-        data = data.merge(cross, left_on=fips_col, right_on="fips", how="left").dropna(subset=[zip_col])
+        data = data.merge(cross, left_on=fips_col, right_on="fips", how="left").dropna(
+            subset=[zip_col]
+        )
         return data
 
     def convert_state_code_to_state_id(
@@ -582,6 +599,53 @@ def zip_to_state_code(
         data = data.groupby([date_col, state_code_col], dropna=False).sum()
         return data.reset_index()
 
+    def convert_zip_to_state_id(
+        self,
+        data,
+        zip_col="zip",
+        state_id_col="state_id",
+        date_col="date",
+        count_cols=None,
+    ):
+        """Left-join the zip-to-state crosswalk onto data, adding state_id_col and "weight"."""
+        zip_to_state_cross = self.load_crosswalk(from_code="zip", to_code="state")
+        zip_to_state_cross = zip_to_state_cross.drop(
+            columns=["state_code", "state_name"]
+        ).rename(columns={"zip": zip_col, "state_id": state_id_col})
+        # Keep only the requested columns; the copy avoids writing into a slice of the input.
+        if count_cols:
+            data = data[[zip_col, date_col] + count_cols].copy()
+        # Non-string ZIP codes are cast to str and left-padded with zeros to 5 characters.
+        if not is_string_dtype(data[zip_col]):
+            data[zip_col] = data[zip_col].astype(str).str.zfill(5)
+        # Left join: rows whose ZIP is absent from the crosswalk are kept with NaN state/weight.
+        return data.merge(zip_to_state_cross, on=zip_col, how="left")
+
+    def zip_to_state_id(
+        self,
+        data,
+        zip_col="zip",
+        state_id_col="state_id",
+        date_col="date",
+        count_cols=None,
+    ):
+        """Aggregate count columns from ZIP level to state_id level using crosswalk weights."""
+        data = self.convert_zip_to_state_id(
+            data,
+            zip_col=zip_col,
+            state_id_col=state_id_col,
+            date_col=date_col,
+            count_cols=count_cols,
+        )
+        data.drop(columns=zip_col, inplace=True)
+        # With no explicit count_cols, every column except date, state id and weight is a count.
+        if count_cols is None:
+            count_cols = list(set(data.columns) - {date_col, state_id_col, "weight"})
+        # Scale each count by its row's weight, then sum the weighted counts per (date, state).
+        data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
+        data.drop("weight", axis=1, inplace=True)
+        return data.groupby([date_col, state_id_col], dropna=False).sum().reset_index()
+
     def fips_to_state_id(
         self,
         data,
@@ -594,13 +658,13 @@ def fips_to_state_id(
 
         Parameters
         ---------
-        data: pd.DataFrame 
+        data: pd.DataFrame
             Input data.
         fips_col: str
             Name of dataframe column containing fips codes.
         date_col: str
             Name of dataframe column containing the dates.
-        count_cols: str 
+        count_cols: str
             Name of dataframe column containing the data. If None (default) all non fips/date are used.
         state_id_col: str
             Name of dataframe column to contain the state codes.
@@ -629,20 +693,20 @@ def fips_to_msa(
         msa_col="msa",
     ):
         """Translate dataframe from fips to metropolitan statistical area (msa).
-        
+
         The encoding we use is based on the most recent Census Bureau release of CBSA (March 2020)
         All counties not mapped to MSAs have msa encoded as 000XX where XX is the fips state code
         To see how the crosswalk table is derived look at _delphi_utils_python/data_proc/geomap/*
 
         Parameters
         ---------
-        data: pd.DataFrame 
+        data: pd.DataFrame
             Input data.
         fips_col: str
             Name of dataframe column containing fips codes.
         date_col: str
             Name of dataframe column containing the dates.
-        count_cols: str 
+        count_cols: str
             Name of dataframe column containing the data. If None (default) all non fips/date are used.
         msa_col: str
             Name of dataframe column to contain the msa codes.
@@ -661,9 +725,9 @@ def fips_to_msa(
         data.drop(fips_col, axis=1, inplace=True)
         data.dropna(axis=0, subset=[msa_col], inplace=True)
         if date_col:
-            data = data.groupby([date_col, msa_col], dropna=False).sum()
+            data = data.groupby([date_col, msa_col]).sum()
         else:
-            data = data.groupby(msa_col, dropna=False).sum()
+            data = data.groupby(msa_col).sum()
         return data.reset_index()
 
     def zip_to_fips(
@@ -699,9 +763,9 @@ def zip_to_fips(
         data.drop([zip_col, "weight"], axis=1, inplace=True)
 
         if date_col:
-            data = data.groupby([date_col, fips_col], dropna=False).sum()
+            data = data.groupby([date_col, fips_col]).sum()
         else:
-            data = data.groupby(fips_col, dropna=False).sum()
+            data = data.groupby(fips_col).sum()
         return data.reset_index()
 
     def fips_to_megacounty(
@@ -750,7 +814,7 @@ def fips_to_megacounty(
         )
         data.set_index([fips_col, date_col], inplace=True)
         data = data.join(mega_data)
-        data = data.reset_index().groupby([date_col, mega_col], dropna=False).sum()
+        data = data.reset_index().groupby([date_col, mega_col]).sum()
         return data.reset_index()
 
     def zip_to_hrr(
@@ -835,7 +899,7 @@ def jhu_uid_to_fips(
         data.dropna(subset=[fips_col], axis=0, inplace=True)
         data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
         data.drop([jhu_col, "weight"], axis=1, inplace=True)
-        data = data.groupby([date_col, fips_col], dropna=False).sum()
+        data = data.groupby([date_col, fips_col]).sum()
         return data.reset_index()
 
     def fips_to_zip(
@@ -864,7 +928,7 @@ def fips_to_zip(
         data.drop(fips_col, axis=1, inplace=True)
         data[count_cols] = data[count_cols].multiply(data["weight"], axis=0)
         data.drop("weight", axis=1, inplace=True)
-        data = data.groupby([date_col, zip_col], dropna=False).sum()
+        data = data.groupby([date_col, zip_col]).sum()
         return data.reset_index()
 
     def fips_to_hrr(
@@ -901,3 +965,33 @@ def fips_to_hrr(
             hrr_col=hrr_col,
         )
         return data
+
+    def add_population_column(self, data, geocode_type, geocode_col=None):
+        """
+        Appends a population column to a dataframe, based on the FIPS or ZIP code.
+
+        Parameters
+        ---------
+        data: pd.DataFrame
+            The dataframe with a FIPS or ZIP code column.
+        geocode_type: {"fips", "zip"}
+            The type of the geocode contained in geocode_col.
+        geocode_col: str, default None
+            The name of the column containing the geocodes. If None, uses the geocode_type
+            as the name.
+
+        Returns
+        --------
+        data_with_pop: pd.DataFrame
+            A new dataframe with an int "population" column; rows with no match are dropped.
+        """
+        geocode_col = geocode_type if geocode_col is None else geocode_col
+        pop_df = self.load_crosswalk(from_code=geocode_type, to_code="pop")
+        # Join on one key name so a custom geocode_col does not leave a duplicate key column.
+        pop_df = pop_df.rename(columns={geocode_type: geocode_col})
+        data_with_pop = (
+            data.merge(pop_df, on=geocode_col, how="left")
+            .dropna(subset=["pop"])
+            .rename(columns={"pop": "population"})
+        )
+        return data_with_pop.astype({"population": int})
diff --git a/_delphi_utils_python/tests/test_geomap.py b/_delphi_utils_python/tests/test_geomap.py
@@ -208,3 +208,49 @@ def test_fips_to_hrr(self):
         gmpr = GeoMapper()
         new_data = gmpr.fips_to_hrr(self.fips_data_3)
         assert new_data.shape == (2,4)
+
+
+    def test_convert_zip_to_msa(self):
+        new_data = GeoMapper().convert_zip_to_msa(self.zip_data)
+        assert new_data['msa'][2] == "40900"
+        # Weighted counts are floats, so compare with a tolerance rather than exact equality.
+        assert np.allclose(new_data['count'].multiply(new_data['weight']).sum(), self.zip_data['count'].sum())
+
+    def test_zip_to_msa(self):
+        """zip_to_msa is expected to return 6 rows and conserve the total count."""
+        aggregated = GeoMapper().zip_to_msa(self.zip_data)
+        assert len(aggregated) == 6
+        assert np.allclose(aggregated['count'].sum(), self.zip_data['count'].sum())
+
+    def test_convert_zip_to_state_code(self):
+        """The joined frame should have 12 rows whose weighted counts sum to the input total."""
+        joined = GeoMapper().convert_zip_to_state_code(self.zip_data)
+        assert len(joined) == 12
+        assert np.allclose((joined['count'] * joined['weight']).sum(), self.zip_data['count'].sum())
+
+    def test_zip_to_state_code(self):
+        """zip_to_state_code is expected to return 4 rows and conserve the total count."""
+        aggregated = GeoMapper().zip_to_state_code(self.zip_data)
+        assert len(aggregated) == 4
+        assert np.allclose(aggregated['count'].sum(), self.zip_data['count'].sum())
+
+    def test_convert_zip_to_state_id(self):
+        """The joined frame should have 12 rows whose weighted counts sum to the input total."""
+        joined = GeoMapper().convert_zip_to_state_id(self.zip_data)
+        assert len(joined) == 12
+        assert np.allclose((joined['count'] * joined['weight']).sum(), self.zip_data['count'].sum())
+
+    def test_zip_to_state_id(self):
+        """zip_to_state_id is expected to return 4 rows and conserve the total count."""
+        aggregated = GeoMapper().zip_to_state_id(self.zip_data)
+        assert len(aggregated) == 4
+        assert np.allclose(aggregated['count'].sum(), self.zip_data['count'].sum())
+
+    def test_add_population_column(self):
+        """Population totals for the FIPS and ZIP fixtures match the expected sums."""
+        gmpr = GeoMapper()
+        # Build zero-padded string copies rather than overwriting columns on the self.* fixtures
+        # (presumably shared across tests -- in-place edits would leak into them).
+        fips_data = self.fips_data_3.assign(fips=self.fips_data_3["fips"].astype(str).str.zfill(5))
+        zip_data = self.zip_data.assign(zip=self.zip_data["zip"].astype(str).str.zfill(5))
+        assert gmpr.add_population_column(fips_data, "fips")["population"].sum() == 268155
+        assert gmpr.add_population_column(zip_data, "zip")["population"].sum() == 255160