
Commit 2cb9807

committed
Utils geo_data_proc: more minor changes
1 parent 1ff8b17 commit 2cb9807

File tree

1 file changed: +32 −68 lines changed


_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 32 additions & 68 deletions
@@ -57,18 +57,12 @@
 
 def create_fips_zip_crosswalk():
     """Build (weighted) crosswalk tables for FIPS to ZIP and ZIP to FIPS."""
-    pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)
-
+    pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL).rename(columns={"POPPT": "pop"})
     # Create the FIPS column by combining the state and county codes
-    state_codes = pop_df["STATE"].astype(str).str.zfill(2)
-    county_codes = pop_df["COUNTY"].astype(str).str.zfill(3)
-    pop_df["fips"] = state_codes + county_codes
-
+    pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(str).str.zfill(3)
     # Create the ZIP column by adding leading zeros to the ZIP
     pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)
-
-    # Pare down the dataframe to just the relevant columns: zip, fips, and population
-    pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})
+    pop_df = pop_df[["zip", "fips", "pop"]]
 
     # Find the population fractions (the heaviest computation, takes about a minute)
     # Note that the denominator in the fractions is the source population
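Note on the zfill pattern in this hunk: pandas reads STATE and COUNTY as integers, so the leading zeros must be restored before concatenation. A standalone toy sketch, not part of the commit (the two-row frame stands in for the Census download):

import pandas as pd

# Toy stand-in for the FIPS_BY_ZIP_POP_URL download
pop_df = pd.DataFrame({"STATE": [1, 42], "COUNTY": [1, 101], "ZCTA5": [36003, 19104]})
# Left-pad to 2-digit state + 3-digit county = 5-digit FIPS; 5-digit ZIP likewise
pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(str).str.zfill(3)
pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)
print(pop_df[["fips", "zip"]])  # fips: 01001, 42101; zip: 36003, 19104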
@@ -77,54 +71,39 @@ def create_fips_zip_crosswalk():
     zip_fips: DataFrame = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())
 
     # Rename and write to file
-    fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"})
-    fips_zip = fips_zip[fips_zip["weight"] > 0.0]
+    fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"}).query("weight > 0.0")
     fips_zip.sort_values(["fips", "zip"]).to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
 
-    zip_fips = zip_fips.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"})
-    zip_fips = zip_fips[zip_fips["weight"] > 0.0]
+    zip_fips = zip_fips.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"}).query("weight > 0.0")
     zip_fips.sort_values(["zip", "fips"]).to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
 
 
 def create_zip_hsa_hrr_crosswalk():
     """Build a crosswalk table for ZIP to HSA and for ZIP to HRR."""
-    zipped_csv = ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content))
-    zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
+    with ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content)) as zipped_csv:
+        zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
 
-    # Build the HSA table
     hsa_df = zip_df[["zipcode18", "hsanum"]].rename(columns={"zipcode18": "zip", "hsanum": "hsa"})
+    hsa_df["zip"] = hsa_df["zip"].astype(str).str.zfill(5)
+    hsa_df["hsa"] = hsa_df["hsa"].astype(str)
+    hsa_df.sort_values(["zip", "hsa"]).to_csv(join(OUTPUT_DIR, ZIP_HSA_OUT_FILENAME), index=False)
 
-    # Build the HRR table
     hrr_df = zip_df[["zipcode18", "hrrnum"]].rename(columns={"zipcode18": "zip", "hrrnum": "hrr"})
-
-    # Convert to zero-padded strings
     hrr_df["zip"] = hrr_df["zip"].astype(str).str.zfill(5)
     hrr_df["hrr"] = hrr_df["hrr"].astype(str)
-    hsa_df["zip"] = hsa_df["zip"].astype(str).str.zfill(5)
-    hsa_df["hsa"] = hsa_df["hsa"].astype(str)
-
-    hsa_df.sort_values(["zip", "hsa"]).to_csv(join(OUTPUT_DIR, ZIP_HSA_OUT_FILENAME), index=False)
     hrr_df.sort_values(["zip", "hrr"]).to_csv(join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME), index=False)
 
 
 def create_fips_msa_crosswalk():
     """Build a crosswalk table for FIPS to MSA."""
-    msa_cols = {
-        "CBSA Code": int,
-        "Metropolitan/Micropolitan Statistical Area": str,
-        "FIPS State Code": str,
-        "FIPS County Code": str,
-    }
-    # The following line requires the xlrd package.
-    msa_df = pd.read_excel(FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols)
-
-    metro_bool = msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"
-    msa_df = msa_df[metro_bool]
+    # Requires xlrd.
+    msa_df = pd.read_excel(FIPS_MSA_URL, skiprows=2, skipfooter=4, dtype={"CBSA Code": int, "Metropolitan/Micropolitan Statistical Area": str, "FIPS State Code": str, "FIPS County Code": str}).rename(columns={"CBSA Code": "msa"})
+    msa_df = msa_df[msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"]
 
     # Combine state and county codes into a single FIPS code
     msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
 
-    msa_df.rename(columns={"CBSA Code": "msa"}).sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), columns=["fips", "msa"], index=False)
+    msa_df.sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), columns=["fips", "msa"], index=False)
 
 
 def create_jhu_uid_fips_crosswalk():
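Note on the weighting step this hunk reshapes: each crosswalk weight is the destination row's share of the source region's population, and the new query("weight > 0.0") drops empty intersections. A toy sketch of the same idea, not from the commit (it uses groupby().transform rather than the file's groupby().apply chain, which is equivalent for this purpose):

import pandas as pd

# One county (FIPS 01001) split across three ZIPs, one of them unpopulated
pop_df = pd.DataFrame({"fips": ["01001"] * 3, "zip": ["36003", "36006", "36067"], "pop": [3000, 1000, 0]})
# Weight = ZIP population / county population
pop_df["weight"] = pop_df["pop"] / pop_df.groupby("fips")["pop"].transform("sum")
fips_zip = pop_df[["fips", "zip", "weight"]].query("weight > 0.0")
print(fips_zip)  # 36003 -> 0.75, 36006 -> 0.25; the zero-weight row is gone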
@@ -177,23 +156,19 @@ def create_jhu_uid_fips_crosswalk():
             {"jhu_uid": "84070020", "fips": "49000", "weight": 1.0},
         ]
     )
+    # Map the Unassigned category to a custom megaFIPS XX000
     unassigned_states = pd.DataFrame(
-        [
-            # Map the Unassigned category to a custom megaFIPS XX000
-            {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
-            for x in range(84090001, 84090057)
-        ]
+        {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
+        for x in range(84090001, 84090057)
     )
+    # Map the Out of State category to a custom megaFIPS XX000
     out_of_state = pd.DataFrame(
-        [
-            # Map the Out of State category to a custom megaFIPS XX000
-            {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
-            for x in range(84080001, 84080057)
-        ]
+        {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
+        for x in range(84080001, 84080057)
     )
+    # Map the Unassigned and Out of State categories to the custom megaFIPS 72000
     puerto_rico_unassigned = pd.DataFrame(
         [
-            # Map the Unassigned and Out of State categories to the custom megaFIPS 72000
             {"jhu_uid": "63072888", "fips": "72000", "weight": 1.0},
             {"jhu_uid": "63072999", "fips": "72000", "weight": 1.0},
         ]
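Note on the change above: pd.DataFrame accepts any iterable of dicts, so the list brackets around the comprehensions are unnecessary and the comments can move onto their own lines. A toy check, not part of the commit (the range is shortened to three UIDs):

import pandas as pd

# Same construction as unassigned_states, truncated to three UIDs
unassigned_states = pd.DataFrame(
    {"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0}
    for x in range(84090001, 84090004)
)
print(unassigned_states["fips"].tolist())  # ['01000', '02000', '03000']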
@@ -206,35 +181,29 @@ def create_jhu_uid_fips_crosswalk():
     )
 
     jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str}).query("Country_Region == 'US'")
-    jhu_df = jhu_df.rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])[["jhu_uid", "fips"]]
+    jhu_df = jhu_df.rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])
 
     # FIPS Codes that are just two digits long should be zero filled on the right.
     # These are US state codes (XX) and the territories Guam (66), Northern Mariana Islands (69),
     # Virgin Islands (78), and Puerto Rico (72).
-    fips_st = jhu_df["fips"].str.len() <= 2
-    jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].str.ljust(5, "0")
+    fips_territories = jhu_df["fips"].str.len() <= 2
+    jhu_df.loc[fips_territories, "fips"] = jhu_df.loc[fips_territories, "fips"].str.ljust(5, "0")
 
     # Drop the JHU UIDs that were hand-modified
     manual_correction_ids = pd.concat([hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned, cruise_ships])["jhu_uid"]
-    dup_ind = jhu_df["jhu_uid"].isin(manual_correction_ids)
-    jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
+    jhu_df.drop(jhu_df.index[jhu_df["jhu_uid"].isin(manual_correction_ids)], inplace=True)
 
     # Add weights of 1.0 to everything not in hand additions, then merge in hand-additions
     # Finally, zero fill FIPS
     jhu_df["weight"] = 1.0
     jhu_df = pd.concat([jhu_df, hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned])
     jhu_df["fips"] = jhu_df["fips"].astype(int).astype(str).str.zfill(5)
-    jhu_df.sort_values(["jhu_uid", "fips"]).to_csv(join(OUTPUT_DIR, JHU_FIPS_OUT_FILENAME), index=False)
+    jhu_df.sort_values(["jhu_uid", "fips"]).to_csv(join(OUTPUT_DIR, JHU_FIPS_OUT_FILENAME), columns=["jhu_uid", "fips", "weight"], index=False)
 
 
 def create_state_codes_crosswalk():
     """Build a State ID -> State Name -> State code crosswalk file."""
-    column_rename_map = {
-        "STATE": "state_code",
-        "STUSAB": "state_id",
-        "STATE_NAME": "state_name",
-    }
-    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(columns=column_rename_map)
+    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(columns={"STATE": "state_code", "STUSAB": "state_id", "STATE_NAME": "state_name"})
     df["state_code"] = df["state_code"].astype(str).str.zfill(2)
     df["state_id"] = df["state_id"].astype(str).str.lower()
 
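Note on the renamed fips_territories mask: two-digit state and territory codes are right-filled into five-digit megaFIPS, unlike the left-fill used elsewhere for county codes. A toy sketch, not from the commit (the UID values are made up):

import pandas as pd

jhu_df = pd.DataFrame({"jhu_uid": ["630", "84001001"], "fips": ["72", "01001"]})
# Right-fill only the short codes; full county FIPS pass through unchanged
fips_territories = jhu_df["fips"].str.len() <= 2
jhu_df.loc[fips_territories, "fips"] = jhu_df.loc[fips_territories, "fips"].str.ljust(5, "0")
print(jhu_df["fips"].tolist())  # ['72000', '01001']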
@@ -259,7 +228,6 @@ def create_state_codes_crosswalk():
         ]
     )
     df = pd.concat((df, territories))
-
     df.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), index=False)
 
 
@@ -288,8 +256,7 @@ def create_state_hhs_crosswalk():
     hhs_state_pairs.append((9, "Northern Mariana Islands"))
 
     # Make dataframe
-    hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"])
-    hhs_df["hhs"] = hhs_df["hhs"].astype(str)
+    hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"], dtype=str)
 
     ss_df = ss_df.merge(hhs_df, on="state_name", how="left").dropna()
     ss_df.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), columns=["state_code", "hhs"], index=False)
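Note on the one-liner above: passing dtype=str to the DataFrame constructor casts every column on construction, replacing the separate astype call. A toy check, not part of the commit:

import pandas as pd

hhs_state_pairs = [(1, "Massachusetts"), (2, "New York")]
hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"], dtype=str)
print(hhs_df["hhs"].tolist())  # ['1', '2'] -- the region numbers are now strings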
@@ -319,13 +286,12 @@ def create_fips_population_table():
     census_pop = census_pop.reset_index(drop=True)
 
     # Get the file with Puerto Rico populations
-    df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL)
+    df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL).rename(columns={"POPPT": "pop"})
     df_pr["fips"] = df_pr["STATE"].astype(str).str.zfill(2) + df_pr["COUNTY"].astype(str).str.zfill(3)
-    df_pr = df_pr.rename(columns={"POPPT": "pop"})[["fips", "pop"]]
+    df_pr = df_pr[["fips", "pop"]]
     # Create the Puerto Rico megaFIPS
     df_pr = df_pr[df_pr["fips"].isin([str(x) for x in range(72000, 72999)])]
     df_pr = pd.concat([df_pr, pd.DataFrame([{"fips": "72000", "pop": df_pr["pop"].sum()}])])
-
     # Fill the missing Puerto Rico data with 2010 information
     df_pr = df_pr.groupby("fips").sum().reset_index()
     df_pr = df_pr[~df_pr["fips"].isin(census_pop["fips"])]
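Note on the megaFIPS step above: a synthetic state-level row 72000 is appended holding the sum of the county populations, and the following groupby("fips").sum() collapses any duplicate FIPS rows. A toy sketch, not from the commit (the two county rows are made up):

import pandas as pd

df_pr = pd.DataFrame({"fips": ["72001", "72003"], "pop": [18000, 24000]})
# Append the megaFIPS row holding the island-wide total
df_pr = pd.concat([df_pr, pd.DataFrame([{"fips": "72000", "pop": df_pr["pop"].sum()}])])
df_pr = df_pr.groupby("fips").sum().reset_index()
print(df_pr)  # 72000 -> 42000, plus the two county rows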
@@ -354,8 +320,7 @@ def create_state_population_table():
 
     census_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
     state: DataFrame = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str)
-    combined = state.merge(census_pop, on="fips")
-    state_pop = combined.groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
+    state_pop = state.merge(census_pop, on="fips").groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
     state_pop.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), index=False)
 
 
@@ -369,8 +334,7 @@ def create_hhs_population_table():
 
     state_pop = pd.read_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), dtype={"state_code": str, "hhs": int}, usecols=["state_code", "pop"])
     state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype=str)
-    combined = state_pop.merge(state_hhs, on="state_code")
-    hhs_pop = combined.groupby("hhs", as_index=False).sum()
+    hhs_pop = state_pop.merge(state_hhs, on="state_code").groupby("hhs", as_index=False).sum()
     hhs_pop.sort_values("hhs").to_csv(join(OUTPUT_DIR, HHS_POPULATION_OUT_FILENAME), index=False)
 
 
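Note on the merge-then-aggregate chain that this commit collapses in both create_state_population_table and create_hhs_population_table: join the population table to the mapping table, then group and sum. A toy sketch, not from the commit (it selects ["pop"] before summing so the string state_code column is not aggregated):

import pandas as pd

state_pop = pd.DataFrame({"state_code": ["25", "36"], "pop": [6900000, 19500000]})
state_hhs = pd.DataFrame({"state_code": ["25", "36"], "hhs": ["1", "2"]})
# Attach each state's HHS region, then total the population per region
hhs_pop = state_pop.merge(state_hhs, on="state_code").groupby("hhs", as_index=False)["pop"].sum()
print(hhs_pop)  # hhs 1 -> 6900000, hhs 2 -> 19500000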