
Commit 1ff8b17

Utils geo_data_proc: more readability improvements

* type hinting for language server
* function docstrings
* split really long pandas chains in two
1 parent b5668ee commit 1ff8b17
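To make the last bullet concrete, here is a minimal, hypothetical sketch of the pattern the commit applies throughout geo_data_proc.py: one long pandas chain is split into two statements, and the intermediate result gets an explicit DataFrame annotation so a language server can resolve its type. The column and file names below are illustrative only, not taken from the real script.

    import pandas as pd

    def write_crosswalk(left: pd.DataFrame, right: pd.DataFrame) -> None:
        """Merge, aggregate, and write a toy crosswalk table (illustration only)."""
        # Before: a single statement did everything:
        # left.merge(right, on="zip").groupby("fips").sum().reset_index().sort_values("fips").to_csv("out.csv", index=False)
        # After: split in two, with a type hint on the intermediate frame.
        merged: pd.DataFrame = left.merge(right, on="zip").groupby("fips").sum().reset_index()
        merged.sort_values("fips").to_csv("out.csv", index=False)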

File tree: 1 file changed (+61, -65 lines)


_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 61 additions & 65 deletions
@@ -1,11 +1,18 @@
 """
 Authors: Dmitry Shemetov @dshemetov, James Sharpnack @jsharpna
+
+Intended execution:
+
+cd _delphi_utils/data_proc/geomap
+chmod u+x geo_data_proc.py
+python geo_data_proc.py
 """
 
 from io import BytesIO
 from os import remove, listdir
 from os.path import join, isfile
 from zipfile import ZipFile
+from pandas.core.frame import DataFrame
 
 import requests
 import pandas as pd
@@ -49,10 +56,7 @@
 
 
 def create_fips_zip_crosswalk():
-    """
-    Creates the (weighted) crosswalk tables between FIPS to ZIP and ZIP to FIPS
-    from source.
-    """
+    """Build (weighted) crosswalk tables for FIPS to ZIP and ZIP to FIPS."""
     pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)
 
     # Create the FIPS column by combining the state and county codes
@@ -69,8 +73,8 @@ def create_fips_zip_crosswalk():
     # Find the population fractions (the heaviest computation, takes about a minute)
     # Note that the denominator in the fractions is the source population
     pop_df.set_index(["fips", "zip"], inplace=True)
-    fips_zip = pop_df.groupby("fips", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())
-    zip_fips = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())
+    fips_zip: DataFrame = pop_df.groupby("fips", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())
+    zip_fips: DataFrame = pop_df.groupby("zip", as_index=False).apply(lambda g: g["pop"] / g["pop"].sum())
 
     # Rename and write to file
     fips_zip = fips_zip.reset_index(level=["fips", "zip"]).rename(columns={"pop": "weight"})
@@ -83,7 +87,7 @@ def create_fips_zip_crosswalk():
 
 
 def create_zip_hsa_hrr_crosswalk():
-    """Creates the crosswalk table from ZIP to HSA and from ZIP to HRR from source."""
+    """Build a crosswalk table for ZIP to HSA and for ZIP to HRR."""
     zipped_csv = ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content))
     zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
 
@@ -104,33 +108,27 @@ def create_zip_hsa_hrr_crosswalk():
 
 
 def create_fips_msa_crosswalk():
-    """Creates the crosswalk table from FIPS to MSA from source."""
+    """Build a crosswalk table for FIPS to MSA."""
     msa_cols = {
         "CBSA Code": int,
         "Metropolitan/Micropolitan Statistical Area": str,
         "FIPS State Code": str,
         "FIPS County Code": str,
     }
     # The following line requires the xlrd package.
-    msa_df = pd.read_excel(
-        FIPS_MSA_URL,
-        skiprows=2,
-        skipfooter=4,
-        usecols=msa_cols.keys(),
-        dtype=msa_cols,
-    )
+    msa_df = pd.read_excel(FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols)
 
     metro_bool = msa_df["Metropolitan/Micropolitan Statistical Area"] == "Metropolitan Statistical Area"
     msa_df = msa_df[metro_bool]
 
     # Combine state and county codes into a single FIPS code
     msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
 
-    msa_df.rename(columns={"CBSA Code": "msa"})[["fips", "msa"]].sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False)
+    msa_df.rename(columns={"CBSA Code": "msa"}).sort_values(["fips", "msa"]).to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), columns=["fips", "msa"], index=False)
 
 
 def create_jhu_uid_fips_crosswalk():
-    """Creates the crosswalk table from JHU UID to FIPS from source."""
+    """Build a crosswalk table from JHU UID to FIPS."""
     # These are hand modifications that need to be made to the translation
     # between JHU UID and FIPS. See below for the special cases information
     # https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html#geographical-exceptions
@@ -207,7 +205,8 @@ def create_jhu_uid_fips_crosswalk():
         ]
     )
 
-    jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str}).query("Country_Region == 'US'")[["UID", "FIPS"]].rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])
+    jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str}).query("Country_Region == 'US'")
+    jhu_df = jhu_df.rename(columns={"UID": "jhu_uid", "FIPS": "fips"}).dropna(subset=["fips"])[["jhu_uid", "fips"]]
 
     # FIPS Codes that are just two digits long should be zero filled on the right.
     # These are US state codes (XX) and the territories Guam (66), Northern Mariana Islands (69),
@@ -216,7 +215,8 @@ def create_jhu_uid_fips_crosswalk():
     jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].str.ljust(5, "0")
 
     # Drop the JHU UIDs that were hand-modified
-    dup_ind = jhu_df["jhu_uid"].isin(pd.concat([hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned, cruise_ships])["jhu_uid"].values)
+    manual_correction_ids = pd.concat([hand_additions, unassigned_states, out_of_state, puerto_rico_unassigned, cruise_ships])["jhu_uid"]
+    dup_ind = jhu_df["jhu_uid"].isin(manual_correction_ids)
     jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
 
     # Add weights of 1.0 to everything not in hand additions, then merge in hand-additions
@@ -228,13 +228,13 @@ def create_jhu_uid_fips_crosswalk():
 
 
 def create_state_codes_crosswalk():
-    """Create the State ID -> State Name -> State code crosswalk file."""
+    """Build a State ID -> State Name -> State code crosswalk file."""
     column_rename_map = {
         "STATE": "state_code",
         "STUSAB": "state_id",
         "STATE_NAME": "state_name",
     }
-    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(column_rename_map)
+    df = pd.read_csv(STATE_CODES_URL, delimiter="|").drop(columns="STATENS").rename(columns=column_rename_map)
     df["state_code"] = df["state_code"].astype(str).str.zfill(2)
     df["state_id"] = df["state_id"].astype(str).str.lower()
 
@@ -264,9 +264,7 @@ def create_state_codes_crosswalk():
 
 
 def create_state_hhs_crosswalk():
-    """
-    Create the state to hhs crosswalk.
-    """
+    """Build a state to HHS crosswalk."""
     if not isfile(join(OUTPUT_DIR, STATE_OUT_FILENAME)):
         create_state_codes_crosswalk()
 
@@ -293,19 +291,18 @@ def create_state_hhs_crosswalk():
     hhs_df = pd.DataFrame(hhs_state_pairs, columns=["hhs", "state_name"])
     hhs_df["hhs"] = hhs_df["hhs"].astype(str)
 
-    ss_df.merge(hhs_df, on="state_name", how="left").dropna()[["state_code", "hhs"]].sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), index=False)
+    ss_df = ss_df.merge(hhs_df, on="state_name", how="left").dropna()
+    ss_df.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), columns=["state_code", "hhs"], index=False)
 
 
 def create_fips_population_table():
-    """
-    Build a table of populations by FIPS county codes. Uses US Census Bureau population
-    data from 2019, supplemented with 2010 population data for Puerto Rico, and a few
-    small counties.
+    """Build a table of populations by FIPS county codes.
+
+    Uses US Census Bureau population data from 2020, with 2010 population data for Puerto Rico and a few exceptions.
     """
     census_pop = pd.read_csv(FIPS_POPULATION_URL, encoding="ISO-8859-1")
     census_pop["fips"] = census_pop.apply(lambda x: f"{x['STATE']:02d}{x['COUNTY']:03d}", axis=1)
-    census_pop["pop"] = census_pop["POPESTIMATE2020"]
-    census_pop = census_pop[["fips", "pop"]]
+    census_pop = census_pop.rename(columns={"POPESTIMATE2020": "pop"})[["fips", "pop"]]
 
     # Set population for Dukes and Nantucket combo county
     dukes_pop = int(census_pop.loc[census_pop["fips"] == "25007", "pop"])
@@ -324,8 +321,7 @@ def create_fips_population_table():
     # Get the file with Puerto Rico populations
     df_pr = pd.read_csv(FIPS_PUERTO_RICO_POPULATION_URL)
    df_pr["fips"] = df_pr["STATE"].astype(str).str.zfill(2) + df_pr["COUNTY"].astype(str).str.zfill(3)
-    df_pr["pop"] = df_pr["POPPT"]
-    df_pr = df_pr[["fips", "pop"]]
+    df_pr = df_pr.rename(columns={"POPPT": "pop"})[["fips", "pop"]]
     # Create the Puerto Rico megaFIPS
     df_pr = df_pr[df_pr["fips"].isin([str(x) for x in range(72000, 72999)])]
     df_pr = pd.concat([df_pr, pd.DataFrame([{"fips": "72000", "pop": df_pr["pop"].sum()}])])
@@ -349,22 +345,28 @@ def create_fips_population_table():
 
 
 def create_state_population_table():
+    """Build a state population table."""
     if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
         create_fips_population_table()
+
     if not isfile(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME)):
         derive_fips_state_crosswalk()
+
     census_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
-    state = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str)
+    state: DataFrame = pd.read_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), dtype=str)
     combined = state.merge(census_pop, on="fips")
     state_pop = combined.groupby(["state_code", "state_id", "state_name"], as_index=False).sum()
     state_pop.sort_values("state_code").to_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), index=False)
 
 
 def create_hhs_population_table():
+    """Build an HHS population table."""
     if not isfile(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME)):
         create_state_population_table()
+
     if not isfile(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME)):
         create_state_hhs_crosswalk()
+
     state_pop = pd.read_csv(join(OUTPUT_DIR, STATE_POPULATION_OUT_FILENAME), dtype={"state_code": str, "hhs": int}, usecols=["state_code", "pop"])
     state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype=str)
     combined = state_pop.merge(state_hhs, on="state_code")
@@ -373,18 +375,17 @@ def create_hhs_population_table():
 
 
 def create_nation_population_table():
+    """Build a nation population table."""
     if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
         create_fips_population_table()
+
     census_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
     nation_pop = pd.DataFrame({"nation": ["us"], "pop": [census_pop["pop"].sum()]})
     nation_pop.to_csv(join(OUTPUT_DIR, NATION_POPULATION_OUT_FILENAME), index=False)
 
 
 def derive_zip_population_table():
-    """
-    Builds a table of populations by ZIP code. Combines the tble of populations by
-    FIPS code with the FIPS to ZIP code mapping.
-    """
+    """Build a table of populations by ZIP code by translating from FIPS populations."""
     if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
         create_fips_population_table()
 
@@ -402,8 +403,7 @@ def derive_zip_population_table():
 
 
 def derive_fips_hrr_crosswalk():
-    """Derives a crosswalk file from FIPS to HRR through FIPZ -> ZIP -> HRR
-    from the crosswalk files made by the functions above."""
+    """Derive a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR."""
     if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
         create_fips_zip_crosswalk()
 
@@ -413,29 +413,25 @@ def derive_fips_hrr_crosswalk():
     fz_df = pd.read_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), dtype={"fips": str, "zip": str, "weight": float})
     zh_df = pd.read_csv(join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME), dtype={"zip": str, "hrr": str})
 
-    fz_df.merge(zh_df, on="zip", how="left").drop(columns="zip").groupby(["fips", "hrr"]).sum().reset_index().sort_values(["fips", "hrr"]).to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
+    fz_df = fz_df.merge(zh_df, on="zip", how="left").drop(columns="zip").groupby(["fips", "hrr"]).sum().reset_index()
+    fz_df.sort_values(["fips", "hrr"]).to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
 
 
 def derive_fips_state_crosswalk():
-    """
-    Builds a crosswalk between FIPS county codes and state information (number,
-    abbreviation, name).
-    """
+    """Derive a crosswalk between FIPS county codes and state information (number, abbreviation, name)."""
     fips_pop = pd.read_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME), dtype={"fips": str, "pop": int})
 
     megafips = pd.DataFrame({"fips": [fips + "000" for fips in fips_pop.fips.str[:2].unique()], "pop": np.nan})
     fips_pop = pd.concat([fips_pop, megafips])
 
     state_codes = pd.read_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), dtype={"state_code": str, "state_id": str, "state_name": str})
     fips_pop["state_code"] = fips_pop["fips"].str[:2]
-    fips_pop.merge(state_codes, on="state_code", how="left").drop(columns="pop").sort_values(["fips", "state_code"]).to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
+    fips_pop = fips_pop.merge(state_codes, on="state_code", how="left").drop(columns="pop")
+    fips_pop.sort_values(["fips", "state_code"]).to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
 
 
 def derive_zip_msa_crosswalk():
-    """
-    Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR
-    from the crosswalk files made by the functions above.
-    """
+    """Derive a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR."""
     if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
         create_fips_zip_crosswalk()
 
@@ -445,32 +441,31 @@ def derive_zip_msa_crosswalk():
     zf_df = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str, "weight": float})
     fm_df = pd.read_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), dtype={"fips": str, "msa": str})
 
-    zf_df.merge(fm_df, on="fips").drop(columns="fips").groupby(["msa", "zip"]).sum().reset_index().sort_values(["zip", "msa"]).to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
+    zf_df = zf_df.merge(fm_df, on="fips").drop(columns="fips").groupby(["msa", "zip"]).sum().reset_index()
+    zf_df.sort_values(["zip", "msa"]).to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
 
 
 def derive_zip_to_state_code():
-    """
-    Builds a crosswalk between ZIP codes and state information (number, abbreviation,
-    name).
-    """
+    """Derive a crosswalk between ZIP codes and state information (number, abbreviation, name)."""
     if not isfile(join(OUTPUT_DIR, STATE_OUT_FILENAME)):
         create_state_codes_crosswalk()
+
     if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
         create_fips_zip_crosswalk()
 
     sdf = pd.read_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), dtype={"state_code": str, "state_id": str, "state_name": str})
     zf_cf = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str})
 
     zf_cf["state_code"] = zf_cf["fips"].str[:2]
-    zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left").drop(columns=["fips"]).sort_values(["zip", "state_code"]).to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
+    zf_cf = zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left").drop(columns=["fips"])
+    zf_cf.sort_values(["zip", "state_code"]).to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
 
 
 def derive_fips_hhs_crosswalk():
-    """
-    Builds a crosswalk between FIPS county codes and HHS regions.
-    """
+    """Derive a crosswalk between FIPS county codes and HHS regions."""
     if not isfile(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME)):
         create_state_hhs_crosswalk()
+
     if not isfile(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME)):
         create_fips_population_table()
 
@@ -481,22 +476,23 @@ def derive_fips_hhs_crosswalk():
     state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype={"state_code": str, "hhs": str})
 
     fips_pop["state_code"] = fips_pop["fips"].str[:2]
-    fips_pop.merge(state_hhs, on="state_code", how="left").drop(columns=["state_code", "pop"]).sort_values(["fips", "hhs"]).to_csv(join(OUTPUT_DIR, FIPS_HHS_FILENAME), index=False)
+    fips_pop = fips_pop.merge(state_hhs, on="state_code", how="left").drop(columns=["state_code", "pop"])
+    fips_pop.sort_values(["fips", "hhs"]).to_csv(join(OUTPUT_DIR, FIPS_HHS_FILENAME), index=False)
 
 
 def derive_zip_hhs_crosswalk():
-    """
-    Builds a crosswalk between zip code and HHS regions.
-    """
+    """Derive a crosswalk between zip code and HHS regions."""
     if not isfile(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME)):
         create_state_hhs_crosswalk()
+
     if not isfile(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME)):
         derive_zip_to_state_code()
 
     zip_state = pd.read_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), dtype={"zip": str, "pop": int, "state_code": str})
     state_hhs = pd.read_csv(join(OUTPUT_DIR, STATE_HHS_OUT_FILENAME), dtype={"state_code": str, "hhs": str})
 
-    zip_state.merge(state_hhs, on="state_code", how="left").drop(columns=["state_code", "state_id", "state_name"]).sort_values(["zip", "hhs"]).to_csv(join(OUTPUT_DIR, ZIP_HHS_FILENAME), index=False)
+    zip_state = zip_state.merge(state_hhs, on="state_code", how="left").drop(columns=["state_code", "state_id", "state_name"])
+    zip_state.sort_values(["zip", "hhs"]).to_csv(join(OUTPUT_DIR, ZIP_HHS_FILENAME), index=False)
 
 
 def clear_dir(dir_path: str):
@@ -524,4 +520,4 @@ def clear_dir(dir_path: str):
     derive_fips_state_crosswalk()
     derive_zip_population_table()
     derive_fips_hhs_crosswalk()
-    derive_zip_hhs_crosswalk()
+    derive_zip_hhs_crosswalk()
