Skip to content

Commit ccc7406

Browse files
committed
Fix weights issue back, simplify derived crosswalk
1 parent 9de4af2 commit ccc7406

File tree

6 files changed

+159394
-200752
lines changed

6 files changed

+159394
-200752
lines changed

_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 146 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
"""Needed to process the geo files to get from xls file to a simpler csv.
2-
pip install xlrd
3-
1+
"""
42
Author: James Sharpnack @jsharpna
53
Refactored by: Dmitry Shemetov @dshemetov
64
"""
@@ -54,45 +52,57 @@ def create_fips_zip_crosswalk():
5452
pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)
5553

5654
# Create the FIPS column by combining the state and county codes
57-
pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(
58-
str
59-
).str.zfill(3)
55+
state_codes = pop_df["STATE"].astype(str).str.zfill(2)
56+
county_codes = pop_df["COUNTY"].astype(str).str.zfill(3)
57+
pop_df["fips"] = state_codes + county_codes
6058

6159
# Create the ZIP column by adding leading zeros to the ZIP
6260
pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)
6361

6462
# Pare down the dataframe to just the relevant columns: zip, fips, and population
6563
pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})
6664

67-
# Find the populations by FIPS and ZIP
68-
pop_fips = pop_df[["fips", "pop"]].groupby("fips").sum()
69-
pop_zip = pop_df[["zip", "pop"]].groupby("zip").sum()
70-
pop_fips.to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
71-
pop_zip.to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
65+
# Find the populations by FIPS and ZIP and write them to files
66+
(
67+
pop_df[["fips", "pop"]]
68+
.groupby("fips")
69+
.sum()
70+
.to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
71+
)
72+
(
73+
pop_df[["zip", "pop"]]
74+
.groupby("zip")
75+
.sum()
76+
.to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
77+
)
7278

7379
# Find the population fractions (the heaviest computation, takes about a minute)
74-
# Note that the denominator in the fractions is the target code population
80+
# Note that the denominator in the fractions is the population of the source geocode (FIPS for fips -> zip, ZIP for zip -> fips)
7581
pop_df.set_index(["fips", "zip"], inplace=True)
76-
fips_zip = pop_df.groupby("zip", as_index=False).apply(
82+
fips_zip = pop_df.groupby("fips", as_index=False).apply(
7783
lambda g: g["pop"] / g["pop"].sum()
7884
)
79-
zip_fips = pop_df.groupby("fips", as_index=False).apply(
85+
zip_fips = pop_df.groupby("zip", as_index=False).apply(
8086
lambda g: g["pop"] / g["pop"].sum()
8187
)
8288

8389
# Rename and write to file
84-
fips_zip.reset_index(level=["fips", "zip"]).rename(
85-
columns={"pop": "weight"}
86-
).to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
87-
zip_fips.reset_index(level=["fips", "zip"]).rename(
88-
columns={"pop": "weight"}
89-
).to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
90+
(
91+
fips_zip.reset_index(level=["fips", "zip"])
92+
.rename(columns={"pop": "weight"})
93+
.to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
94+
)
95+
(
96+
zip_fips.reset_index(level=["fips", "zip"])
97+
.rename(columns={"pop": "weight"})
98+
.to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
99+
)
90100

91101

92102
def create_zip_hsa_hrr_crosswalk():
93103
"""Creates the crosswalk table from ZIP to HSA and from ZIP to HRR from source."""
94-
zipped_csv = BytesIO(requests.get(ZIP_HSA_HRR_URL).content)
95-
zip_df = pd.read_csv(ZipFile(zipped_csv).open(ZIP_HSA_HRR_FILENAME))
104+
zipped_csv = ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content))
105+
zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
96106

97107
# Build the HSA table
98108
hsa_df = zip_df[["zipcode18", "hsanum"]].rename(
@@ -122,8 +132,13 @@ def create_fips_msa_crosswalk():
122132
"FIPS State Code": str,
123133
"FIPS County Code": str,
124134
}
125-
msa_df = pd.read_excel(
126-
FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols,
135+
# The following line requires the xlrd package.
136+
msa_df = pd.read_excel(
137+
FIPS_MSA_URL,
138+
skiprows=2,
139+
skipfooter=4,
140+
usecols=msa_cols.keys(),
141+
dtype=msa_cols,
127142
)
128143

129144
metro_bool = (
@@ -134,9 +149,10 @@ def create_fips_msa_crosswalk():
134149

135150
# Combine state and county codes into a single FIPS code
136151
msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
137-
msa_df.rename(columns={"CBSA Code": "msa"}, inplace=True)
138-
msa_df = msa_df[["fips", "msa"]]
139-
msa_df.to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False)
152+
153+
msa_df.rename(columns={"CBSA Code": "msa"})[["fips", "msa"]].to_csv(
154+
join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False
155+
)
140156

141157

142158
def create_jhu_uid_fips_crosswalk():
@@ -147,37 +163,72 @@ def create_jhu_uid_fips_crosswalk():
147163
hand_additions = pd.DataFrame(
148164
[
149165
# Split aggregation of Dukes and Nantucket, Massachusetts
150-
{"jhu_uid": 84070002, "fips": "25007", "weight": 16535/(16535 + 10172)}, # Population: 16535
151-
{"jhu_uid": 84070002, "fips": "25019", "weight": 10172/(16535 + 10172)}, # 10172
166+
{
167+
"jhu_uid": 84070002,
168+
"fips": "25007",
169+
"weight": 16535 / (16535 + 10172),
170+
}, # Population: 16535
171+
{
172+
"jhu_uid": 84070002,
173+
"fips": "25019",
174+
"weight": 10172 / (16535 + 10172),
175+
}, # 10172
152176
# Kansas City, Missouri
153-
{"jhu_uid": 84070003, "fips": "29095", "weight": 674158 / 1084897}, # Population: 674158
154-
{"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897}, # 89322
155-
{"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897}, # 99478
156-
{"jhu_uid": 84070003, "fips": "29047", "weight": 221939 / 1084897}, # 221939
177+
{
178+
"jhu_uid": 84070003,
179+
"fips": "29095",
180+
"weight": 674158 / 1084897,
181+
}, # Population: 674158
182+
{"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897}, # 89322
183+
{"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897}, # 99478
184+
{
185+
"jhu_uid": 84070003,
186+
"fips": "29047",
187+
"weight": 221939 / 1084897,
188+
}, # 221939
157189
# Kusilvak, Alaska
158190
{"jhu_uid": 84002158, "fips": "02270", "weight": 1.0},
159191
# Oglala Lakota
160192
{"jhu_uid": 84046102, "fips": "46113", "weight": 1.0},
161193
# Split aggregation of New York County (populations from JHU documentation)
162-
{"jhu_uid": 84036061, "fips": "36005", "weight": 1418207/8336817}, # Population: 1,418,207
163-
{"jhu_uid": 84036061, "fips": "36047", "weight": 2559903/8336817}, # 2,559,903
164-
{"jhu_uid": 84036061, "fips": "36061", "weight": 1628706/8336817}, # 1,628,706
165-
{"jhu_uid": 84036061, "fips": "36081", "weight": 2253858/8336817}, # 2,253,858
166-
{"jhu_uid": 84036061, "fips": "36085", "weight": 476143/8336817}, # 476,143
194+
{
195+
"jhu_uid": 84036061,
196+
"fips": "36005",
197+
"weight": 1418207 / 8336817,
198+
}, # Population: 1,418,207
199+
{
200+
"jhu_uid": 84036061,
201+
"fips": "36047",
202+
"weight": 2559903 / 8336817,
203+
}, # 2,559,903
204+
{
205+
"jhu_uid": 84036061,
206+
"fips": "36061",
207+
"weight": 1628706 / 8336817,
208+
}, # 1,628,706
209+
{
210+
"jhu_uid": 84036061,
211+
"fips": "36081",
212+
"weight": 2253858 / 8336817,
213+
}, # 2,253,858
214+
{
215+
"jhu_uid": 84036061,
216+
"fips": "36085",
217+
"weight": 476143 / 8336817,
218+
}, # 476,143
167219
# Aggregate Utah into a "State FIPS"
168-
{'jhu_uid': 84070015, 'fips': "49000", 'weight': 1.},
169-
{'jhu_uid': 84070016, 'fips': "49000", 'weight': 1.},
170-
{'jhu_uid': 84070017, 'fips': "49000", 'weight': 1.},
171-
{'jhu_uid': 84070018, 'fips': "49000", 'weight': 1.},
172-
{'jhu_uid': 84070019, 'fips': "49000", 'weight': 1.},
173-
{'jhu_uid': 84070020, 'fips': "49000", 'weight': 1.}
220+
{"jhu_uid": 84070015, "fips": "49000", "weight": 1.0},
221+
{"jhu_uid": 84070016, "fips": "49000", "weight": 1.0},
222+
{"jhu_uid": 84070017, "fips": "49000", "weight": 1.0},
223+
{"jhu_uid": 84070018, "fips": "49000", "weight": 1.0},
224+
{"jhu_uid": 84070019, "fips": "49000", "weight": 1.0},
225+
{"jhu_uid": 84070020, "fips": "49000", "weight": 1.0},
174226
]
175227
)
176228

177-
jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
178-
jhu_df = jhu_df.query("Country_Region == 'US'")
179229
jhu_df = (
180-
jhu_df[["UID", "FIPS"]]
230+
pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
231+
.query("Country_Region == 'US'")[["UID", "FIPS"]]
181232
.rename(columns={"UID": "jhu_uid", "FIPS": "fips"})
182233
.dropna(subset=["fips"])
183234
)
@@ -186,7 +237,9 @@ def create_jhu_uid_fips_crosswalk():
186237
# These are Guam (66), Northern Mariana Islands (69), Virgin Islands (78),
187238
# and Puerto Rico (72).
188239
fips_st = jhu_df["fips"].str.len() <= 2
189-
jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, '0')
240+
jhu_df.loc[fips_st, "fips"] = (
241+
jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, "0")
242+
)
190243

191244
# Drop the JHU UIDs that were hand-modified
192245
dup_ind = jhu_df["jhu_uid"].isin(hand_additions["jhu_uid"].values)
@@ -206,22 +259,23 @@ def create_jhu_uid_fips_crosswalk():
206259

207260
def create_state_codes_crosswalk():
208261
"""Create the State ID -> State Name -> State code crosswalk file."""
209-
df = pd.read_csv(
210-
"http://www2.census.gov/geo/docs/reference/state.txt?#", delimiter="|"
211-
)
212-
df = df.drop(columns="STATENS").rename(
213-
columns={
214-
"STATE": "state_code",
215-
"STUSAB": "state_id",
216-
"STATE_NAME": "state_name",
217-
}
262+
df = (
263+
pd.read_csv(STATE_CODES_URL, delimiter="|")
264+
.drop(columns="STATENS")
265+
.rename(
266+
columns={
267+
"STATE": "state_code",
268+
"STUSAB": "state_id",
269+
"STATE_NAME": "state_name",
270+
}
271+
)
218272
)
219273
df["state_code"] = df["state_code"].astype(str).str.zfill(2)
220274
df.to_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), index=False)
221275

222276

223277
def derive_fips_hrr_crosswalk():
224-
"""Derives a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR
278+
"""Derives a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR
225279
from the crosswalk files made by the functions above."""
226280
if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
227281
create_fips_zip_crosswalk()
@@ -235,22 +289,17 @@ def derive_fips_hrr_crosswalk():
235289
)
236290
zh_df = pd.read_csv(
237291
join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME),
238-
dtype={"fips": str, "zip": str, "weight": float},
292+
dtype={"zip": str, "hrr": str},
239293
)
240294

241-
df = fz_df.join(zh_df.set_index("zip"), on="zip")
242-
df = df.drop(columns="zip")
243-
df = df.reset_index().set_index(["fips", "hrr"])
244-
df = df.groupby(["hrr"], as_index=False).apply(
245-
lambda g: g["weight"] / g["weight"].sum()
295+
(
296+
fz_df.merge(zh_df, on="zip", how="left")
297+
.drop(columns="zip")
298+
.groupby(["fips", "hrr"])
299+
.sum()
300+
.reset_index()
301+
.to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
246302
)
247-
df = df.reset_index(level=["fips", "hrr"])
248-
249-
# Cast back to str
250-
df["hrr"] = df["hrr"].astype(int).astype(str)
251-
df["fips"] = df["fips"].astype(str).str.zfill(5)
252-
253-
df.to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
254303

255304

256305
def derive_fips_state_crosswalk():
@@ -263,34 +312,38 @@ def derive_fips_state_crosswalk():
263312
)
264313

265314
fips_pop["state_code"] = fips_pop["fips"].str[:2]
266-
fips_pop = fips_pop.merge(state_codes, on="state_code", how="left")
267-
fips_pop = fips_pop.drop(columns="pop")
268-
269-
fips_pop.to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
315+
(
316+
fips_pop.merge(state_codes, on="state_code", how="left")
317+
.drop(columns="pop")
318+
.to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
319+
)
270320

271321

272322
def derive_zip_msa_crosswalk():
273-
"""Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> MSA
323+
"""Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> MSA
274324
from the crosswalk files made by the functions above."""
275325
if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
276326
create_fips_zip_crosswalk()
277327

278328
if not isfile(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME)):
279329
create_fips_msa_crosswalk()
280330

281-
zf_df = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME))
282-
fm_df = pd.read_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME))
331+
zf_df = pd.read_csv(
332+
join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME),
333+
dtype={"zip": str, "fips": str, "weight": float},
334+
)
335+
fm_df = pd.read_csv(
336+
join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), dtype={"fips": str, "msa": str}
337+
)
283338

284-
df = zf_df.join(fm_df.set_index("fips"), on="fips")
285-
df = df.drop(columns="fips")
286-
df = df.set_index(["zip", "msa"])
287-
df = df.groupby(["msa"], as_index=False).apply(
288-
lambda g: g["weight"] / g["weight"].sum()
339+
(
340+
zf_df.merge(fm_df, on="fips")
341+
.drop(columns="fips")
342+
.groupby(["msa", "zip"])
343+
.sum()
344+
.reset_index()
345+
.to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
289346
)
290-
df = df.reset_index(level=["zip", "msa"])
291-
df["zip"] = df["zip"].astype(str).str.zfill(5)
292-
df["msa"] = df["msa"].astype(int).astype(str)
293-
df.to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
294347

295348

296349
def derive_zip_to_state_code():
@@ -306,10 +359,13 @@ def derive_zip_to_state_code():
306359
zf_cf = pd.read_csv(
307360
join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str}
308361
)
362+
309363
zf_cf["state_code"] = zf_cf["fips"].str[:2]
310-
df = zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
311-
df = df.drop(columns=["fips"])
312-
df.to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
364+
(
365+
zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
366+
.drop(columns=["fips"])
367+
.to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
368+
)
313369

314370

315371
if __name__ == "__main__":

0 commit comments

Comments
 (0)