- """Needed to process the geo files to get from xls file to a simpler csv.
- pip install xlrd
-
+ """
Author: James Sharpnack @jsharpna
Refactored by: Dmitry Shemetov @dshemetov
"""
@@ -54,45 +52,57 @@ def create_fips_zip_crosswalk():
    pop_df = pd.read_csv(FIPS_BY_ZIP_POP_URL)

    # Create the FIPS column by combining the state and county codes
-     pop_df["fips"] = pop_df["STATE"].astype(str).str.zfill(2) + pop_df["COUNTY"].astype(
-         str
-     ).str.zfill(3)
+     state_codes = pop_df["STATE"].astype(str).str.zfill(2)
+     county_codes = pop_df["COUNTY"].astype(str).str.zfill(3)
+     pop_df["fips"] = state_codes + county_codes

    # Create the ZIP column by adding leading zeros to the ZIP
    pop_df["zip"] = pop_df["ZCTA5"].astype(str).str.zfill(5)

    # Pare down the dataframe to just the relevant columns: zip, fips, and population
    pop_df = pop_df[["zip", "fips", "POPPT"]].rename(columns={"POPPT": "pop"})

-     # Find the populations by FIPS and ZIP
-     pop_fips = pop_df[["fips", "pop"]].groupby("fips").sum()
-     pop_zip = pop_df[["zip", "pop"]].groupby("zip").sum()
-     pop_fips.to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
-     pop_zip.to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
+     # Find the populations by FIPS and ZIP and write them to files
+     (
+         pop_df[["fips", "pop"]]
+         .groupby("fips")
+         .sum()
+         .to_csv(join(OUTPUT_DIR, FIPS_POPULATION_OUT_FILENAME))
+     )
+     (
+         pop_df[["zip", "pop"]]
+         .groupby("zip")
+         .sum()
+         .to_csv(join(OUTPUT_DIR, ZIP_POPULATION_OUT_FILENAME))
+     )

    # Find the population fractions (the heaviest computation, takes about a minute)
-     # Note that the denominator in the fractions is the target code population
+     # Note that the denominator in the fractions is the source population
    pop_df.set_index(["fips", "zip"], inplace=True)
-     fips_zip = pop_df.groupby("zip", as_index=False).apply(
+     fips_zip = pop_df.groupby("fips", as_index=False).apply(
        lambda g: g["pop"] / g["pop"].sum()
    )
-     zip_fips = pop_df.groupby("fips", as_index=False).apply(
+     zip_fips = pop_df.groupby("zip", as_index=False).apply(
        lambda g: g["pop"] / g["pop"].sum()
    )
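+     # fips_zip weights now sum to 1 within each FIPS (its population share per
+     # ZIP); zip_fips weights sum to 1 within each ZIP.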

    # Rename and write to file
-     fips_zip.reset_index(level=["fips", "zip"]).rename(
-         columns={"pop": "weight"}
-     ).to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
-     zip_fips.reset_index(level=["fips", "zip"]).rename(
-         columns={"pop": "weight"}
-     ).to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
+     (
+         fips_zip.reset_index(level=["fips", "zip"])
+         .rename(columns={"pop": "weight"})
+         .to_csv(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME), index=False)
+     )
+     (
+         zip_fips.reset_index(level=["fips", "zip"])
+         .rename(columns={"pop": "weight"})
+         .to_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), index=False)
+     )


def create_zip_hsa_hrr_crosswalk():
    """Creates the crosswalk table from ZIP to HSA and from ZIP to HRR from source."""
-     zipped_csv = BytesIO(requests.get(ZIP_HSA_HRR_URL).content)
-     zip_df = pd.read_csv(ZipFile(zipped_csv).open(ZIP_HSA_HRR_FILENAME))
+     zipped_csv = ZipFile(BytesIO(requests.get(ZIP_HSA_HRR_URL).content))
+     zip_df = pd.read_csv(zipped_csv.open(ZIP_HSA_HRR_FILENAME))
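+     # The member CSV is read straight from the in-memory zip archive.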

    # Build the HSA table
    hsa_df = zip_df[["zipcode18", "hsanum"]].rename(
@@ -122,8 +132,13 @@ def create_fips_msa_crosswalk():
        "FIPS State Code": str,
        "FIPS County Code": str,
    }
-     msa_df = pd.read_excel(
-         FIPS_MSA_URL, skiprows=2, skipfooter=4, usecols=msa_cols.keys(), dtype=msa_cols,
+     # The following line requires the xlrd package.
+     msa_df = pd.read_excel(
+         FIPS_MSA_URL,
+         skiprows=2,
+         skipfooter=4,
+         usecols=msa_cols.keys(),
+         dtype=msa_cols,
    )

    metro_bool = (
@@ -134,9 +149,10 @@ def create_fips_msa_crosswalk():

    # Combine state and county codes into a single FIPS code
    msa_df["fips"] = msa_df["FIPS State Code"].str.cat(msa_df["FIPS County Code"])
-     msa_df.rename(columns={"CBSA Code": "msa"}, inplace=True)
-     msa_df = msa_df[["fips", "msa"]]
-     msa_df.to_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False)
+
+     msa_df.rename(columns={"CBSA Code": "msa"})[["fips", "msa"]].to_csv(
+         join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), index=False
+     )


def create_jhu_uid_fips_crosswalk():
@@ -147,37 +163,72 @@ def create_jhu_uid_fips_crosswalk():
    hand_additions = pd.DataFrame(
        [
            # Split aggregation of Dukes and Nantucket, Massachusetts
-             {"jhu_uid": 84070002, "fips": "25007", "weight": 16535 / (16535 + 10172)},  # Population: 16535
-             {"jhu_uid": 84070002, "fips": "25019", "weight": 10172 / (16535 + 10172)},  # 10172
+             {
+                 "jhu_uid": 84070002,
+                 "fips": "25007",
+                 "weight": 16535 / (16535 + 10172),
+             },  # Population: 16535
+             {
+                 "jhu_uid": 84070002,
+                 "fips": "25019",
+                 "weight": 10172 / (16535 + 10172),
+             },  # 10172
            # Kansas City, Missouri
-             {"jhu_uid": 84070003, "fips": "29095", "weight": 674158 / 1084897},  # Population: 674158
-             {"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897},  # 89322
-             {"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897},  # 99478
-             {"jhu_uid": 84070003, "fips": "29047", "weight": 221939 / 1084897},  # 221939
+             {
+                 "jhu_uid": 84070003,
+                 "fips": "29095",
+                 "weight": 674158 / 1084897,
+             },  # Population: 674158
+             {"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897},  # 89322
+             {"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897},  # 99478
+             {
+                 "jhu_uid": 84070003,
+                 "fips": "29047",
+                 "weight": 221939 / 1084897,
+             },  # 221939
            # Kusilvak, Alaska
            {"jhu_uid": 84002158, "fips": "02270", "weight": 1.0},
            # Oglala Lakota
            {"jhu_uid": 84046102, "fips": "46113", "weight": 1.0},
            # Split aggregation of New York County (populations from JHU documentation)
-             {"jhu_uid": 84036061, "fips": "36005", "weight": 1418207 / 8336817},  # Population: 1,418,207
-             {"jhu_uid": 84036061, "fips": "36047", "weight": 2559903 / 8336817},  # 2,559,903
-             {"jhu_uid": 84036061, "fips": "36061", "weight": 1628706 / 8336817},  # 1,628,706
-             {"jhu_uid": 84036061, "fips": "36081", "weight": 2253858 / 8336817},  # 2,253,858
-             {"jhu_uid": 84036061, "fips": "36085", "weight": 476143 / 8336817},  # 476,143
+             {
+                 "jhu_uid": 84036061,
+                 "fips": "36005",
+                 "weight": 1418207 / 8336817,
+             },  # Population: 1,418,207
+             {
+                 "jhu_uid": 84036061,
+                 "fips": "36047",
+                 "weight": 2559903 / 8336817,
+             },  # 2,559,903
+             {
+                 "jhu_uid": 84036061,
+                 "fips": "36061",
+                 "weight": 1628706 / 8336817,
+             },  # 1,628,706
+             {
+                 "jhu_uid": 84036061,
+                 "fips": "36081",
+                 "weight": 2253858 / 8336817,
+             },  # 2,253,858
+             {
+                 "jhu_uid": 84036061,
+                 "fips": "36085",
+                 "weight": 476143 / 8336817,
+             },  # 476,143
            # Aggregate Utah into a "State FIPS"
-             {'jhu_uid': 84070015, 'fips': "49000", 'weight': 1.},
-             {'jhu_uid': 84070016, 'fips': "49000", 'weight': 1.},
-             {'jhu_uid': 84070017, 'fips': "49000", 'weight': 1.},
-             {'jhu_uid': 84070018, 'fips': "49000", 'weight': 1.},
-             {'jhu_uid': 84070019, 'fips': "49000", 'weight': 1.},
-             {'jhu_uid': 84070020, 'fips': "49000", 'weight': 1.}
+             {"jhu_uid": 84070015, "fips": "49000", "weight": 1.0},
+             {"jhu_uid": 84070016, "fips": "49000", "weight": 1.0},
+             {"jhu_uid": 84070017, "fips": "49000", "weight": 1.0},
+             {"jhu_uid": 84070018, "fips": "49000", "weight": 1.0},
+             {"jhu_uid": 84070019, "fips": "49000", "weight": 1.0},
+             {"jhu_uid": 84070020, "fips": "49000", "weight": 1.0},
        ]
    )

-     jhu_df = pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
-     jhu_df = jhu_df.query("Country_Region == 'US'")
    jhu_df = (
-         jhu_df[["UID", "FIPS"]]
+         pd.read_csv(JHU_FIPS_URL, dtype={"UID": str, "FIPS": str})
+         .query("Country_Region == 'US'")[["UID", "FIPS"]]
        .rename(columns={"UID": "jhu_uid", "FIPS": "fips"})
        .dropna(subset=["fips"])
    )
@@ -186,7 +237,9 @@ def create_jhu_uid_fips_crosswalk():
    # These are Guam (66), Northern Mariana Islands (69), Virgin Islands (78),
    # and Puerto Rico (72).
    fips_st = jhu_df["fips"].str.len() <= 2
-     jhu_df.loc[fips_st, "fips"] = jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, '0')
+     jhu_df.loc[fips_st, "fips"] = (
+         jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, "0")
+     )
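+     # e.g. the two-digit state code "66" is right-padded to the 5-digit "66000"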

    # Drop the JHU UIDs that were hand-modified
    dup_ind = jhu_df["jhu_uid"].isin(hand_additions["jhu_uid"].values)
@@ -206,22 +259,23 @@ def create_jhu_uid_fips_crosswalk():

def create_state_codes_crosswalk():
    """Create the State ID -> State Name -> State code crosswalk file."""
-     df = pd.read_csv(
-         "http://www2.census.gov/geo/docs/reference/state.txt?#", delimiter="|"
-     )
-     df = df.drop(columns="STATENS").rename(
-         columns={
-             "STATE": "state_code",
-             "STUSAB": "state_id",
-             "STATE_NAME": "state_name",
-         }
+     df = (
+         pd.read_csv(STATE_CODES_URL, delimiter="|")
+         .drop(columns="STATENS")
+         .rename(
+             columns={
+                 "STATE": "state_code",
+                 "STUSAB": "state_id",
+                 "STATE_NAME": "state_name",
+             }
+         )
    )
    df["state_code"] = df["state_code"].astype(str).str.zfill(2)
    df.to_csv(join(OUTPUT_DIR, STATE_OUT_FILENAME), index=False)


def derive_fips_hrr_crosswalk():
-     """Derives a crosswalk file from FIPS to HRR through FIPZ -> ZIP -> HRR
+     """Derives a crosswalk file from FIPS to HRR through FIPS -> ZIP -> HRR
    from the crosswalk files made by the functions above."""
    if not isfile(join(OUTPUT_DIR, FIPS_ZIP_OUT_FILENAME)):
        create_fips_zip_crosswalk()
@@ -235,22 +289,17 @@ def derive_fips_hrr_crosswalk():
    )
    zh_df = pd.read_csv(
        join(OUTPUT_DIR, ZIP_HRR_OUT_FILENAME),
-         dtype={"fips": str, "zip": str, "weight": float},
+         dtype={"zip": str, "hrr": str},
    )

-     df = fz_df.join(zh_df.set_index("zip"), on="zip")
-     df = df.drop(columns="zip")
-     df = df.reset_index().set_index(["fips", "hrr"])
-     df = df.groupby(["hrr"], as_index=False).apply(
-         lambda g: g["weight"] / g["weight"].sum()
+     (
+         fz_df.merge(zh_df, on="zip", how="left")
+         .drop(columns="zip")
+         .groupby(["fips", "hrr"])
+         .sum()
+         .reset_index()
+         .to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
    )
-     df = df.reset_index(level=["fips", "hrr"])
-
-     # Cast back to str
-     df["hrr"] = df["hrr"].astype(int).astype(str)
-     df["fips"] = df["fips"].astype(str).str.zfill(5)
-
-     df.to_csv(join(OUTPUT_DIR, FIPS_HRR_OUT_FILENAME), index=False)
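+     # Each ZIP maps to a single HRR in the source crosswalk, so summing the
+     # fips -> zip weights within each (fips, hrr) pair yields fips -> hrr weights.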


def derive_fips_state_crosswalk():
@@ -263,34 +312,38 @@ def derive_fips_state_crosswalk():
    )

    fips_pop["state_code"] = fips_pop["fips"].str[:2]
-     fips_pop = fips_pop.merge(state_codes, on="state_code", how="left")
-     fips_pop = fips_pop.drop(columns="pop")
-
-     fips_pop.to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
+     (
+         fips_pop.merge(state_codes, on="state_code", how="left")
+         .drop(columns="pop")
+         .to_csv(join(OUTPUT_DIR, FIPS_STATE_OUT_FILENAME), index=False)
+     )


def derive_zip_msa_crosswalk():
-     """Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> HRR
+     """Derives a crosswalk file from ZIP to MSA through ZIP -> FIPS -> MSA
    from the crosswalk files made by the functions above."""
    if not isfile(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME)):
        create_fips_zip_crosswalk()

    if not isfile(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME)):
        create_fips_msa_crosswalk()

-     zf_df = pd.read_csv(join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME))
-     fm_df = pd.read_csv(join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME))
+     zf_df = pd.read_csv(
+         join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME),
+         dtype={"zip": str, "fips": str, "weight": float},
+     )
+     fm_df = pd.read_csv(
+         join(OUTPUT_DIR, FIPS_MSA_OUT_FILENAME), dtype={"fips": str, "msa": str}
+     )

-     df = zf_df.join(fm_df.set_index("fips"), on="fips")
-     df = df.drop(columns="fips")
-     df = df.set_index(["zip", "msa"])
-     df = df.groupby(["msa"], as_index=False).apply(
-         lambda g: g["weight"] / g["weight"].sum()
+     (
+         zf_df.merge(fm_df, on="fips")
+         .drop(columns="fips")
+         .groupby(["msa", "zip"])
+         .sum()
+         .reset_index()
+         .to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
    )
-     df = df.reset_index(level=["zip", "msa"])
-     df["zip"] = df["zip"].astype(str).str.zfill(5)
-     df["msa"] = df["msa"].astype(int).astype(str)
-     df.to_csv(join(OUTPUT_DIR, ZIP_MSA_OUT_FILENAME), index=False)
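+     # Reading zip/fips/msa as str preserves leading zeros, so no re-padding
+     # (zfill/astype casts) is needed after the groupby.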


def derive_zip_to_state_code():
@@ -306,10 +359,13 @@ def derive_zip_to_state_code():
    zf_cf = pd.read_csv(
        join(OUTPUT_DIR, ZIP_FIPS_OUT_FILENAME), dtype={"zip": str, "fips": str}
    )
+
    zf_cf["state_code"] = zf_cf["fips"].str[:2]
-     df = zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
-     df = df.drop(columns=["fips"])
-     df.to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
+     (
+         zf_cf.merge(sdf, left_on="state_code", right_on="state_code", how="left")
+         .drop(columns=["fips"])
+         .to_csv(join(OUTPUT_DIR, ZIP_STATE_CODE_OUT_FILENAME), index=False)
+     )


if __name__ == "__main__":