Skip to content

Commit cdce10e

Browse files
committed
Fix JHU UID, fix tests
1 parent ccc7406 commit cdce10e

File tree

4 files changed: +261 additions, -236 deletions

4 files changed: +261 additions, -236 deletions

_delphi_utils_python/data_proc/geomap/geo_data_proc.py

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -164,65 +164,71 @@ def create_jhu_uid_fips_crosswalk():
164164
[
165165
# Split aggregation of Dukes and Nantucket, Massachusetts
166166
{
167-
"jhu_uid": 84070002,
167+
"jhu_uid": "84070002",
168168
"fips": "25007",
169169
"weight": 16535 / (16535 + 10172),
170170
}, # Population: 16535
171171
{
172-
"jhu_uid": 84070002,
172+
"jhu_uid": "84070002",
173173
"fips": "25019",
174174
"weight": 10172 / (16535 + 10172),
175175
}, # 10172
176176
# Kansas City, Missouri
177177
{
178-
"jhu_uid": 84070003,
178+
"jhu_uid": "84070003",
179179
"fips": "29095",
180180
"weight": 674158 / 1084897,
181181
}, # Population: 674158
182-
{"jhu_uid": 84070003, "fips": "29165", "weight": 89322 / 1084897}, # 89322
183-
{"jhu_uid": 84070003, "fips": "29037", "weight": 99478 / 1084897}, # 99478
182+
{"jhu_uid": "84070003", "fips": "29165", "weight": 89322 / 1084897}, # 89322
183+
{"jhu_uid": "84070003", "fips": "29037", "weight": 99478 / 1084897}, # 99478
184184
{
185-
"jhu_uid": 84070003,
185+
"jhu_uid": "84070003",
186186
"fips": "29047",
187187
"weight": 221939 / 1084897,
188188
}, # 221939
189189
# Kusilvak, Alaska
190-
{"jhu_uid": 84002158, "fips": "02270", "weight": 1.0},
190+
{"jhu_uid": "84002158", "fips": "02270", "weight": 1.0},
191191
# Oglala Lakota
192-
{"jhu_uid": 84046102, "fips": "46113", "weight": 1.0},
192+
{"jhu_uid": "84046102", "fips": "46113", "weight": 1.0},
193193
# Split aggregation of New York County (populations from JHU documentation)
194194
{
195-
"jhu_uid": 84036061,
195+
"jhu_uid": "84036061",
196196
"fips": "36005",
197197
"weight": 1418207 / 8336817,
198198
}, # Population: 1,418,207
199199
{
200-
"jhu_uid": 84036061,
200+
"jhu_uid": "84036061",
201201
"fips": "36047",
202202
"weight": 2559903 / 8336817,
203203
}, # 2,559,903
204204
{
205-
"jhu_uid": 84036061,
205+
"jhu_uid": "84036061",
206206
"fips": "36061",
207207
"weight": 1628706 / 8336817,
208208
}, # 1,628,706
209209
{
210-
"jhu_uid": 84036061,
210+
"jhu_uid": "84036061",
211211
"fips": "36081",
212212
"weight": 2253858 / 8336817,
213213
}, # 2,253,858
214214
{
215-
"jhu_uid": 84036061,
215+
"jhu_uid": "84036061",
216216
"fips": "36085",
217217
"weight": 476143 / 8336817,
218218
}, # 476,143
219219
# Aggregate Utah into a "State FIPS"
220-
{"jhu_uid": 84070015, "fips": "49000", "weight": 1.0},
221-
{"jhu_uid": 84070016, "fips": "49000", "weight": 1.0},
222-
{"jhu_uid": 84070017, "fips": "49000", "weight": 1.0},
223-
{"jhu_uid": 84070018, "fips": "49000", "weight": 1.0},
224-
{"jhu_uid": 84070019, "fips": "49000", "weight": 1.0},
225-
{"jhu_uid": 84070020, "fips": "49000", "weight": 1.0},
220+
{"jhu_uid": "84070015", "fips": "49000", "weight": 1.0},
221+
{"jhu_uid": "84070016", "fips": "49000", "weight": 1.0},
222+
{"jhu_uid": "84070017", "fips": "49000", "weight": 1.0},
223+
{"jhu_uid": "84070018", "fips": "49000", "weight": 1.0},
224+
{"jhu_uid": "84070019", "fips": "49000", "weight": 1.0},
225+
{"jhu_uid": "84070020", "fips": "49000", "weight": 1.0},
226+
]
227+
)
228+
unassigned_states = pd.DataFrame(
229+
[
230+
# Map the Unassigned states FIPS to XX000
231+
{"jhu_uid": str(x), "fips": str(x)[-2:].ljust(5, "0"), "weight": 1.0} for x in range(84090001, 84090099)
226232
]
227233
)
228234

@@ -234,25 +240,25 @@ def create_jhu_uid_fips_crosswalk():
234240
)
235241

236242
# FIPS Codes that are just two digits long should be zero filled on the right.
237-
# These are Guam (66), Northern Mariana Islands (69), Virgin Islands (78),
238-
# and Puerto Rico (72).
243+
# These are US states and territories Guam (66), Northern Mariana Islands (69),
244+
# Virgin Islands (78), and Puerto Rico (72).
239245
fips_st = jhu_df["fips"].str.len() <= 2
240246
jhu_df.loc[fips_st, "fips"] = (
241-
jhu_df.loc[fips_st, "fips"].astype(str).str.ljust(5, "0")
247+
jhu_df.loc[fips_st, "fips"].str.ljust(5, "0")
242248
)
243249

244-
# Drop the JHU UIDs that were hand-modified
245-
dup_ind = jhu_df["jhu_uid"].isin(hand_additions["jhu_uid"].values)
250+
# Drop the FIPS codes in JHU that were hand-modified
251+
dup_ind = jhu_df["fips"].isin(hand_additions["fips"].values) | jhu_df["fips"].isin(["02158", "46102"])
246252
jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
247253

248-
# Drop the FIPS codes in JHU that were hand-modified
249-
dup_ind = jhu_df["fips"].isin(hand_additions["fips"].values)
254+
# Drop the JHU UIDs that were hand-modified
255+
dup_ind = jhu_df["jhu_uid"].isin(hand_additions["jhu_uid"].values) | jhu_df["jhu_uid"].isin(unassigned_states["jhu_uid"].values)
250256
jhu_df.drop(jhu_df.index[dup_ind], inplace=True)
251257

252258
# Add weights of 1.0 to everything not in hand additions, then merge in hand-additions
253259
# Finally, zero fill FIPS
254260
jhu_df["weight"] = 1.0
255-
jhu_df = pd.concat((jhu_df, hand_additions))
261+
jhu_df = pd.concat((jhu_df, hand_additions, unassigned_states))
256262
jhu_df["fips"] = jhu_df["fips"].astype(int).astype(str).str.zfill(5)
257263
jhu_df.to_csv(join(OUTPUT_DIR, JHU_FIPS_OUT_FILENAME), index=False)
258264

_delphi_utils_python/delphi_utils/data/jhu_uid_fips_table.csv

Lines changed: 98 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -187,57 +187,6 @@ jhu_uid,fips,weight
187187
84080054,80054,1.0
188188
84080055,80055,1.0
189189
84080056,80056,1.0
190-
84090001,90001,1.0
191-
84090002,90002,1.0
192-
84090004,90004,1.0
193-
84090005,90005,1.0
194-
84090006,90006,1.0
195-
84090008,90008,1.0
196-
84090009,90009,1.0
197-
84090010,90010,1.0
198-
84090011,90011,1.0
199-
84090012,90012,1.0
200-
84090013,90013,1.0
201-
84090015,90015,1.0
202-
84090016,90016,1.0
203-
84090017,90017,1.0
204-
84090018,90018,1.0
205-
84090019,90019,1.0
206-
84090020,90020,1.0
207-
84090021,90021,1.0
208-
84090022,90022,1.0
209-
84090023,90023,1.0
210-
84090024,90024,1.0
211-
84090025,90025,1.0
212-
84090026,90026,1.0
213-
84090027,90027,1.0
214-
84090028,90028,1.0
215-
84090029,90029,1.0
216-
84090030,90030,1.0
217-
84090031,90031,1.0
218-
84090032,90032,1.0
219-
84090033,90033,1.0
220-
84090034,90034,1.0
221-
84090035,90035,1.0
222-
84090036,90036,1.0
223-
84090037,90037,1.0
224-
84090038,90038,1.0
225-
84090039,90039,1.0
226-
84090040,90040,1.0
227-
84090041,90041,1.0
228-
84090042,90042,1.0
229-
84090044,90044,1.0
230-
84090045,90045,1.0
231-
84090046,90046,1.0
232-
84090047,90047,1.0
233-
84090048,90048,1.0
234-
84090049,90049,1.0
235-
84090050,90050,1.0
236-
84090051,90051,1.0
237-
84090053,90053,1.0
238-
84090054,90054,1.0
239-
84090055,90055,1.0
240-
84090056,90056,1.0
241190
84001001,01001,1.0
242191
84001003,01003,1.0
243192
84001005,01005,1.0
@@ -319,7 +268,6 @@ jhu_uid,fips,weight
319268
84002122,02122,1.0
320269
84002130,02130,1.0
321270
84002150,02150,1.0
322-
84002158,02158,1.0
323271
84002164,02164,1.0
324272
84002170,02170,1.0
325273
84002180,02180,1.0
@@ -2639,7 +2587,6 @@ jhu_uid,fips,weight
26392587
84046097,46097,1.0
26402588
84046099,46099,1.0
26412589
84046101,46101,1.0
2642-
84046102,46102,1.0
26432590
84046103,46103,1.0
26442591
84046105,46105,1.0
26452592
84046107,46107,1.0
@@ -3388,3 +3335,101 @@ jhu_uid,fips,weight
33883335
84070018,49000,1.0
33893336
84070019,49000,1.0
33903337
84070020,49000,1.0
3338+
84090001,01000,1.0
3339+
84090002,02000,1.0
3340+
84090003,03000,1.0
3341+
84090004,04000,1.0
3342+
84090005,05000,1.0
3343+
84090006,06000,1.0
3344+
84090007,07000,1.0
3345+
84090008,08000,1.0
3346+
84090009,09000,1.0
3347+
84090010,10000,1.0
3348+
84090011,11000,1.0
3349+
84090012,12000,1.0
3350+
84090013,13000,1.0
3351+
84090014,14000,1.0
3352+
84090015,15000,1.0
3353+
84090016,16000,1.0
3354+
84090017,17000,1.0
3355+
84090018,18000,1.0
3356+
84090019,19000,1.0
3357+
84090020,20000,1.0
3358+
84090021,21000,1.0
3359+
84090022,22000,1.0
3360+
84090023,23000,1.0
3361+
84090024,24000,1.0
3362+
84090025,25000,1.0
3363+
84090026,26000,1.0
3364+
84090027,27000,1.0
3365+
84090028,28000,1.0
3366+
84090029,29000,1.0
3367+
84090030,30000,1.0
3368+
84090031,31000,1.0
3369+
84090032,32000,1.0
3370+
84090033,33000,1.0
3371+
84090034,34000,1.0
3372+
84090035,35000,1.0
3373+
84090036,36000,1.0
3374+
84090037,37000,1.0
3375+
84090038,38000,1.0
3376+
84090039,39000,1.0
3377+
84090040,40000,1.0
3378+
84090041,41000,1.0
3379+
84090042,42000,1.0
3380+
84090043,43000,1.0
3381+
84090044,44000,1.0
3382+
84090045,45000,1.0
3383+
84090046,46000,1.0
3384+
84090047,47000,1.0
3385+
84090048,48000,1.0
3386+
84090049,49000,1.0
3387+
84090050,50000,1.0
3388+
84090051,51000,1.0
3389+
84090052,52000,1.0
3390+
84090053,53000,1.0
3391+
84090054,54000,1.0
3392+
84090055,55000,1.0
3393+
84090056,56000,1.0
3394+
84090057,57000,1.0
3395+
84090058,58000,1.0
3396+
84090059,59000,1.0
3397+
84090060,60000,1.0
3398+
84090061,61000,1.0
3399+
84090062,62000,1.0
3400+
84090063,63000,1.0
3401+
84090064,64000,1.0
3402+
84090065,65000,1.0
3403+
84090066,66000,1.0
3404+
84090067,67000,1.0
3405+
84090068,68000,1.0
3406+
84090069,69000,1.0
3407+
84090070,70000,1.0
3408+
84090071,71000,1.0
3409+
84090072,72000,1.0
3410+
84090073,73000,1.0
3411+
84090074,74000,1.0
3412+
84090075,75000,1.0
3413+
84090076,76000,1.0
3414+
84090077,77000,1.0
3415+
84090078,78000,1.0
3416+
84090079,79000,1.0
3417+
84090080,80000,1.0
3418+
84090081,81000,1.0
3419+
84090082,82000,1.0
3420+
84090083,83000,1.0
3421+
84090084,84000,1.0
3422+
84090085,85000,1.0
3423+
84090086,86000,1.0
3424+
84090087,87000,1.0
3425+
84090088,88000,1.0
3426+
84090089,89000,1.0
3427+
84090090,90000,1.0
3428+
84090091,91000,1.0
3429+
84090092,92000,1.0
3430+
84090093,93000,1.0
3431+
84090094,94000,1.0
3432+
84090095,95000,1.0
3433+
84090096,96000,1.0
3434+
84090097,97000,1.0
3435+
84090098,98000,1.0

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ def add_new_code(self, df, from_code, new_code, from_col=None, new_col=None):
224224

225225
if not is_string_dtype(df[from_col]):
226226
if from_code in ["fips", "zip"]:
227-
df[from_col] = df[from_col].astype(str).zfill(5)
227+
df[from_col] = df[from_col].astype(str).str.zfill(5)
228228
else:
229229
df[from_col] = df[from_col].astype(str)
230230

231231
crosswalk = self.load_crosswalk(from_code=from_code, to_code=new_code)
232232
crosswalk = crosswalk.rename(columns={from_code: from_col, new_code: new_col})
233233

234-
df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left")
234+
df = df.merge(crosswalk, left_on=from_col, right_on=from_col, how="left").dropna(subset=[new_col])
235235

236236
# Drop extra state columns
237237
state_codes = ["state_code", "state_id", "state_name"]
@@ -279,8 +279,7 @@ def convert_to_new_code(
279279

280280
df = self.add_new_code(
281281
df, from_code, new_code, from_col=from_col, new_col=new_col
282-
)
283-
df.drop(columns=from_col, inplace=True)
282+
).drop(columns=from_col)
284283

285284
if "weight" in df.columns:
286285
if data_cols is None:
@@ -327,7 +326,7 @@ def convert_fips_to_state_code(
327326
data = data.copy()
328327

329328
if not is_string_dtype(data[fips_col]):
330-
data[fips_col] = data[fips_col].astype(str).zfill(5)
329+
data[fips_col] = data[fips_col].astype(str).str.zfill(5)
331330

332331
# Take the first two digits of the FIPS code
333332
data[state_code_col] = data[fips_col].str[:2]
@@ -425,8 +424,7 @@ def convert_fips_to_zip(
425424
data[fips_col] = data[fips_col].astype(str).str.zfill(5)
426425

427426
cross = df.rename(columns={"zip": zip_col, "weight": weight_col})
428-
data = data.merge(cross, left_on=fips_col, right_on="fips", how="left")
429-
427+
data = data.merge(cross, left_on=fips_col, right_on="fips", how="left").dropna(subset=[zip_col])
430428
return data
431429

432430
def convert_state_code_to_state_id(
@@ -617,9 +615,8 @@ def fips_to_state_id(
617615
data = self.convert_fips_to_state_id(
618616
data, fips_col=fips_col, state_id_col=state_id_col
619617
)
620-
data.dropna(subset=["state_code"], axis=0, inplace=True)
621-
data.drop([fips_col, "state_code"], axis=1, inplace=True)
622-
data = data.groupby([date_col, state_id_col], dropna=False).sum()
618+
# data.drop([fips_col, "state_code"], axis=1, inplace=True)
619+
data = data.groupby([date_col, state_id_col]).sum()
623620
return data.reset_index()
624621

625622
def fips_to_msa(
@@ -780,7 +777,7 @@ def zip_to_hrr(
780777

781778
data = data[[zip_col, date_col] + count_cols].copy()
782779
data = self.convert_zip_to_hrr(data, zip_col=zip_col, hrr_col=hrr_col)
783-
data = data.groupby([date_col, hrr_col], dropna=False).sum()
780+
data = data.groupby([date_col, hrr_col]).sum()
784781
return data.reset_index()
785782

786783
def convert_jhu_uid_to_fips(

0 commit comments

Comments (0)