Skip to content

Commit 7aa5300

Browse files
neelmraman, feefladder
authored and committed
BUG: json_normalize not consistently ignoring errors (pandas-dev#41876) (pandas-dev#42179)
1 parent d508a49 commit 7aa5300

File tree

3 files changed

+67
-20
lines changed

3 files changed

+67
-20
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -233,6 +233,7 @@ MultiIndex
233233
I/O
234234
^^^
235235
- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
236+
- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
236237
- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
237238
-
238239

pandas/io/json/_normalize.py

+25-17
Original file line number | Diff line number | Diff line change
@@ -380,14 +380,31 @@ def _json_normalize(
380380
Returns normalized data with columns prefixed with the given string.
381381
"""
382382

383-
def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable:
383+
def _pull_field(
384+
js: dict[str, Any], spec: list | str, extract_record: bool = False
385+
) -> Scalar | Iterable:
384386
"""Internal function to pull field"""
385387
result = js
386-
if isinstance(spec, list):
387-
for field in spec:
388-
result = result[field]
389-
else:
390-
result = result[spec]
388+
try:
389+
if isinstance(spec, list):
390+
for field in spec:
391+
result = result[field]
392+
else:
393+
result = result[spec]
394+
except KeyError as e:
395+
if extract_record:
396+
raise KeyError(
397+
f"Key {e} not found. If specifying a record_path, all elements of "
398+
f"data should have the path."
399+
) from e
400+
elif errors == "ignore":
401+
return np.nan
402+
else:
403+
raise KeyError(
404+
f"Key {e} not found. To replace missing values of {e} with "
405+
f"np.nan, pass in errors='ignore'"
406+
) from e
407+
391408
return result
392409

393410
def _pull_records(js: dict[str, Any], spec: list | str) -> list:
@@ -396,7 +413,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
396413
_pull_field, but require to return list. And will raise error
397414
if has non iterable value.
398415
"""
399-
result = _pull_field(js, spec)
416+
result = _pull_field(js, spec, extract_record=True)
400417

401418
# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
402419
# null, otherwise return an empty list
@@ -488,16 +505,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
488505
if level + 1 > len(val):
489506
meta_val = seen_meta[key]
490507
else:
491-
try:
492-
meta_val = _pull_field(obj, val[level:])
493-
except KeyError as e:
494-
if errors == "ignore":
495-
meta_val = np.nan
496-
else:
497-
raise KeyError(
498-
"Try running with errors='ignore' as key "
499-
f"{e} is not always present"
500-
) from e
508+
meta_val = _pull_field(obj, val[level:])
501509
meta_vals[key].append(meta_val)
502510
records.extend(recs)
503511

pandas/tests/io/json/test_normalize.py

+41-3
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ def missing_metadata():
105105
"zip": 44646,
106106
}
107107
],
108+
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
108109
},
109110
{
110111
"addresses": [
@@ -115,7 +116,8 @@ def missing_metadata():
115116
"state": "TN",
116117
"zip": 37643,
117118
}
118-
]
119+
],
120+
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
119121
},
120122
]
121123

@@ -598,7 +600,10 @@ def test_json_normalize_errors(self, missing_metadata):
598600
# If meta keys are not always present a new option to set
599601
# errors='ignore' has been implemented
600602

601-
msg = "Try running with errors='ignore' as key 'name' is not always present"
603+
msg = (
604+
"Key 'name' not found. To replace missing values of "
605+
"'name' with np.nan, pass in errors='ignore'"
606+
)
602607
with pytest.raises(KeyError, match=msg):
603608
json_normalize(
604609
data=missing_metadata,
@@ -618,11 +623,44 @@ def test_missing_meta(self, missing_metadata):
618623
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
619624
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
620625
]
621-
columns = ["city", "number", "state", "street", "zip", "name"]
622626
columns = ["number", "street", "city", "state", "zip", "name"]
623627
expected = DataFrame(ex_data, columns=columns)
624628
tm.assert_frame_equal(result, expected)
625629

630+
def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
631+
# GH41876
632+
# Ensure errors='raise' works as intended even when a record_path of length
633+
# greater than one is passed in
634+
msg = (
635+
"Key 'name' not found. To replace missing values of "
636+
"'name' with np.nan, pass in errors='ignore'"
637+
)
638+
with pytest.raises(KeyError, match=msg):
639+
json_normalize(
640+
data=missing_metadata,
641+
record_path=["previous_residences", "cities"],
642+
meta="name",
643+
errors="raise",
644+
)
645+
646+
def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
647+
# GH41876
648+
# Ensure errors='ignore' works as intended even when a record_path of length
649+
# greater than one is passed in
650+
result = json_normalize(
651+
data=missing_metadata,
652+
record_path=["previous_residences", "cities"],
653+
meta="name",
654+
errors="ignore",
655+
)
656+
ex_data = [
657+
["Foo York City", "Alice"],
658+
["Barmingham", np.nan],
659+
]
660+
columns = ["city_name", "name"]
661+
expected = DataFrame(ex_data, columns=columns)
662+
tm.assert_frame_equal(result, expected)
663+
626664
def test_donot_drop_nonevalues(self):
627665
# GH21356
628666
data = [

0 commit comments

Comments
 (0)