diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e1fe2f7fe77e2..2111fd2f1dd37 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -83,6 +83,9 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fix bug in :meth:`io.json.json_normalize` when nested meta paths are combined with a nested record path. (:issue:`27220`) +- +- Categorical ^^^^^^^^^^^ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 24a255c78f3c0..b5aabea0957b0 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -288,12 +288,14 @@ def _recursive_extract(data, path, seen_meta, level=0): if len(path) > 1: for obj in data: for val, key in zip(meta, meta_keys): - if level + 1 == len(val): - seen_meta[key] = _pull_field(obj, val[-1]) + + # Pull value for all the keys in case meta path and + # record path are on two branches + seen_meta[key] = _pull_field(obj, val[0]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: - for obj in data: + for ind, obj in enumerate(data): recs = _pull_field(obj, path[0]) recs = [ nested_to_record(r, sep=sep, max_level=max_level) @@ -305,8 +307,26 @@ def _recursive_extract(data, path, seen_meta, level=0): # For repeating the metadata later lengths.append(len(recs)) for val, key in zip(meta, meta_keys): + + # Extract the value of the key when the level + # is at the meta path end if level + 1 > len(val): meta_val = seen_meta[key] + + # Extract the value of the key from seen_meta when + # meta path and record path are on two branches + elif seen_meta: + meta_val_obj = seen_meta[key] + + # Both the list case and the dict case are covered + meta_val = ( + meta_val_obj[ind][val[level]] + if isinstance(meta_val_obj, list) + else meta_val_obj[val[level]] + ) + + # At top level, seen_meta is empty, pull from data + # directly and raise KeyError if not found else: try: meta_val = _pull_field(obj, val[level:]) @@ -320,6 +340,7 @@ def
_recursive_extract(data, path, seen_meta, level=0): "{err} is not always present".format(err=e) ) meta_vals[key].append(meta_val) + records.extend(recs) _recursive_extract(data, record_path, {}, level=0) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3ceddfc3c1db4..4e63673642abe 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -287,6 +287,31 @@ def test_shallow_nested(self): expected = DataFrame(ex_data, columns=result.columns) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(not PY36, reason="drop support for 3.5 soon") + def test_nested_meta_path_with_nested_record_path(self, state_data): + # GH 27220 + result = json_normalize( + data=state_data, + record_path=["counties", "name"], + meta=["state", "shortname", ["info", "governor"]], + errors="ignore", + ) + + ex_data = [ + [ + i + for word in ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"] + for i in word + ], + ["Florida"] * 21 + ["Ohio"] * 14, + ["FL"] * 21 + ["OH"] * 14, + ["Rick Scott"] * 21 + ["John Kasich"] * 14, + ] + expected = DataFrame(ex_data).T + expected.columns = [0, "state", "shortname", "info.governor"] + + tm.assert_frame_equal(result, expected) + def test_meta_name_conflict(self): data = [ {