Skip to content

BUG: fix nested meta path bug (GH 27220) #27667

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ Performance improvements
Bug fixes
~~~~~~~~~

- Fix bug in :meth:`io.json.json_normalize` when nested meta paths are used together with a nested record path. (:issue:`27220`)
-
-

Categorical
^^^^^^^^^^^
Expand Down
27 changes: 24 additions & 3 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,14 @@ def _recursive_extract(data, path, seen_meta, level=0):
if len(path) > 1:
for obj in data:
for val, key in zip(meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])

# Pull value for all the keys in case meta path and
# record path are on two branches
seen_meta[key] = _pull_field(obj, val[0])

_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
else:
for obj in data:
for ind, obj in enumerate(data):
recs = _pull_field(obj, path[0])
recs = [
nested_to_record(r, sep=sep, max_level=max_level)
Expand All @@ -305,8 +307,26 @@ def _recursive_extract(data, path, seen_meta, level=0):
# For repeating the metadata later
lengths.append(len(recs))
for val, key in zip(meta, meta_keys):

# Extract the value of the key when the level
# is at the meta path end
if level + 1 > len(val):
meta_val = seen_meta[key]

# Extract the value of the key from seen_meta when
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put a blank line before these comments (and below and above)? It's basically easier to read if they are paragraph-like.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely. Will update this tomorrow.

# meta path and record path are on two branches
elif seen_meta:
meta_val_obj = seen_meta[key]

# Both the list case and the dict case are covered
meta_val = (
meta_val_obj[ind][val[level]]
if isinstance(meta_val_obj, list)
else meta_val_obj[val[level]]
)

# At top level, seen_meta is empty, pull from data
# directly and raise KeyError if not found
else:
try:
meta_val = _pull_field(obj, val[level:])
Expand All @@ -320,6 +340,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
"{err} is not always present".format(err=e)
)
meta_vals[key].append(meta_val)

records.extend(recs)

_recursive_extract(data, record_path, {}, level=0)
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,31 @@ def test_shallow_nested(self):
expected = DataFrame(ex_data, columns=result.columns)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(not PY36, reason="drop support for 3.5 soon")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is needed if you follow the comment below.

def test_nested_meta_path_with_nested_record_path(self, state_data):
# GH 27220: json_normalize should accept a nested meta path
# (["info", "governor"]) together with a nested record path
# (["counties", "name"]).
result = json_normalize(
data=state_data,
record_path=["counties", "name"],
meta=["state", "shortname", ["info", "governor"]],
errors="ignore",
)

# The expected record column holds every character of the county names,
# one per row -- presumably because the record path ends at a string
# value in the state_data fixture (fixture not shown here; confirm).
ex_data = [
[
i
for word in ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"]
for i in word
],
# 21 = 4 + 7 + 10 characters (first three names); 14 = 6 + 8 (last two).
["Florida"] * 21 + ["Ohio"] * 14,
["FL"] * 21 + ["OH"] * 14,
["Rick Scott"] * 21 + ["John Kasich"] * 14,
]
# ex_data lists one inner list per column, so transpose to get one row
# per record.
expected = DataFrame(ex_data).T
# The record column is labelled with the integer 0; the nested meta path
# is labelled "info.governor".
expected.columns = [0, "state", "shortname", "info.governor"]

tm.assert_frame_equal(result, expected)

def test_meta_name_conflict(self):
data = [
{
Expand Down