Skip to content

PERF: json_normalize, for basic use case #40035

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Mar 5, 2021
Merged
Changes from 5 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,94 @@ def nested_to_record(
return new_ds


def _simple_json_normalize(
ds: Union[Dict, List[Dict]],
sep: str = ".",
) -> Union[List[dict], dict]:
"""
A optimized basic json_normalize

Converts a nested dict into a flat dict ("record"), unlike
json_normalize and nested_to_record it doesn't do anything clever.
But for the most basic use cases it enhances performance.
E.g. pd.json_normalize(data)

Parameters
----------
ds : dict or list of dicts
sep : str, default '.'
Nested records will generate names separated by sep,
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

Returns
-------
frame : DataFrame
d - dict or list of dicts, matching `normalised_json_object`

Examples
--------
IN[52]: _simple_json_normalize(dict(flat1=1,dict1=dict(c=1,d=2),
nested=dict(e=dict(c=1,d=2),d=2)))
Out[52]:
{'dict1.c': 1,
'dict1.d': 2,
'flat1': 1,
'nested.d': 2,
'nested.e.c': 1,
'nested.e.d': 2}

"""

# intended as a performance improvement, see #15621
def _normalise_json(
data: object,
key_string: str,
new_dict: Dict,
separator: str,
):
"""
Main recursive function, maintains object types, not ordering
Designed for the most basic use case of pd.json_normalize(data)
(i.e. no bells or whistlers)
"""
if isinstance(data, dict):
for key, value in data.items():
new_key = f"{key_string}{separator}{key}"
_normalise_json(
data=value,
# to avoid adding the separator to the start of every key
key_string=new_key
if new_key[len(separator) - 1] != separator
else new_key[len(separator) :],
new_dict=new_dict,
separator=separator,
)
else:
new_dict[key_string] = data
return new_dict

def _normalise_json_ordered(data: Dict, separator: str):
"""
Order the top level keys and then recursively go to depth
"""
top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
nested_dict_ = _normalise_json(
data={k: v for k, v in data.items() if isinstance(v, dict)},
key_string="",
new_dict={},
separator=separator,
)
return {**top_dict_, **nested_dict_}

normalised_json_object = {}
# expect a dictionary, as most jsons are. However, lists are perfectly valid
if isinstance(ds, dict):
normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
elif isinstance(ds, list):
normalised_json_object = [_simple_json_normalize(row, sep=sep) for row in ds]
return normalised_json_object


def _json_normalize(
data: Union[Dict, List[Dict]],
record_path: Optional[Union[str, List]] = None,
Expand Down Expand Up @@ -283,6 +371,14 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
else:
raise NotImplementedError

if all(
True if x is None else False
for x in (record_path, meta, meta_prefix, record_prefix, max_level)
):
# for very basic use case of pd.json_normalize(data), this is quick and
# consistent with pandas ordering of json data
return pd.DataFrame(_simple_json_normalize(data, sep=sep))

if record_path is None:
if any([isinstance(x, dict) for x in y.values()] for y in data):
# naive normalization, this is idempotent for flat records
Expand Down