Skip to content

Commit c7d3e9b

Browse files
authored
PERF: json_normalize, for basic use case (#40035)
1 parent 527c789 commit c7d3e9b

File tree

3 files changed

+150
-0
lines changed

3 files changed

+150
-0
lines changed

asv_bench/benchmarks/io/json.py

+22
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
DataFrame,
77
concat,
88
date_range,
9+
json_normalize,
910
read_json,
1011
timedelta_range,
1112
)
@@ -77,6 +78,27 @@ def peakmem_read_json_lines_nrows(self, index):
7778
read_json(self.fname, orient="records", lines=True, nrows=15000)
7879

7980

81+
class NormalizeJSON(BaseIO):
    """Benchmark ``json_normalize`` on a list of identical nested records."""

    fname = "__test__.json"
    params = [
        ["split", "columns", "index", "values", "records"],
        ["df", "df_date_idx", "df_td_int_ts", "df_int_floats", "df_int_float_str"],
    ]
    param_names = ["orient", "frame"]

    def setup(self, orient, frame):
        # NOTE: ``orient`` and ``frame`` are not used when building the data;
        # every parameter combination normalizes the same input.
        record = {
            "hello": ["thisisatest", 999898, "mixed types"],
            "nest1": {"nest2": {"nest3": "nest3_value", "nest3_int": 3445}},
            "nest1_list": {"nest2": ["blah", 32423, 546456.876, 92030234]},
            "hello2": "string",
        }
        # 10000 references to the same record: mixed scalar types, a
        # three-level nested dict and a list held inside a nested dict.
        self.data = [record] * 10000

    def time_normalize_json(self, orient, frame):
        # Time the default call (no record_path / meta / max_level arguments).
        json_normalize(self.data)
100+
101+
80102
class ToJSON(BaseIO):
81103

82104
fname = "__test__.json"

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ Performance improvements
378378
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
379379
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
380380
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)
381+
- Performance improvement in :func:`pd.json_normalize` for basic cases (including separators) (:issue:`40035` :issue:`15621`)
381382
- Performance improvement in :class:`core.window.rolling.ExpandingGroupby` aggregation methods (:issue:`39664`)
382383
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
383384
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)

pandas/io/json/_normalize.py

+127
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,121 @@ def nested_to_record(
121121
return new_ds
122122

123123

124+
def _normalise_json(
125+
data: Any,
126+
key_string: str,
127+
normalized_dict: Dict[str, Any],
128+
separator: str,
129+
) -> Dict[str, Any]:
130+
"""
131+
Main recursive function
132+
Designed for the most basic use case of pd.json_normalize(data)
133+
intended as a performance improvement, see #15621
134+
135+
Parameters
136+
----------
137+
data : Any
138+
Type dependent on types contained within nested Json
139+
key_string : str
140+
New key (with separator(s) in) for data
141+
normalized_dict : dict
142+
The new normalized/flattened Json dict
143+
separator : str, default '.'
144+
Nested records will generate names separated by sep,
145+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
146+
"""
147+
if isinstance(data, dict):
148+
for key, value in data.items():
149+
new_key = f"{key_string}{separator}{key}"
150+
_normalise_json(
151+
data=value,
152+
# to avoid adding the separator to the start of every key
153+
key_string=new_key
154+
if new_key[len(separator) - 1] != separator
155+
else new_key[len(separator) :],
156+
normalized_dict=normalized_dict,
157+
separator=separator,
158+
)
159+
else:
160+
normalized_dict[key_string] = data
161+
return normalized_dict
162+
163+
164+
def _normalise_json_ordered(data: Dict[str, Any], separator: str) -> Dict[str, Any]:
    """
    Flatten one record, placing top-level scalar keys before nested ones.

    Parameters
    ----------
    data : dict
        A single (possibly nested) record.
    separator : str
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

    Returns
    -------
    dict
        Flat dict: first the top-level entries whose values are not dicts
        (in their original order), then the flattened entries of the
        dict-valued top-level keys.
    """
    flat_part: Dict[str, Any] = {}
    nested_part: Dict[str, Any] = {}
    # Partition the top level in a single pass, preserving key order
    # within each group.
    for name, value in data.items():
        if isinstance(value, dict):
            nested_part[name] = value
        else:
            flat_part[name] = value

    flattened_nested = _normalise_json(
        data=nested_part,
        key_string="",
        normalized_dict={},
        separator=separator,
    )
    # On a key clash the flattened nested value wins but the key keeps the
    # position of the top-level entry.
    flat_part.update(flattened_nested)
    return flat_part
187+
188+
189+
def _simple_json_normalize(
190+
ds: Union[Dict, List[Dict]],
191+
sep: str = ".",
192+
) -> Union[Dict, List[Dict], Any]:
193+
"""
194+
A optimized basic json_normalize
195+
196+
Converts a nested dict into a flat dict ("record"), unlike
197+
json_normalize and nested_to_record it doesn't do anything clever.
198+
But for the most basic use cases it enhances performance.
199+
E.g. pd.json_normalize(data)
200+
201+
Parameters
202+
----------
203+
ds : dict or list of dicts
204+
sep : str, default '.'
205+
Nested records will generate names separated by sep,
206+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
207+
208+
Returns
209+
-------
210+
frame : DataFrame
211+
d - dict or list of dicts, matching `normalised_json_object`
212+
213+
Examples
214+
--------
215+
IN[52]: _simple_json_normalize({
216+
'flat1': 1,
217+
'dict1': {'c': 1, 'd': 2},
218+
'nested': {'e': {'c': 1, 'd': 2}, 'd': 2}
219+
})
220+
Out[52]:
221+
{'dict1.c': 1,
222+
'dict1.d': 2,
223+
'flat1': 1,
224+
'nested.d': 2,
225+
'nested.e.c': 1,
226+
'nested.e.d': 2}
227+
228+
"""
229+
normalised_json_object = {}
230+
# expect a dictionary, as most jsons are. However, lists are perfectly valid
231+
if isinstance(ds, dict):
232+
normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
233+
elif isinstance(ds, list):
234+
normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
235+
return normalised_json_list
236+
return normalised_json_object
237+
238+
124239
def _json_normalize(
125240
data: Union[Dict, List[Dict]],
126241
record_path: Optional[Union[str, List]] = None,
@@ -283,6 +398,18 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
283398
else:
284399
raise NotImplementedError
285400

401+
# check to see if a simple recursive function is possible to
402+
# improve performance (see #15621) but only for cases such
403+
# as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
404+
if (
405+
record_path is None
406+
and meta is None
407+
and meta_prefix is None
408+
and record_prefix is None
409+
and max_level is None
410+
):
411+
return DataFrame(_simple_json_normalize(data, sep=sep))
412+
286413
if record_path is None:
287414
if any([isinstance(x, dict) for x in y.values()] for y in data):
288415
# naive normalization, this is idempotent for flat records

0 commit comments

Comments
 (0)