Skip to content

Commit 34c6bd0

Browse files
committed
ENH: GH14883: json_normalize now takes a user-specified separator
closes #14883 Author: Jeff Reback <[email protected]> Author: John Owens <[email protected]> Closes #14950 from jowens/json_normalize-separator and squashes the following commits: 0327dd1 [Jeff Reback] compare sorted columns bc5aae8 [Jeff Reback] CLN: fixup json_normalize with sep 8edc40e [John Owens] ENH: json_normalize now takes a user-specified separator
1 parent ec84ae3 commit 34c6bd0

File tree

3 files changed

+114
-72
lines changed

3 files changed

+114
-72
lines changed

doc/source/whatsnew/v0.20.0.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -300,9 +300,9 @@ Other Enhancements
300300
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`)
301301
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
302302
- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)
303-
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
304303
- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`)
305304
- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements
305+
306306
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
307307
- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
308308
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
@@ -313,11 +313,15 @@ Other Enhancements
313313

314314
- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
315315
- ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
316-
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
317316
- ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`)
318317
- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`)
319318
- Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`)
320319

320+
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
321+
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
322+
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
323+
324+
321325
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
322326

323327

pandas/io/json/normalize.py

+27-10
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def _convert_to_line_delimits(s):
2121
return convert_json_to_lines(s)
2222

2323

24-
def nested_to_record(ds, prefix="", level=0):
24+
def nested_to_record(ds, prefix="", sep=".", level=0):
2525
"""a simplified json_normalize
2626
2727
converts a nested dict into a flat dict ("record"), unlike json_normalize,
@@ -31,6 +31,12 @@ def nested_to_record(ds, prefix="", level=0):
3131
----------
3232
ds : dict or list of dicts
3333
prefix: the prefix, optional, default: ""
34+
sep : string, default '.'
35+
Nested records will generate names separated by sep,
36+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
37+
38+
.. versionadded:: 0.20.0
39+
3440
level: the number of levels in the json string, optional, default: 0
3541
3642
Returns
@@ -66,7 +72,7 @@ def nested_to_record(ds, prefix="", level=0):
6672
if level == 0:
6773
newkey = k
6874
else:
69-
newkey = prefix + '.' + k
75+
newkey = prefix + sep + k
7076

7177
# only dicts gets recurse-flattend
7278
# only at level>1 do we rename the rest of the keys
@@ -77,7 +83,7 @@ def nested_to_record(ds, prefix="", level=0):
7783
continue
7884
else:
7985
v = new_d.pop(k)
80-
new_d.update(nested_to_record(v, newkey, level + 1))
86+
new_d.update(nested_to_record(v, newkey, sep, level + 1))
8187
new_ds.append(new_d)
8288

8389
if singleton:
@@ -88,7 +94,8 @@ def nested_to_record(ds, prefix="", level=0):
8894
def json_normalize(data, record_path=None, meta=None,
8995
meta_prefix=None,
9096
record_prefix=None,
91-
errors='raise'):
97+
errors='raise',
98+
sep='.'):
9299
"""
93100
"Normalize" semi-structured JSON data into a flat table
94101
@@ -106,13 +113,21 @@ def json_normalize(data, record_path=None, meta=None,
106113
path to records is ['foo', 'bar']
107114
meta_prefix : string, default None
108115
errors : {'raise', 'ignore'}, default 'raise'
109-
* 'ignore' : will ignore KeyError if keys listed in meta are not
110-
always present
111-
* 'raise' : will raise KeyError if keys listed in meta are not
112-
always present
116+
117+
* ignore : will ignore KeyError if keys listed in meta are not
118+
always present
119+
* raise : will raise KeyError if keys listed in meta are not
120+
always present
113121
114122
.. versionadded:: 0.20.0
115123
124+
sep : string, default '.'
125+
Nested records will generate names separated by sep,
126+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
127+
128+
.. versionadded:: 0.20.0
129+
130+
116131
Returns
117132
-------
118133
frame : DataFrame
@@ -173,7 +188,7 @@ def _pull_field(js, spec):
173188
#
174189
# TODO: handle record value which are lists, at least error
175190
# reasonably
176-
data = nested_to_record(data)
191+
data = nested_to_record(data, sep=sep)
177192
return DataFrame(data)
178193
elif not isinstance(record_path, list):
179194
record_path = [record_path]
@@ -192,7 +207,9 @@ def _pull_field(js, spec):
192207
lengths = []
193208

194209
meta_vals = defaultdict(list)
195-
meta_keys = ['.'.join(val) for val in meta]
210+
if not isinstance(sep, compat.string_types):
211+
sep = str(sep)
212+
meta_keys = [sep.join(val) for val in meta]
196213

197214
def _recursive_extract(data, path, seen_meta, level=0):
198215
if len(path) > 1:

pandas/tests/io/json/test_normalize.py

+81-60
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,60 @@
1-
from pandas import DataFrame
1+
import pytest
22
import numpy as np
33
import json
44

55
import pandas.util.testing as tm
6-
from pandas import compat
6+
from pandas import compat, Index, DataFrame
77

88
from pandas.io.json import json_normalize
99
from pandas.io.json.normalize import nested_to_record
1010

1111

12-
def _assert_equal_data(left, right):
13-
if not left.columns.equals(right.columns):
14-
left = left.reindex(columns=right.columns)
12+
@pytest.fixture
13+
def deep_nested():
14+
# deeply nested data
15+
return [{'country': 'USA',
16+
'states': [{'name': 'California',
17+
'cities': [{'name': 'San Francisco',
18+
'pop': 12345},
19+
{'name': 'Los Angeles',
20+
'pop': 12346}]
21+
},
22+
{'name': 'Ohio',
23+
'cities': [{'name': 'Columbus',
24+
'pop': 1234},
25+
{'name': 'Cleveland',
26+
'pop': 1236}]}
27+
]
28+
},
29+
{'country': 'Germany',
30+
'states': [{'name': 'Bayern',
31+
'cities': [{'name': 'Munich', 'pop': 12347}]
32+
},
33+
{'name': 'Nordrhein-Westfalen',
34+
'cities': [{'name': 'Duesseldorf', 'pop': 1238},
35+
{'name': 'Koeln', 'pop': 1239}]}
36+
]
37+
}
38+
]
1539

16-
tm.assert_frame_equal(left, right)
1740

41+
@pytest.fixture
42+
def state_data():
43+
return [
44+
{'counties': [{'name': 'Dade', 'population': 12345},
45+
{'name': 'Broward', 'population': 40000},
46+
{'name': 'Palm Beach', 'population': 60000}],
47+
'info': {'governor': 'Rick Scott'},
48+
'shortname': 'FL',
49+
'state': 'Florida'},
50+
{'counties': [{'name': 'Summit', 'population': 1234},
51+
{'name': 'Cuyahoga', 'population': 1337}],
52+
'info': {'governor': 'John Kasich'},
53+
'shortname': 'OH',
54+
'state': 'Ohio'}]
1855

19-
class TestJSONNormalize(tm.TestCase):
2056

21-
def setUp(self):
22-
self.state_data = [
23-
{'counties': [{'name': 'Dade', 'population': 12345},
24-
{'name': 'Broward', 'population': 40000},
25-
{'name': 'Palm Beach', 'population': 60000}],
26-
'info': {'governor': 'Rick Scott'},
27-
'shortname': 'FL',
28-
'state': 'Florida'},
29-
{'counties': [{'name': 'Summit', 'population': 1234},
30-
{'name': 'Cuyahoga', 'population': 1337}],
31-
'info': {'governor': 'John Kasich'},
32-
'shortname': 'OH',
33-
'state': 'Ohio'}]
57+
class TestJSONNormalize(object):
3458

3559
def test_simple_records(self):
3660
recs = [{'a': 1, 'b': 2, 'c': 3},
@@ -43,21 +67,21 @@ def test_simple_records(self):
4367

4468
tm.assert_frame_equal(result, expected)
4569

46-
def test_simple_normalize(self):
47-
result = json_normalize(self.state_data[0], 'counties')
48-
expected = DataFrame(self.state_data[0]['counties'])
70+
def test_simple_normalize(self, state_data):
71+
result = json_normalize(state_data[0], 'counties')
72+
expected = DataFrame(state_data[0]['counties'])
4973
tm.assert_frame_equal(result, expected)
5074

51-
result = json_normalize(self.state_data, 'counties')
75+
result = json_normalize(state_data, 'counties')
5276

5377
expected = []
54-
for rec in self.state_data:
78+
for rec in state_data:
5579
expected.extend(rec['counties'])
5680
expected = DataFrame(expected)
5781

5882
tm.assert_frame_equal(result, expected)
5983

60-
result = json_normalize(self.state_data, 'counties', meta='state')
84+
result = json_normalize(state_data, 'counties', meta='state')
6185
expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
6286

6387
tm.assert_frame_equal(result, expected)
@@ -67,33 +91,30 @@ def test_empty_array(self):
6791
expected = DataFrame()
6892
tm.assert_frame_equal(result, expected)
6993

70-
def test_more_deeply_nested(self):
71-
data = [{'country': 'USA',
72-
'states': [{'name': 'California',
73-
'cities': [{'name': 'San Francisco',
74-
'pop': 12345},
75-
{'name': 'Los Angeles',
76-
'pop': 12346}]
77-
},
78-
{'name': 'Ohio',
79-
'cities': [{'name': 'Columbus',
80-
'pop': 1234},
81-
{'name': 'Cleveland',
82-
'pop': 1236}]}
83-
]
84-
},
85-
{'country': 'Germany',
86-
'states': [{'name': 'Bayern',
87-
'cities': [{'name': 'Munich', 'pop': 12347}]
88-
},
89-
{'name': 'Nordrhein-Westfalen',
90-
'cities': [{'name': 'Duesseldorf', 'pop': 1238},
91-
{'name': 'Koeln', 'pop': 1239}]}
92-
]
93-
}
94-
]
94+
def test_simple_normalize_with_separator(self, deep_nested):
95+
# GH 14883
96+
result = json_normalize({'A': {'A': 1, 'B': 2}})
97+
expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
98+
tm.assert_frame_equal(result.reindex_like(expected), expected)
99+
100+
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
101+
expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
102+
tm.assert_frame_equal(result.reindex_like(expected), expected)
103+
104+
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
105+
expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
106+
tm.assert_frame_equal(result.reindex_like(expected), expected)
107+
108+
result = json_normalize(deep_nested, ['states', 'cities'],
109+
meta=['country', ['states', 'name']],
110+
sep='_')
111+
expected = Index(['name', 'pop',
112+
'country', 'states_name']).sort_values()
113+
assert result.columns.sort_values().equals(expected)
114+
115+
def test_more_deeply_nested(self, deep_nested):
95116

96-
result = json_normalize(data, ['states', 'cities'],
117+
result = json_normalize(deep_nested, ['states', 'cities'],
97118
meta=['country', ['states', 'name']])
98119
# meta_prefix={'states': 'state_'})
99120

@@ -143,26 +164,26 @@ def test_meta_name_conflict(self):
143164
'data': [{'foo': 'something', 'bar': 'else'},
144165
{'foo': 'something2', 'bar': 'else2'}]}]
145166

146-
self.assertRaises(ValueError, json_normalize, data,
147-
'data', meta=['foo', 'bar'])
167+
with pytest.raises(ValueError):
168+
json_normalize(data, 'data', meta=['foo', 'bar'])
148169

149170
result = json_normalize(data, 'data', meta=['foo', 'bar'],
150171
meta_prefix='meta')
151172

152173
for val in ['metafoo', 'metabar', 'foo', 'bar']:
153-
self.assertTrue(val in result)
174+
assert val in result
154175

155-
def test_record_prefix(self):
156-
result = json_normalize(self.state_data[0], 'counties')
157-
expected = DataFrame(self.state_data[0]['counties'])
176+
def test_record_prefix(self, state_data):
177+
result = json_normalize(state_data[0], 'counties')
178+
expected = DataFrame(state_data[0]['counties'])
158179
tm.assert_frame_equal(result, expected)
159180

160-
result = json_normalize(self.state_data, 'counties',
181+
result = json_normalize(state_data, 'counties',
161182
meta='state',
162183
record_prefix='county_')
163184

164185
expected = []
165-
for rec in self.state_data:
186+
for rec in state_data:
166187
expected.extend(rec['counties'])
167188
expected = DataFrame(expected)
168189
expected = expected.rename(columns=lambda x: 'county_' + x)

0 commit comments

Comments
 (0)