From 8edc40e548efa93afe89a10365fe102b9fee5fbd Mon Sep 17 00:00:00 2001 From: John Owens Date: Wed, 21 Dec 2016 16:03:14 -0800 Subject: [PATCH 1/3] ENH: json_normalize now takes a user-specified separator closes #14883 --- doc/source/whatsnew/v0.20.0.txt | 4 ++++ pandas/tests/io/json/test_normalize.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 15566d207e31f..30f25b2f99117 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -303,6 +303,7 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) - The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements + - ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`) - ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`). - ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`). @@ -318,6 +319,9 @@ Other Enhancements - Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) - Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`) +- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) + + .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index f881f4dafe0f3..ecc59be85b860 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -67,6 +67,21 @@ def test_empty_array(self): expected = DataFrame() tm.assert_frame_equal(result, expected) + def test_simple_normalize_with_default_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}) + expected = DataFrame([[1, 2]], columns={'A.A', 'A.B'}) + tm.assert_frame_equal(result, expected) + + def test_simple_normalize_with_user_specified_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_') + expected = DataFrame([[1, 2]], columns={'A_A', 'A_B'}) + tm.assert_frame_equal(result, expected) + + def test_simple_normalize_with_user_specified_unicode_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3') + expected = DataFrame([[1, 2]], columns={u'A\u03c3A', u'A\u03c3B'}) + tm.assert_frame_equal(result, expected) + def test_more_deeply_nested(self): data = [{'country': 'USA', 'states': [{'name': 'California', From bc5aae86ee76b09d5b5b45d2b389772aabb4c162 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Jan 2017 12:01:24 -0500 Subject: [PATCH 2/3] CLN: fixup json_normalize with sep --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/io/json/normalize.py | 37 +++++-- pandas/tests/io/json/test_normalize.py | 143 +++++++++++++------------ 3 files changed, 103 insertions(+), 81 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 30f25b2f99117..638044cee67bb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -300,7 +300,6 @@ Other Enhancements - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta 
as an index type with ``method='time'`` (:issue:`6424`) - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`) -- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) - The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements @@ -314,11 +313,12 @@ Other Enhancements - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`) - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information. -- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`) - Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`) - Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`) +- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) +- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. 
(:issue:`14883`) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 4da4a6ad57850..518e0bc2064e2 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -21,7 +21,7 @@ def _convert_to_line_delimits(s): return convert_json_to_lines(s) -def nested_to_record(ds, prefix="", level=0): +def nested_to_record(ds, prefix="", sep=".", level=0): """a simplified json_normalize converts a nested dict into a flat dict ("record"), unlike json_normalize, @@ -31,6 +31,12 @@ def nested_to_record(ds, prefix="", level=0): ---------- ds : dict or list of dicts prefix: the prefix, optional, default: "" + sep : string, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + + .. versionadded:: 0.20.0 + level: the number of levels in the jason string, optional, default: 0 Returns @@ -66,7 +72,7 @@ def nested_to_record(ds, prefix="", level=0): if level == 0: newkey = k else: - newkey = prefix + '.' 
+ k + newkey = prefix + sep + k # only dicts gets recurse-flattend # only at level>1 do we rename the rest of the keys @@ -77,7 +83,7 @@ def nested_to_record(ds, prefix="", level=0): continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, level + 1)) + new_d.update(nested_to_record(v, newkey, sep, level + 1)) new_ds.append(new_d) if singleton: @@ -88,7 +94,8 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, - errors='raise'): + errors='raise', + sep='.'): """ "Normalize" semi-structured JSON data into a flat table @@ -106,13 +113,21 @@ def json_normalize(data, record_path=None, meta=None, path to records is ['foo', 'bar'] meta_prefix : string, default None errors : {'raise', 'ignore'}, default 'raise' - * 'ignore' : will ignore KeyError if keys listed in meta are not - always present - * 'raise' : will raise KeyError if keys listed in meta are not - always present + + * ignore : will ignore KeyError if keys listed in meta are not + always present + * raise : will raise KeyError if keys listed in meta are not + always present .. versionadded:: 0.20.0 + sep : string, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + + .. 
versionadded:: 0.20.0 + + Returns ------- frame : DataFrame @@ -173,7 +188,7 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data) + data = nested_to_record(data, sep=sep) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -192,7 +207,9 @@ def _pull_field(js, spec): lengths = [] meta_vals = defaultdict(list) - meta_keys = ['.'.join(val) for val in meta] + if not isinstance(sep, compat.string_types): + sep = str(sep) + meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if len(path) > 1: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index ecc59be85b860..d945d249271f3 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -1,36 +1,60 @@ -from pandas import DataFrame +import pytest import numpy as np import json import pandas.util.testing as tm -from pandas import compat +from pandas import compat, Index, DataFrame from pandas.io.json import json_normalize from pandas.io.json.normalize import nested_to_record -def _assert_equal_data(left, right): - if not left.columns.equals(right.columns): - left = left.reindex(columns=right.columns) +@pytest.fixture +def deep_nested(): + # deeply nested data + return [{'country': 'USA', + 'states': [{'name': 'California', + 'cities': [{'name': 'San Francisco', + 'pop': 12345}, + {'name': 'Los Angeles', + 'pop': 12346}] + }, + {'name': 'Ohio', + 'cities': [{'name': 'Columbus', + 'pop': 1234}, + {'name': 'Cleveland', + 'pop': 1236}]} + ] + }, + {'country': 'Germany', + 'states': [{'name': 'Bayern', + 'cities': [{'name': 'Munich', 'pop': 12347}] + }, + {'name': 'Nordrhein-Westfalen', + 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, + {'name': 'Koeln', 'pop': 1239}]} + ] + } + ] - tm.assert_frame_equal(left, right) +@pytest.fixture +def state_data(): + return [ + 
{'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}], + 'info': {'governor': 'Rick Scott'}, + 'shortname': 'FL', + 'state': 'Florida'}, + {'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}], + 'info': {'governor': 'John Kasich'}, + 'shortname': 'OH', + 'state': 'Ohio'}] -class TestJSONNormalize(tm.TestCase): - def setUp(self): - self.state_data = [ - {'counties': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}], - 'info': {'governor': 'Rick Scott'}, - 'shortname': 'FL', - 'state': 'Florida'}, - {'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}], - 'info': {'governor': 'John Kasich'}, - 'shortname': 'OH', - 'state': 'Ohio'}] +class TestJSONNormalize(object): def test_simple_records(self): recs = [{'a': 1, 'b': 2, 'c': 3}, @@ -43,21 +67,21 @@ def test_simple_records(self): tm.assert_frame_equal(result, expected) - def test_simple_normalize(self): - result = json_normalize(self.state_data[0], 'counties') - expected = DataFrame(self.state_data[0]['counties']) + def test_simple_normalize(self, state_data): + result = json_normalize(state_data[0], 'counties') + expected = DataFrame(state_data[0]['counties']) tm.assert_frame_equal(result, expected) - result = json_normalize(self.state_data, 'counties') + result = json_normalize(state_data, 'counties') expected = [] - for rec in self.state_data: + for rec in state_data: expected.extend(rec['counties']) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - result = json_normalize(self.state_data, 'counties', meta='state') + result = json_normalize(state_data, 'counties', meta='state') expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2]) tm.assert_frame_equal(result, expected) @@ -67,48 +91,29 @@ def test_empty_array(self): expected 
= DataFrame() tm.assert_frame_equal(result, expected) - def test_simple_normalize_with_default_separator(self): + def test_simple_normalize_with_separator(self, deep_nested): + # GH 14883 result = json_normalize({'A': {'A': 1, 'B': 2}}) - expected = DataFrame([[1, 2]], columns={'A.A', 'A.B'}) - tm.assert_frame_equal(result, expected) + expected = DataFrame([[1, 2]], columns=['A.A', 'A.B']) + tm.assert_frame_equal(result.reindex_like(expected), expected) - def test_simple_normalize_with_user_specified_separator(self): result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_') - expected = DataFrame([[1, 2]], columns={'A_A', 'A_B'}) - tm.assert_frame_equal(result, expected) + expected = DataFrame([[1, 2]], columns=['A_A', 'A_B']) + tm.assert_frame_equal(result.reindex_like(expected), expected) - def test_simple_normalize_with_user_specified_unicode_separator(self): result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3') - expected = DataFrame([[1, 2]], columns={u'A\u03c3A', u'A\u03c3B'}) - tm.assert_frame_equal(result, expected) + expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B']) + tm.assert_frame_equal(result.reindex_like(expected), expected) - def test_more_deeply_nested(self): - data = [{'country': 'USA', - 'states': [{'name': 'California', - 'cities': [{'name': 'San Francisco', - 'pop': 12345}, - {'name': 'Los Angeles', - 'pop': 12346}] - }, - {'name': 'Ohio', - 'cities': [{'name': 'Columbus', - 'pop': 1234}, - {'name': 'Cleveland', - 'pop': 1236}]} - ] - }, - {'country': 'Germany', - 'states': [{'name': 'Bayern', - 'cities': [{'name': 'Munich', 'pop': 12347}] - }, - {'name': 'Nordrhein-Westfalen', - 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, - {'name': 'Koeln', 'pop': 1239}]} - ] - } - ] + result = json_normalize(deep_nested, ['states', 'cities'], + meta=['country', ['states', 'name']], + sep='_') + assert result.columns.equals( + Index(['name', 'pop', 'country', 'states_name'])) + + def test_more_deeply_nested(self, deep_nested): - 
result = json_normalize(data, ['states', 'cities'], + result = json_normalize(deep_nested, ['states', 'cities'], meta=['country', ['states', 'name']]) # meta_prefix={'states': 'state_'}) @@ -158,26 +163,26 @@ def test_meta_name_conflict(self): 'data': [{'foo': 'something', 'bar': 'else'}, {'foo': 'something2', 'bar': 'else2'}]}] - self.assertRaises(ValueError, json_normalize, data, - 'data', meta=['foo', 'bar']) + with pytest.raises(ValueError): + json_normalize(data, 'data', meta=['foo', 'bar']) result = json_normalize(data, 'data', meta=['foo', 'bar'], meta_prefix='meta') for val in ['metafoo', 'metabar', 'foo', 'bar']: - self.assertTrue(val in result) + assert val in result - def test_record_prefix(self): - result = json_normalize(self.state_data[0], 'counties') - expected = DataFrame(self.state_data[0]['counties']) + def test_record_prefix(self, state_data): + result = json_normalize(state_data[0], 'counties') + expected = DataFrame(state_data[0]['counties']) tm.assert_frame_equal(result, expected) - result = json_normalize(self.state_data, 'counties', + result = json_normalize(state_data, 'counties', meta='state', record_prefix='county_') expected = [] - for rec in self.state_data: + for rec in state_data: expected.extend(rec['counties']) expected = DataFrame(expected) expected = expected.rename(columns=lambda x: 'county_' + x) From 0327dd14dc38b782e38570f1f24fc3c1bf649a16 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Mar 2017 15:16:54 -0400 Subject: [PATCH 3/3] compare sorted columns --- pandas/tests/io/json/test_normalize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index d945d249271f3..ee79859e9b71a 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -108,8 +108,9 @@ def test_simple_normalize_with_separator(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], 
meta=['country', ['states', 'name']], sep='_') - assert result.columns.equals( - Index(['name', 'pop', 'country', 'states_name'])) + expected = Index(['name', 'pop', + 'country', 'states_name']).sort_values() + assert result.columns.sort_values().equals(expected) def test_more_deeply_nested(self, deep_nested):