From e232067e75170f4d00161f7bce51a952155d1942 Mon Sep 17 00:00:00 2001 From: Kenneth <24253983+aerymilts@users.noreply.github.com> Date: Sun, 18 Mar 2018 18:44:40 +0800 Subject: [PATCH 1/2] BUG: fixed json_normalize for subrecords with NoneTypes TST: additional coverage for the test cases DOC: added changes to whatsnew/v0.23.0.txt BUG: changed how nan is declared for backward compatibility - python2.7 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/json/normalize.py | 5 ++- pandas/tests/io/json/test_normalize.py | 49 ++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cfe28edd175b6..f0fc62f455fd1 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -979,6 +979,7 @@ I/O - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. 
Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) +- Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecord values are ``None`` (:issue:`20030`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index c7901f4352d00..549204abd3caf 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -80,6 +80,8 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v + if v is None: # pop the key if the value is None + new_d.pop(k) continue else: v = new_d.pop(k) @@ -189,7 +191,8 @@ def _pull_field(js, spec): data = [data] if record_path is None: - if any(isinstance(x, dict) for x in compat.itervalues(data[0])): + if any([[isinstance(x, dict) + for x in compat.itervalues(y)] for y in data]): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 1cceae32cd748..3249f8ed78c7f 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -54,6 +54,17 @@ def state_data(): 'state': 'Ohio'}] +@pytest.fixture +def author_missing_data(): + return [ + {'info': None}, + {'info': + {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, + 'author_name': + {'first': 'Jane', 'last_name': 'Doe'} + }] + + class TestJSONNormalize(object): def test_simple_records(self): @@ -226,6 +237,21 @@ def test_non_ascii_key(self): result = json_normalize(json.loads(testjson)) tm.assert_frame_equal(result, expected) + def test_missing_field(self, author_missing_data): + result = json_normalize(author_missing_data) + 
ex_data = [ + {'author_name.first': float('nan'), + 'author_name.last_name': float('nan'), + 'info.created_at': float('nan'), + 'info.last_updated': float('nan')}, + {'author_name.first': 'Jane', + 'author_name.last_name': 'Doe', + 'info.created_at': '11/08/1993', + 'info.last_updated': '26/05/2012'} + ] + expected = DataFrame(ex_data) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord(object): @@ -322,3 +348,26 @@ def test_json_normalize_errors(self): ['general', 'trade_version']], errors='raise' ) + + def test_nonetype_dropping(self): + data = [ + {'info': None, + 'author_name': + {'first': 'Smith', 'last_name': 'Appleseed'} + }, + {'info': + {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}, + 'author_name': + {'first': 'Jane', 'last_name': 'Doe'} + } + ] + result = nested_to_record(data) + expected = [ + {'author_name.first': 'Smith', + 'author_name.last_name': 'Appleseed'}, + {'author_name.first': 'Jane', + 'author_name.last_name': 'Doe', + 'info.created_at': '11/08/1993', + 'info.last_updated': '26/05/2012'}] + + assert result == expected From 76d2d910e995d1c4f0a7ce649b786d53e0faefab Mon Sep 17 00:00:00 2001 From: Kenneth <24253983+aerymilts@users.noreply.github.com> Date: Tue, 20 Mar 2018 10:31:01 +0800 Subject: [PATCH 2/2] TST: updated test purposes and how nan is declared (#20030) --- pandas/tests/io/json/test_normalize.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3249f8ed78c7f..0fabaf747b6de 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -238,12 +238,14 @@ def test_non_ascii_key(self): tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): + # GH20030: Checks for robustness of json_normalize - should + # unnest records where only the first record has a None value result = json_normalize(author_missing_data) ex_data = [ - 
{'author_name.first': float('nan'), - 'author_name.last_name': float('nan'), - 'info.created_at': float('nan'), - 'info.last_updated': float('nan')}, + {'author_name.first': np.nan, + 'author_name.last_name': np.nan, + 'info.created_at': np.nan, + 'info.last_updated': np.nan}, {'author_name.first': 'Jane', 'author_name.last_name': 'Doe', 'info.created_at': '11/08/1993', @@ -350,6 +352,8 @@ def test_json_normalize_errors(self): ) def test_nonetype_dropping(self): + # GH20030: Checks that None values are dropped in nested_to_record + # to prevent additional columns of nans when passed to DataFrame data = [ {'info': None, 'author_name':