From bcfbf186725efde89aa003decddb7137e73377b6 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Wed, 26 Oct 2016 21:14:55 +0100 Subject: [PATCH 1/5] Avoids exception when pandas.io.json.json_normalize contains items in meta parameter that don't always occur in every item of the list Added documentation and test for issue #14505 Added keyword errors {'raise'|'ignore} Shortened what's new Removed commas in dictionary for linting compatibility Updated doc --- pandas/io/json.py | 15 +++++++- pandas/io/tests/json/test_json_norm.py | 53 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 878506a6ddc05..ab4c30df1b4c8 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -723,7 +723,9 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, - record_prefix=None): + record_prefix=None, + errors='raise'): + """ "Normalize" semi-structured JSON data into a flat table @@ -740,6 +742,8 @@ def json_normalize(data, record_path=None, meta=None, If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] meta_prefix : string, default None + error: {'raise', 'ignore'}, default 'raise' + * ignore: will ignore keyErrors if keys listed in meta are not always present Returns ------- @@ -839,7 +843,14 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 > len(val): meta_val = seen_meta[key] else: - meta_val = _pull_field(obj, val[level:]) + try: + meta_val = _pull_field(obj, val[level:]) + except KeyError as e: + if errors == 'ignore': + meta_val = np.nan + else: + raise KeyError( + "Try running with errors='ignore' as the following key may not always be present: " + str(e)) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 4848db97194d9..4877728d9ec52 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -225,6 +225,59 @@ def test_nested_flattens(self): self.assertEqual(result, expected) + + def test_json_normalise_fix(self): + # issue 14505 + j = { + "Trades": [{ + "general": { + "tradeid": 100, + "trade_version": 1, + "stocks": [{ + + "symbol": "AAPL", + "name": "Apple", + "price": "0" + + }, { + + "symbol": "GOOG", + "name": "Google", + "price": "0" + + } + ] + } + }, { + "general": { + "tradeid": 100, + "stocks": [{ + + "symbol": "AAPL", + "name": "Apple", + "price": "0" + + }, { + "symbol": "GOOG", + "name": "Google", + "price": "0" + + } + ] + } + } + ] + } + j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='ignore') + expected={'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, + 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, + 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, + 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, + 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} + + self.assertEqual(j.fillna('').to_dict(), expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], exit=False) From d29858878f6c4f4be59307867004e75c41e6709a Mon Sep 17 00:00:00 2001 From: dickreuter Date: Fri, 18 Nov 2016 17:19:11 +0000 Subject: [PATCH 2/5] Fixed as instructed in pull request page --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/io/json.py | 5 +++-- pandas/io/tests/json/test_json_norm.py | 14 ++++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8d88a7b4fb215..33d17a193c791 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -61,6 +61,8 @@ Other enhancements - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) +- ``pandas.io.json.json_normalize`` If meta keys are not always present a new option to set errors="ignore" (:issue:`14583`) + .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/json.py b/pandas/io/json.py index ab4c30df1b4c8..6066cdc92f10d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -849,8 +849,9 @@ def _recursive_extract(data, path, seen_meta, level=0): if errors == 'ignore': meta_val = np.nan else: - raise KeyError( - "Try running with errors='ignore' as the following key may not always be present: " + str(e)) + raise \ + KeyError( + "Try running with errors='ignore' as key may not always be present: %s", e) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 4877728d9ec52..fdbb4c699882e 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -226,9 +226,9 @@ def test_nested_flattens(self): self.assertEqual(result, expected) - def test_json_normalise_fix(self): - # issue 14505 - j = { + def test_json_normalize_errors(self): + # If meta keys are not always present a new option to set errors='ignore' has been implemented (:issue:`14583`) + i = { "Trades": [{ "general": { "tradeid": 100, @@ -268,7 +268,7 @@ def test_json_normalise_fix(self): } ] } - j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']], + j = json_normalize(data=i['Trades'], record_path=[['general', 'stocks']], meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='ignore') expected={'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, @@ -278,6 +278,12 @@ def test_json_normalise_fix(self): self.assertEqual(j.fillna('').to_dict(), expected) + self.assertRaises(KeyError, + json_normalize, data=i['Trades'], record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='raise' + ) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], exit=False) From 2028924c8ef63bbb9f9f39752d525cc12b77e1df Mon Sep 17 00:00:00 2001 From: dickreuter Date: Fri, 2 Dec 2016 15:14:51 +0000 Subject: [PATCH 3/5] doc changes --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/json.py | 7 +++---- pandas/io/tests/json/test_json_norm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 33d17a193c791..6b9f380ded918 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -62,6 +62,7 @@ Other enhancements - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``pandas.io.json.json_normalize`` If meta keys are not always present a new option to set errors="ignore" (:issue:`14583`) +- ``pandas.io.json.json_normalize`` gained the option ``errors='ignore'|raise``; the default is raise and is backward compatible. (:issue:`14583`) .. _whatsnew_0200.api_breaking: diff --git a/pandas/io/json.py b/pandas/io/json.py index 6066cdc92f10d..327c764d08541 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -743,7 +743,8 @@ def json_normalize(data, record_path=None, meta=None, path to records is ['foo', 'bar'] meta_prefix : string, default None error: {'raise', 'ignore'}, default 'raise' - * ignore: will ignore keyErrors if keys listed in meta are not always present + * ignore: will ignore KeyError if keys listed in meta are not always present + * raise: will raise KeyError if keys listed in meta are not always present Returns ------- @@ -849,9 +850,7 @@ def _recursive_extract(data, path, seen_meta, level=0): if errors == 'ignore': meta_val = np.nan else: - raise \ - KeyError( - "Try running with errors='ignore' as key may not always be present: %s", e) + raise KeyError("Try running with errors='ignore' as key %s is not always present.", e) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index fdbb4c699882e..ff2627c82d0c9 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -227,7 +227,7 @@ def test_nested_flattens(self): def test_json_normalize_errors(self): - # If meta keys are not always present a new option to set errors='ignore' has been implemented (:issue:`14583`) + # GH14583: If meta keys are not always present a new option to set errors='ignore' has been implemented i = { "Trades": [{ "general": { From 3c942065525b1a1b76387e11a34a14b878d34024 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Thu, 8 Dec 2016 00:16:43 +0000 Subject: [PATCH 4/5] shortened lines to pass linting --- doc/source/whatsnew/v0.20.0.txt | 2 -- pandas/io/json.py | 11 ++++++++--- pandas/io/tests/json/test_json_norm.py | 15 ++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6b9f380ded918..7efaaaa39871c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -9,7 +9,6 @@ users upgrade to this version. Highlights include: -- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -61,7 +60,6 @@ Other enhancements - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) -- ``pandas.io.json.json_normalize`` If meta keys are not always present a new option to set errors="ignore" (:issue:`14583`) - ``pandas.io.json.json_normalize`` gained the option ``errors='ignore'|raise``; the default is raise and is backward compatible. (:issue:`14583`) diff --git a/pandas/io/json.py b/pandas/io/json.py index 327c764d08541..ead111e4724f0 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -743,8 +743,12 @@ def json_normalize(data, record_path=None, meta=None, path to records is ['foo', 'bar'] meta_prefix : string, default None error: {'raise', 'ignore'}, default 'raise' - * ignore: will ignore KeyError if keys listed in meta are not always present - * raise: will raise KeyError if keys listed in meta are not always present + * ignore: will ignore KeyError if keys listed in meta are not + always present + * raise: will raise KeyError if keys listed in meta are not + always present + + .. versionadded:: 0.20.0 Returns ------- @@ -850,7 +854,8 @@ def _recursive_extract(data, path, seen_meta, level=0): if errors == 'ignore': meta_val = np.nan else: - raise KeyError("Try running with errors='ignore' as key %s is not always present.", e) + raise KeyError("Try running with errors='ignore'" + "as key %s is not always present.", e) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index ff2627c82d0c9..85815e9c74d28 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -227,7 +227,8 @@ def test_nested_flattens(self): def test_json_normalize_errors(self): - # GH14583: If meta keys are not always present a new option to set errors='ignore' has been implemented + # GH14583: If meta keys are not always present + # a new option to set errors='ignore' has been implemented i = { "Trades": [{ "general": { @@ -238,13 +239,10 @@ def test_json_normalize_errors(self): "symbol": "AAPL", "name": "Apple", "price": "0" - }, { - "symbol": "GOOG", "name": "Google", "price": "0" - } ] } @@ -252,16 +250,13 @@ def test_json_normalize_errors(self): "general": { "tradeid": 100, "stocks": [{ - "symbol": "AAPL", "name": "Apple", "price": "0" - }, { "symbol": "GOOG", "name": "Google", "price": "0" - } ] } @@ -269,7 +264,8 @@ def test_json_normalize_errors(self): ] } j = json_normalize(data=i['Trades'], record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='ignore') + meta=[['general', 'tradeid'], ['general', 'trade_version']], + errors='ignore') expected={'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, @@ -280,7 +276,8 @@ def test_json_normalize_errors(self): self.assertRaises(KeyError, json_normalize, data=i['Trades'], record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='raise' + meta=[['general', 'tradeid'], ['general', 'trade_version']], + errors='raise' ) From 701c14092267425542aa8eb367298c035d9e8610 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Sun, 11 Dec 2016 03:40:47 +0000 Subject: [PATCH 5/5] adjusted formatting --- pandas/io/json.py | 7 ++++--- pandas/io/tests/json/test_json_norm.py | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index ead111e4724f0..da540a8797578 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -747,7 +747,6 @@ def json_normalize(data, record_path=None, meta=None, always present * raise: will raise KeyError if keys listed in meta are not always present - .. versionadded:: 0.20.0 Returns @@ -854,8 +853,10 @@ def _recursive_extract(data, path, seen_meta, level=0): if errors == 'ignore': meta_val = np.nan else: - raise KeyError("Try running with errors='ignore'" - "as key %s is not always present.", e) + raise \ + KeyError("Try running with " + "errors='ignore' as key " + "%s is not always present", e) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 85815e9c74d28..36110898448ea 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -225,7 +225,6 @@ def test_nested_flattens(self): self.assertEqual(result, expected) - def test_json_normalize_errors(self): # GH14583: If meta keys are not always present # a new option to set errors='ignore' has been implemented @@ -263,20 +262,24 @@ def test_json_normalize_errors(self): } ] } - j = json_normalize(data=i['Trades'], record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], ['general', 'trade_version']], + j = json_normalize(data=i['Trades'], + record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], + ['general', 'trade_version']], errors='ignore') - expected={'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, - 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, - 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, - 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, - 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} + expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, + 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, + 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, + 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, + 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} self.assertEqual(j.fillna('').to_dict(), expected) self.assertRaises(KeyError, - json_normalize, data=i['Trades'], record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], ['general', 'trade_version']], + json_normalize, data=i['Trades'], + record_path=[['general', 'stocks']], + meta=[['general', 'tradeid'], + ['general', 'trade_version']], errors='raise' )