diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8d88a7b4fb215..7efaaaa39871c 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -9,7 +9,6 @@ users upgrade to this version.
 
 Highlights include:
 
-- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
 
 Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations <whatsnew_0200.deprecations>` before updating.
 
@@ -61,6 +60,8 @@ Other enhancements
 - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
 - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`)
 - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
+- ``pandas.io.json.json_normalize`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'``, which is backward compatible. (:issue:`14583`)
+
 
 .. _whatsnew_0200.api_breaking:
 
diff --git a/pandas/io/json.py b/pandas/io/json.py
index 878506a6ddc05..da540a8797578 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -723,7 +723,9 @@ def nested_to_record(ds, prefix="", level=0):
 
 def json_normalize(data, record_path=None, meta=None,
                    meta_prefix=None,
-                   record_prefix=None):
+                   record_prefix=None,
+                   errors='raise'):
+
     """
     "Normalize" semi-structured JSON data into a flat table
 
@@ -740,6 +742,12 @@ def json_normalize(data, record_path=None, meta=None,
         If True, prefix records with dotted (?) path, e.g. foo.bar.field if
         path to records is ['foo', 'bar']
     meta_prefix : string, default None
+    errors : {'raise', 'ignore'}, default 'raise'
+        * 'ignore' : use NaN when a key listed in meta is not always
+          present
+        * 'raise' : raise KeyError when a key listed in meta is missing
+
+        .. versionadded:: 0.20.0
 
     Returns
     -------
@@ -839,7 +847,16 @@ def _recursive_extract(data, path, seen_meta, level=0):
                         if level + 1 > len(val):
                             meta_val = seen_meta[key]
                         else:
-                            meta_val = _pull_field(obj, val[level:])
+                            try:
+                                meta_val = _pull_field(obj, val[level:])
+                            except KeyError as e:
+                                if errors == 'ignore':
+                                    meta_val = np.nan
+                                else:
+                                    raise \
+                                        KeyError("Try running with "
+                                                 "errors='ignore' as key "
+                                                 "%s is not always present" % e)
                         meta_vals[key].append(meta_val)
 
                     records.extend(recs)
diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
index 4848db97194d9..36110898448ea 100644
--- a/pandas/io/tests/json/test_json_norm.py
+++ b/pandas/io/tests/json/test_json_norm.py
@@ -225,6 +225,65 @@ def test_nested_flattens(self):
 
         self.assertEqual(result, expected)
 
+    def test_json_normalize_errors(self):
+        # GH14583: If meta keys are not always present
+        # a new option to set errors='ignore' has been implemented
+        i = {
+            "Trades": [{
+                "general": {
+                    "tradeid": 100,
+                    "trade_version": 1,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }
+                    ]
+                }
+            }, {
+                "general": {
+                    "tradeid": 100,
+                    "stocks": [{
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+                    }
+                    ]
+                }
+            }
+            ]
+        }
+        j = json_normalize(data=i['Trades'],
+                           record_path=[['general', 'stocks']],
+                           meta=[['general', 'tradeid'],
+                                 ['general', 'trade_version']],
+                           errors='ignore')
+        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
+                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
+                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
+                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
+                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
+
+        self.assertEqual(j.fillna('').to_dict(), expected)
+
+        self.assertRaises(KeyError,
+                          json_normalize, data=i['Trades'],
+                          record_path=[['general', 'stocks']],
+                          meta=[['general', 'tradeid'],
+                                ['general', 'trade_version']],
+                          errors='raise'
+                          )
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',
                          '--pdb-failure', '-s'], exit=False)