diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1840c47b4054f..2cabbc2e400b3 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -214,10 +214,10 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - -- Plotting diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 279630ccd107c..7a8188dd07b6b 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0): raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - result[k] = np.array(v).repeat(lengths) + # force dtype to object so the metadata is not cast to string + result[k] = np.array(v, dtype=object).repeat(lengths) return result diff --git a/pandas/tests/io/json/test_normalize.py 
b/pandas/tests/io/json/test_normalize.py index 3bf699cc8a1f0..5362274274d72 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -66,6 +66,25 @@ def author_missing_data(): }] +@pytest.fixture +def missing_metadata(): + return [ + {'name': 'Alice', + 'addresses': [{'number': 9562, + 'street': 'Morris St.', + 'city': 'Massillon', + 'state': 'OH', + 'zip': 44646}] + }, + {'addresses': [{'number': 8449, + 'street': 'Spring St.', + 'city': 'Elizabethton', + 'state': 'TN', + 'zip': 37643}] + } + ] + + class TestJSONNormalize(object): def test_simple_records(self): @@ -318,66 +337,51 @@ def test_nested_flattens(self): assert result == expected - def test_json_normalize_errors(self): - # GH14583: If meta keys are not always present - # a new option to set errors='ignore' has been implemented - i = { - "Trades": [{ - "general": { - "tradeid": 100, - "trade_version": 1, - "stocks": [{ - - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - }, { - "general": { - "tradeid": 100, - "stocks": [{ - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - } - ] - } - j = json_normalize(data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], - errors='ignore') - expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, - 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, - 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, - 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, - 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} - - assert j.fillna('').to_dict() == expected - - msg = ("Try running with errors='ignore' as key 'trade_version'" + def test_json_normalize_errors(self, missing_metadata): + # GH14583: + # If meta keys are not always present a new option to set + # errors='ignore' has been implemented 
+ + msg = ("Try running with errors='ignore' as key 'name'" " is not always present") with pytest.raises(KeyError, match=msg): json_normalize( - data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], + data=missing_metadata, + record_path='addresses', + meta='name', errors='raise') + def test_missing_meta(self, missing_metadata): + # GH25468 + # If metadata is nullable with errors set to ignore, the null values + # should be numpy.nan values + result = json_normalize( + data=missing_metadata, + record_path='addresses', + meta='name', + errors='ignore') + ex_data = [ + {'city': 'Massillon', + 'number': 9562, + 'state': 'OH', + 'street': 'Morris St.', + 'zip': 44646, + 'name': 'Alice'}, + {'city': 'Elizabethton', + 'number': 8449, + 'state': 'TN', + 'street': 'Spring St.', + 'zip': 37643, + 'name': np.nan} + ] + ex_data = [ + ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], + ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] + ] + columns = ['city', 'number', 'state', 'street', 'zip', 'name'] + expected = DataFrame(ex_data, columns=columns) + tm.assert_frame_equal(result, expected) + def test_donot_drop_nonevalues(self): # GH21356 data = [