ENH: Json fill_value for missing fields #27073

Closed
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -133,6 +133,7 @@ Other Enhancements
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module>`` is a library implementing the pandas plotting API (:issue:`14130`)
- :func:`io.json.json_normalize` now accepts a ``fill_value`` argument: a dict mapping column names to the default values used for fields missing from a record (:issue:`16918`)

.. _whatsnew_0250.api_breaking:

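For orientation, here is a minimal usage sketch of the feature this PR proposes. The ``fill_value`` argument is introduced by this change (it is not part of released pandas), so the behaviour shown assumes the PR as written; the data is the one from the docstring example further down:

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
            {'name': {'given': 'Mose', 'family': 'Regner'}},
            {'id': 2, 'name': 'Faye Raker'}]

    # Without fill_value, records missing 'id' get NaN; with the proposed
    # argument, each listed column falls back to its own default instead.
    flat = json_normalize(data, fill_value={'id': -1})
    print(flat['id'].tolist())  # [1, -1, 2]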
12 changes: 9 additions & 3 deletions pandas/_libs/lib.pyx
@@ -308,17 +308,23 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):

@cython.wraparound(False)
@cython.boundscheck(False)
-def dicts_to_array(dicts: list, columns: list):
+def dicts_to_array(dicts: list, columns: list, fill_value=None):
    cdef:
        Py_ssize_t i, j, k, n
        ndarray[object, ndim=2] result
        dict row
-        object col, onan = np.nan
+        object col
+        list onan

    k = len(columns)
    n = len(dicts)

    result = np.empty((n, k), dtype='O')
+    if fill_value:
Member:
Rather than do this as an (n x k) array, can't you just assign the appropriate value down on line 336 below?

Contributor Author:

Sure, it can be done that way. I wasn't sure how dictionary value retrieval compares to index retrieval from a list; I was afraid that k x n dictionary accesses would be much slower than k x n index retrievals from a list. Also, this is a (1 x k) array rather than an (n x k) array.

+        onan = ([fill_value[col] if col in fill_value
+                 else np.nan for col in columns])
+    else:
+        onan = list(np.full(k, np.nan))

    for i in range(n):
        row = dicts[i]
@@ -327,7 +333,7 @@ def dicts_to_array(dicts: list, columns: list):
            if col in row:
                result[i, j] = row[col]
            else:
-                result[i, j] = onan
+                result[i, j] = onan[j]

    return result
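To make the trade-off in the review thread above concrete, here is a rough pure-Python sketch of the patched logic (a hypothetical mirror of `dicts_to_array`, not the Cython code itself): the per-column defaults are resolved once into a list, so the hot loop does a list index per missing cell instead of a dict lookup:

    import numpy as np

    def dicts_to_array_sketch(dicts, columns, fill_value=None):
        # Pure-Python approximation of the patched lib.dicts_to_array.
        k, n = len(columns), len(dicts)
        result = np.empty((n, k), dtype=object)
        if fill_value:
            # Resolve defaults once: k dict lookups in total ...
            onan = [fill_value[col] if col in fill_value else np.nan
                    for col in columns]
        else:
            onan = [np.nan] * k
        for i, row in enumerate(dicts):
            for j, col in enumerate(columns):
                # ... instead of one fill_value lookup per missing cell here.
                result[i, j] = row[col] if col in row else onan[j]
        return result

    print(dicts_to_array_sketch([{'a': 1}, {'b': 2}], ['a', 'b'],
                                fill_value={'a': 0}))
    # [[1 nan]
    #  [0 2]]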

5 changes: 3 additions & 2 deletions pandas/core/frame.py
@@ -375,7 +375,7 @@ def _constructor_expanddim(self):
    # Constructors

    def __init__(self, data=None, index=None, columns=None, dtype=None,
-                 copy=False):
+                 copy=False, fill_value=None):
        if data is None:
            data = {}
        if dtype is not None:
@@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
+                    arrays, columns = to_arrays(data, columns, dtype=dtype,
+                                                fill_value=fill_value)
                    columns = ensure_index(columns)

                    # set the index
11 changes: 7 additions & 4 deletions pandas/core/internals/construction.py
@@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns):
# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays

-def to_arrays(data, columns, coerce_float=False, dtype=None):
+def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None):
    """
    Return list of arrays, columns.
    """
@@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
                               dtype=dtype)
    elif isinstance(data[0], abc.Mapping):
        return _list_of_dict_to_arrays(data, columns,
-                                       coerce_float=coerce_float, dtype=dtype)
+                                       coerce_float=coerce_float, dtype=dtype,
+                                       fill_value=fill_value)
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
@@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    return values.T, columns


-def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
+                            fill_value=None):
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    # classes
    data = [(type(d) is dict) and d or dict(d) for d in data]

-    content = list(lib.dicts_to_array(data, list(columns)).T)
+    content = list(lib.dicts_to_array(data, list(columns),
+                                      fill_value=fill_value).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

21 changes: 18 additions & 3 deletions pandas/io/json/normalize.py
@@ -100,7 +100,8 @@ def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
-                   sep='.'):
+                   sep='.',
+                   fill_value=None):
"""
Normalize semi-structured JSON data into a flat table.

@@ -132,6 +133,11 @@

        .. versionadded:: 0.20.0

+    fill_value : dict, default None
+        Per-column default values to use in place of NaN for fields
+        that are missing from a record.
+
+        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
@@ -149,6 +155,12 @@
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

+    >>> json_normalize(data, fill_value={'id': -1})
+       id        name name.family name.first name.given name.last
+    0   1         NaN         NaN     Coleen        NaN      Volk
+    1  -1         NaN      Regner        NaN       Mose       NaN
+    2   2  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
@@ -197,6 +209,9 @@ def _pull_field(js, spec):
    if isinstance(data, dict):
        data = [data]

+    if fill_value and not isinstance(fill_value, dict):
+        raise ValueError('Invalid fill_value, fill_value only accepts a dict')

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
@@ -207,7 +222,7 @@ def _pull_field(js, spec):
            # TODO: handle record values which are lists, at least error
            # reasonably
            data = nested_to_record(data, sep=sep)
-        return DataFrame(data)
+        return DataFrame(data, fill_value=fill_value)
    elif not isinstance(record_path, list):
        record_path = [record_path]

@@ -265,7 +280,7 @@ def _recursive_extract(data, path, seen_meta, level=0):

    _recursive_extract(data, record_path, {}, level=0)

-    result = DataFrame(records)
+    result = DataFrame(records, fill_value=fill_value)

    if record_prefix is not None:
        result = result.rename(
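One behavioral note on the validation added above, shown as a hedged sketch assuming the PR as written: the guard `if fill_value and not isinstance(fill_value, dict)` only rejects truthy non-dict values, so a falsy scalar slips past it:

    from pandas.io.json import json_normalize

    try:
        json_normalize([{'a': 1}], fill_value=-1)   # truthy non-dict: rejected
    except ValueError as err:
        print(err)  # Invalid fill_value, fill_value only accepts a dict

    # A falsy non-dict such as 0 is never caught by the guard, so it is
    # silently ignored rather than rejected.
    json_normalize([{'a': 1}], fill_value=0)

A stricter check, e.g. `fill_value is not None and not isinstance(fill_value, dict)`, would close that gap.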
65 changes: 65 additions & 0 deletions pandas/tests/io/json/test_normalize.py
@@ -39,6 +39,34 @@ def deep_nested():
    ]


@pytest.fixture
def deep_nested_missing():
    # deeply nested data with some missing values
    return [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco',
                                     'pop': 12345},
                                    {'name': 'Los Angeles',
                                     'pop': 12346}]},
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus',
                                     'pop': 1234},
                                    {'pop': 1236}]}]},
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich'}]},
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln'}]}]}
            ]


@pytest.fixture
def state_data():
    return [
@@ -294,6 +322,43 @@ def test_missing_field(self, author_missing_data):
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

    def test_fill_value(self, author_missing_data, deep_nested_missing):
        # GH16918
        result = json_normalize(
            author_missing_data,
            fill_value={'info.last_updated': '27/06/2019'})
        ex_data = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': '27/06/2019'},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

        result = json_normalize(deep_nested_missing, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                fill_value={'pop': 0, 'name': 'N/A'})

        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
                                   'Bayern', 'Nordrhein-Westfalen',
                                   'Nordrhein-Westfalen'],
                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
                            'N/A', 'Munich', 'Duesseldorf', 'Koeln'],
                   'pop': [12345, 12346, 1234, 1236, 0, 1238, 0]}

        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)


class TestNestedToRecord:
