ENH: Json fill_value for missing fields #27073

Closed
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -133,6 +133,7 @@ Other Enhancements
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module>`` is a library implementing the pandas plotting API (:issue:`14130`)
- :func:`io.json.json_normalize` now accepts a ``fill_value`` argument: a dict mapping column names to the default values used for fields missing from a record (:issue:`16918`)

.. _whatsnew_0250.api_breaking:

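For orientation, here is a minimal usage sketch of the feature this PR proposes. The ``fill_value`` argument is introduced by this change (it is not part of released pandas), so the behaviour shown assumes the PR as written; the data is the one from the docstring example further down:

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
            {'name': {'given': 'Mose', 'family': 'Regner'}},
            {'id': 2, 'name': 'Faye Raker'}]

    # Without fill_value, records missing 'id' get NaN; with the proposed
    # argument, each listed column falls back to its own default instead.
    flat = json_normalize(data, fill_value={'id': -1})
    print(flat['id'].tolist())  # [1, -1, 2]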
12 changes: 9 additions & 3 deletions pandas/_libs/lib.pyx
@@ -308,17 +308,23 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):

@cython.wraparound(False)
@cython.boundscheck(False)
-def dicts_to_array(dicts: list, columns: list):
+def dicts_to_array(dicts: list, columns: list, fill_value=None):
    cdef:
        Py_ssize_t i, j, k, n
        ndarray[object, ndim=2] result
        dict row
-        object col, onan = np.nan
+        object col
+        list onan

    k = len(columns)
    n = len(dicts)

    result = np.empty((n, k), dtype='O')
+    if fill_value:
Member:
Rather than do this as an (n x k) array, can't you just assign the appropriate value down on line 336 below?

Contributor Author:

Sure, it can be done that way. I wasn't sure how dictionary value retrieval compares to index retrieval from a list; I was afraid that k x n dictionary accesses would be much slower than k x n index retrievals from a list. Also, this is a (1 x k) array rather than an (n x k) array.

+        onan = ([fill_value[col] if col in fill_value
+                 else np.nan for col in columns])
+    else:
+        onan = list(np.full(k, np.nan))

    for i in range(n):
        row = dicts[i]
@@ -327,7 +333,7 @@ def dicts_to_array(dicts: list, columns: list):
            if col in row:
                result[i, j] = row[col]
            else:
-                result[i, j] = onan
+                result[i, j] = onan[j]

    return result
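To make the trade-off in the review thread above concrete, here is a rough pure-Python sketch of the patched logic (a hypothetical mirror of `dicts_to_array`, not the Cython code itself): the per-column defaults are resolved once into a list, so the hot loop does a list index per missing cell instead of a dict lookup:

    import numpy as np

    def dicts_to_array_sketch(dicts, columns, fill_value=None):
        # Pure-Python approximation of the patched lib.dicts_to_array.
        k, n = len(columns), len(dicts)
        result = np.empty((n, k), dtype=object)
        if fill_value:
            # Resolve defaults once: k dict lookups in total ...
            onan = [fill_value[col] if col in fill_value else np.nan
                    for col in columns]
        else:
            onan = [np.nan] * k
        for i, row in enumerate(dicts):
            for j, col in enumerate(columns):
                # ... instead of one fill_value lookup per missing cell here.
                result[i, j] = row[col] if col in row else onan[j]
        return result

    print(dicts_to_array_sketch([{'a': 1}, {'b': 2}], ['a', 'b'],
                                fill_value={'a': 0}))
    # [[1 nan]
    #  [0 2]]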

5 changes: 3 additions & 2 deletions pandas/core/frame.py
@@ -375,7 +375,7 @@ def _constructor_expanddim(self):
    # Constructors

    def __init__(self, data=None, index=None, columns=None, dtype=None,
-                 copy=False):
+                 copy=False, fill_value=None):
        if data is None:
            data = {}
        if dtype is not None:
@@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
+                    arrays, columns = to_arrays(data, columns, dtype=dtype,
+                                                fill_value=fill_value)
                    columns = ensure_index(columns)

                    # set the index
11 changes: 7 additions & 4 deletions pandas/core/internals/construction.py
@@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns):
# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays

-def to_arrays(data, columns, coerce_float=False, dtype=None):
+def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None):
    """
    Return list of arrays, columns.
    """
@@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
                               dtype=dtype)
    elif isinstance(data[0], abc.Mapping):
        return _list_of_dict_to_arrays(data, columns,
-                                       coerce_float=coerce_float, dtype=dtype)
+                                       coerce_float=coerce_float, dtype=dtype,
+                                       fill_value=fill_value)
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
@@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    return values.T, columns


-def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
+                            fill_value=None):
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    # classes
    data = [(type(d) is dict) and d or dict(d) for d in data]

-    content = list(lib.dicts_to_array(data, list(columns)).T)
+    content = list(lib.dicts_to_array(data, list(columns),
+                                      fill_value=fill_value).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

21 changes: 18 additions & 3 deletions pandas/io/json/normalize.py
@@ -100,7 +100,8 @@ def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
-                   sep='.'):
+                   sep='.',
+                   fill_value=None):
"""
Normalize semi-structured JSON data into a flat table.

@@ -132,6 +133,11 @@

        .. versionadded:: 0.20.0

+    fill_value : dict, default None
+        Per-column default values to use in place of NaN for fields
+        that are missing from a record.
+
+        .. versionadded:: 0.25.0

    Returns
    -------
    frame : DataFrame
@@ -149,6 +155,12 @@
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

+    >>> json_normalize(data, fill_value={'id': -1})
+       id        name name.family name.first name.given name.last
+    0   1         NaN         NaN     Coleen        NaN      Volk
+    1  -1         NaN      Regner        NaN       Mose       NaN
+    2   2  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
@@ -197,6 +209,9 @@ def _pull_field(js, spec):
    if isinstance(data, dict):
        data = [data]

+    if fill_value and not isinstance(fill_value, dict):
+        raise ValueError('Invalid fill_value, fill_value only accepts a dict')

    if record_path is None:
        if any([isinstance(x, dict) for x in y.values()] for y in data):
            # naive normalization, this is idempotent for flat records
@@ -207,7 +222,7 @@ def _pull_field(js, spec):
            # TODO: handle record values which are lists, at least error
            # reasonably
            data = nested_to_record(data, sep=sep)
-        return DataFrame(data)
+        return DataFrame(data, fill_value=fill_value)
    elif not isinstance(record_path, list):
        record_path = [record_path]

@@ -265,7 +280,7 @@ def _recursive_extract(data, path, seen_meta, level=0):

    _recursive_extract(data, record_path, {}, level=0)

-    result = DataFrame(records)
+    result = DataFrame(records, fill_value=fill_value)

    if record_prefix is not None:
        result = result.rename(
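One behavioral note on the validation added above, shown as a hedged sketch assuming the PR as written: the guard `if fill_value and not isinstance(fill_value, dict)` only rejects truthy non-dict values, so a falsy scalar slips past it:

    from pandas.io.json import json_normalize

    try:
        json_normalize([{'a': 1}], fill_value=-1)   # truthy non-dict: rejected
    except ValueError as err:
        print(err)  # Invalid fill_value, fill_value only accepts a dict

    # A falsy non-dict such as 0 is never caught by the guard, so it is
    # silently ignored rather than rejected.
    json_normalize([{'a': 1}], fill_value=0)

A stricter check, e.g. `fill_value is not None and not isinstance(fill_value, dict)`, would close that gap.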
65 changes: 65 additions & 0 deletions pandas/tests/io/json/test_normalize.py
@@ -39,6 +39,34 @@ def deep_nested():
    ]


@pytest.fixture
def deep_nested_missing():
    # deeply nested data with some missing values
    return [{'country': 'USA',
             'states': [{'name': 'California',
                         'cities': [{'name': 'San Francisco',
                                     'pop': 12345},
                                    {'name': 'Los Angeles',
                                     'pop': 12346}]},
                        {'name': 'Ohio',
                         'cities': [{'name': 'Columbus',
                                     'pop': 1234},
                                    {'pop': 1236}]}]},
            {'country': 'Germany',
             'states': [{'name': 'Bayern',
                         'cities': [{'name': 'Munich'}]},
                        {'name': 'Nordrhein-Westfalen',
                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
                                    {'name': 'Koeln'}]}]}
            ]


@pytest.fixture
def state_data():
    return [
@@ -294,6 +322,43 @@ def test_missing_field(self, author_missing_data):
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

    def test_fill_value(self, author_missing_data, deep_nested_missing):
        # GH16918
        result = json_normalize(
            author_missing_data,
            fill_value={'info.last_updated': '27/06/2019'})
        ex_data = [
            {'info': np.nan,
             'author_name.first': np.nan,
             'author_name.last_name': np.nan,
             'info.created_at': np.nan,
             'info.last_updated': '27/06/2019'},
            {'info': None,
             'author_name.first': 'Jane',
             'author_name.last_name': 'Doe',
             'info.created_at': '11/08/1993',
             'info.last_updated': '26/05/2012'}
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

        result = json_normalize(deep_nested_missing, ['states', 'cities'],
                                meta=['country', ['states', 'name']],
                                fill_value={'pop': 0, 'name': 'N/A'})

        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
                                   'Bayern', 'Nordrhein-Westfalen',
                                   'Nordrhein-Westfalen'],
                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
                            'N/A', 'Munich', 'Duesseldorf', 'Koeln'],
                   'pop': [12345, 12346, 1234, 1236, 0, 1238, 0]}

        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)


class TestNestedToRecord:
