Add optional fill_value for NaN in json_normalize

Jiang Yue · Jiang Yue · commit b25faf704436 · 2019-06-27T11:31:36.000+08:00
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -308,17 +308,22 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def dicts_to_array(dicts: list, columns: list):
+def dicts_to_array(dicts: list, columns: list, fill_value=None):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[object, ndim=2] result
         dict row
-        object col, onan = np.nan
+        object col
+        list onan
 
     k = len(columns)
     n = len(dicts)
 
     result = np.empty((n, k), dtype='O')
+    if fill_value:
+        onan = [fill_value[col] if col in fill_value else np.nan for col in columns]
+    else:
+        onan = list(np.full(k, np.nan))
 
     for i in range(n):
         row = dicts[i]
@@ -327,7 +332,7 @@ def dicts_to_array(dicts: list, columns: list):
             if col in row:
                 result[i, j] = row[col]
             else:
-                result[i, j] = onan
+                result[i, j] = onan[j]
 
     return result
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -375,7 +375,7 @@ def _constructor_expanddim(self):
     # Constructors
 
     def __init__(self, data=None, index=None, columns=None, dtype=None,
-                 copy=False):
+                 copy=False, fill_value=None):
         if data is None:
             data = {}
         if dtype is not None:
@@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
                 if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                     if is_named_tuple(data[0]) and columns is None:
                         columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
+                    arrays, columns = to_arrays(data, columns, dtype=dtype,
+                                                fill_value=fill_value)
                     columns = ensure_index(columns)
 
                     # set the index
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns):
 # ---------------------------------------------------------------------
 # Conversion of Inputs to Arrays
 
-def to_arrays(data, columns, coerce_float=False, dtype=None):
+def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None):
     """
     Return list of arrays, columns.
     """
@@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
                                dtype=dtype)
     elif isinstance(data[0], abc.Mapping):
         return _list_of_dict_to_arrays(data, columns,
-                                       coerce_float=coerce_float, dtype=dtype)
+                                       coerce_float=coerce_float, dtype=dtype,
+                                       fill_value=fill_value)
     elif isinstance(data[0], ABCSeries):
         return _list_of_series_to_arrays(data, columns,
                                          coerce_float=coerce_float,
@@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
         return values.T, columns
 
 
-def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
+def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
+                            fill_value=None):
     if columns is None:
         gen = (list(x.keys()) for x in data)
         sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
     # classes
     data = [(type(d) is dict) and d or dict(d) for d in data]
 
-    content = list(lib.dicts_to_array(data, list(columns)).T)
+    content = list(lib.dicts_to_array(data, list(columns),
+                                      fill_value=fill_value).T)
     return _convert_object_array(content, columns, dtype=dtype,
                                  coerce_float=coerce_float)
 
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
@@ -96,7 +96,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
     return new_ds
 
 
-def json_normalize(data, record_path=None, meta=None,
+def json_normalize(data, fill_value=None, record_path=None, meta=None,
                    meta_prefix=None,
                    record_prefix=None,
                    errors='raise',
@@ -108,6 +108,8 @@ def json_normalize(data, record_path=None, meta=None,
     ----------
     data : dict or list of dicts
         Unserialized JSON objects
+    fill_value : dict, default None
+        Maps column name to the value used instead of NaN when missing.
     record_path : string or list of strings, default None
         Path in each object to list of records. If not passed, data will be
         assumed to be an array of records
@@ -149,6 +151,12 @@ def json_normalize(data, record_path=None, meta=None,
     1  NaN         NaN      Regner        NaN       Mose       NaN
     2  2.0  Faye Raker         NaN        NaN        NaN       NaN
 
+    >>> json_normalize(data, fill_value={'id': -1})
+       id        name name.family name.first name.given name.last
+    0   1         NaN         NaN     Coleen        NaN      Volk
+    1  -1         NaN      Regner        NaN       Mose       NaN
+    2   2  Faye Raker         NaN        NaN        NaN       NaN
+
     >>> data = [{'state': 'Florida',
     ...          'shortname': 'FL',
     ...          'info': {
@@ -197,6 +205,9 @@ def _pull_field(js, spec):
     if isinstance(data, dict):
         data = [data]
 
+    if fill_value and not isinstance(fill_value, dict):
+        raise ValueError('Invalid fill_value, fill_value only accepts a dict')
+
     if record_path is None:
         if any([isinstance(x, dict) for x in y.values()] for y in data):
             # naive normalization, this is idempotent for flat records
@@ -207,7 +218,7 @@ def _pull_field(js, spec):
             # TODO: handle record value which are lists, at least error
             #       reasonably
             data = nested_to_record(data, sep=sep)
-        return DataFrame(data)
+        return DataFrame(data, fill_value=fill_value)
     elif not isinstance(record_path, list):
         record_path = [record_path]
 
@@ -265,7 +276,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
 
     _recursive_extract(data, record_path, {}, level=0)
 
-    result = DataFrame(records)
+    result = DataFrame(records, fill_value=fill_value)
 
     if record_prefix is not None:
         result = result.rename(