Skip to content

Commit b25faf7

Browse files
author
Jiang Yue
committed
add optional fill_value for nan in json_normalize
1 parent c9182df commit b25faf7

File tree

4 files changed

+32
-12
lines changed

4 files changed

+32
-12
lines changed

pandas/_libs/lib.pyx

+8-3
Original file line numberDiff line numberDiff line change
@@ -308,17 +308,22 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True):
308308

309309
@cython.wraparound(False)
310310
@cython.boundscheck(False)
311-
def dicts_to_array(dicts: list, columns: list):
311+
def dicts_to_array(dicts: list, columns: list, fill_value=None):
312312
cdef:
313313
Py_ssize_t i, j, k, n
314314
ndarray[object, ndim=2] result
315315
dict row
316-
object col, onan = np.nan
316+
object col
317+
list onan
317318

318319
k = len(columns)
319320
n = len(dicts)
320321

321322
result = np.empty((n, k), dtype='O')
323+
if fill_value:
324+
onan = [fill_value[col] if col in fill_value else np.nan for col in columns]
325+
else:
326+
onan = list(np.full(k, np.nan))
322327

323328
for i in range(n):
324329
row = dicts[i]
@@ -327,7 +332,7 @@ def dicts_to_array(dicts: list, columns: list):
327332
if col in row:
328333
result[i, j] = row[col]
329334
else:
330-
result[i, j] = onan
335+
result[i, j] = onan[j]
331336

332337
return result
333338

pandas/core/frame.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ def _constructor_expanddim(self):
375375
# Constructors
376376

377377
def __init__(self, data=None, index=None, columns=None, dtype=None,
378-
copy=False):
378+
copy=False, fill_value=None):
379379
if data is None:
380380
data = {}
381381
if dtype is not None:
@@ -431,7 +431,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
431431
if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
432432
if is_named_tuple(data[0]) and columns is None:
433433
columns = data[0]._fields
434-
arrays, columns = to_arrays(data, columns, dtype=dtype)
434+
arrays, columns = to_arrays(data, columns, dtype=dtype,
435+
fill_value=fill_value)
435436
columns = ensure_index(columns)
436437

437438
# set the index

pandas/core/internals/construction.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ def _get_axes(N, K, index, columns):
371371
# ---------------------------------------------------------------------
372372
# Conversion of Inputs to Arrays
373373

374-
def to_arrays(data, columns, coerce_float=False, dtype=None):
374+
def to_arrays(data, columns, coerce_float=False, dtype=None, fill_value=None):
375375
"""
376376
Return list of arrays, columns.
377377
"""
@@ -396,7 +396,8 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
396396
dtype=dtype)
397397
elif isinstance(data[0], abc.Mapping):
398398
return _list_of_dict_to_arrays(data, columns,
399-
coerce_float=coerce_float, dtype=dtype)
399+
coerce_float=coerce_float, dtype=dtype,
400+
fill_value=fill_value)
400401
elif isinstance(data[0], ABCSeries):
401402
return _list_of_series_to_arrays(data, columns,
402403
coerce_float=coerce_float,
@@ -463,7 +464,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
463464
return values.T, columns
464465

465466

466-
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
467+
def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None,
468+
fill_value=None):
467469
if columns is None:
468470
gen = (list(x.keys()) for x in data)
469471
sort = not any(isinstance(d, OrderedDict) for d in data)
@@ -473,7 +475,8 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
473475
# classes
474476
data = [(type(d) is dict) and d or dict(d) for d in data]
475477

476-
content = list(lib.dicts_to_array(data, list(columns)).T)
478+
content = list(lib.dicts_to_array(data, list(columns),
479+
fill_value=fill_value).T)
477480
return _convert_object_array(content, columns, dtype=dtype,
478481
coerce_float=coerce_float)
479482

pandas/io/json/normalize.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
9696
return new_ds
9797

9898

99-
def json_normalize(data, record_path=None, meta=None,
99+
def json_normalize(data, fill_value=None, record_path=None, meta=None,
100100
meta_prefix=None,
101101
record_prefix=None,
102102
errors='raise',
@@ -108,6 +108,8 @@ def json_normalize(data, record_path=None, meta=None,
108108
----------
109109
data : dict or list of dicts
110110
Unserialized JSON objects
111+
fill_value : dict, default None
112+
Values to use instead of NaN for missing entries, keyed by column name
111113
record_path : string or list of strings, default None
112114
Path in each object to list of records. If not passed, data will be
113115
assumed to be an array of records
@@ -149,6 +151,12 @@ def json_normalize(data, record_path=None, meta=None,
149151
1 NaN NaN Regner NaN Mose NaN
150152
2 2.0 Faye Raker NaN NaN NaN NaN
151153
154+
>>> json_normalize(data, fill_value={'id' : -1})
155+
id name name.family name.first name.given name.last
156+
0 1 NaN NaN Coleen NaN Volk
157+
1 -1 NaN Regner NaN Mose NaN
158+
2 2 Faye Raker NaN NaN NaN NaN
159+
152160
>>> data = [{'state': 'Florida',
153161
... 'shortname': 'FL',
154162
... 'info': {
@@ -197,6 +205,9 @@ def _pull_field(js, spec):
197205
if isinstance(data, dict):
198206
data = [data]
199207

208+
if fill_value and not isinstance(fill_value, dict):
209+
raise ValueError('Invalid fill_value, fill_value only accepts a dict')
210+
200211
if record_path is None:
201212
if any([isinstance(x, dict) for x in y.values()] for y in data):
202213
# naive normalization, this is idempotent for flat records
@@ -207,7 +218,7 @@ def _pull_field(js, spec):
207218
# TODO: handle record value which are lists, at least error
208219
# reasonably
209220
data = nested_to_record(data, sep=sep)
210-
return DataFrame(data)
221+
return DataFrame(data, fill_value=fill_value)
211222
elif not isinstance(record_path, list):
212223
record_path = [record_path]
213224

@@ -265,7 +276,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
265276

266277
_recursive_extract(data, record_path, {}, level=0)
267278

268-
result = DataFrame(records)
279+
result = DataFrame(records, fill_value=fill_value)
269280

270281
if record_prefix is not None:
271282
result = result.rename(

0 commit comments

Comments
 (0)