Skip to content

Commit 2ca2930

Browse files
committed
ENH: add functional (but pretty slow) json_normalize function for flattening nested records per #1067
1 parent 42bd3f5 commit 2ca2930

File tree

2 files changed

+427
-18
lines changed

2 files changed

+427
-18
lines changed

pandas/io/json.py

+219-18
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
21
# pylint: disable-msg=E1101,W0613,W0603
3-
from StringIO import StringIO
2+
3+
from collections import defaultdict
4+
45
import os
56

67
from pandas import Series, DataFrame, to_datetime
@@ -11,17 +12,19 @@
1112

1213
import numpy as np
1314
from pandas.tslib import iNaT
14-
import pandas.lib as lib
1515

1616
### interface to/from ###
1717

18-
def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True):
19-
18+
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
19+
double_precision=10, force_ascii=True):
20+
2021
if isinstance(obj, Series):
21-
s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
22+
s = SeriesWriter(obj, orient=orient, date_format=date_format,
23+
double_precision=double_precision,
2224
ensure_ascii=force_ascii).write()
2325
elif isinstance(obj, DataFrame):
24-
s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
26+
s = FrameWriter(obj, orient=orient, date_format=date_format,
27+
double_precision=double_precision,
2528
ensure_ascii=force_ascii).write()
2629
else:
2730
raise NotImplementedError
@@ -36,12 +39,13 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision
3639

3740
class Writer(object):
3841

39-
def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
42+
def __init__(self, obj, orient, date_format, double_precision,
43+
ensure_ascii):
4044
self.obj = obj
4145

4246
if orient is None:
4347
orient = self._default_orient
44-
48+
4549
self.orient = orient
4650
self.date_format = date_format
4751
self.double_precision = double_precision
@@ -64,15 +68,18 @@ def _format_to_date(self, data):
6468
if self._needs_to_date(data):
6569
return data.apply(lambda x: x.isoformat())
6670
return data
67-
71+
6872
def copy_if_needed(self):
6973
""" copy myself if necessary """
7074
if not self.is_copy:
7175
self.obj = self.obj.copy()
7276
self.is_copy = True
7377

7478
def write(self):
75-
return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii)
79+
return dumps(self.obj, orient=self.orient,
80+
double_precision=self.double_precision,
81+
ensure_ascii=self.ensure_ascii)
82+
7683

7784
class SeriesWriter(Writer):
7885
_default_orient = 'index'
@@ -186,13 +193,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
186193
return obj
187194

188195
class Parser(object):
189-
196+
190197
def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
191198
self.json = json
192199

193200
if orient is None:
194201
orient = self._default_orient
195-
202+
196203
self.orient = orient
197204
self.dtype = dtype
198205

@@ -207,7 +214,7 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr
207214

208215
def parse(self):
209216

210-
# try numpy
217+
# try numpy
211218
numpy = self.numpy
212219
if numpy:
213220
self._parse_numpy()
@@ -269,7 +276,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
269276
pass
270277

271278
if data.dtype == 'float':
272-
279+
273280
# coerce floats to 64
274281
try:
275282
data = data.astype('float64')
@@ -291,7 +298,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
291298

292299
# coerce ints to 64
293300
if data.dtype == 'int':
294-
301+
295302
# coerce floats to 64
296303
try:
297304
data = data.astype('int64')
@@ -322,7 +329,7 @@ def _try_convert_to_date(self, data):
322329
if issubclass(new_data.dtype.type,np.number):
323330
if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all():
324331
return data, False
325-
332+
326333
try:
327334
new_data = to_datetime(new_data)
328335
except:
@@ -342,7 +349,7 @@ class SeriesParser(Parser):
342349
_default_orient = 'index'
343350

344351
def _parse_no_numpy(self):
345-
352+
346353
json = self.json
347354
orient = self.orient
348355
if orient == "split":
@@ -446,3 +453,197 @@ def is_ok(col):
446453
new_data, result = self._try_convert_to_date(self.obj[col])
447454
if result:
448455
self.obj[col] = new_data
456+
457+
458+
#----------------------------------------------------------------------
459+
# JSON normalization routines
460+
461+
def nested_to_record(ds, prefix="", level=0):
    """a simplified json_normalize

    converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
        object(s) to flatten; each dict is modified in place
    prefix : str, default ""
        dotted key path accumulated from enclosing levels (recursion only)
    level : int, default 0
        current nesting depth (recursion only)

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Example:
    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),nested=dict(e=dict(c=1,d=2),d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True

    for d in ds:
        # snapshot the items: keys are popped/renamed inside the loop, and
        # mutating a dict while iterating its live view raises on Python 3
        # (Python 2 .items() returned a list, which masked the problem)
        for k, v in list(d.items()):
            # each key gets renamed with the dotted path of its parents
            if level == 0:
                newkey = str(k)
            else:
                newkey = prefix + '.' + str(k)

            # only dicts get recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict):
                if level != 0:  # so we skip copying for top level, common case
                    v = d.pop(k)
                    d[newkey] = v
                continue
            else:
                v = d.pop(k)
                d.update(nested_to_record(v, newkey, level + 1))

    if singleton:
        return ds[0]
    return ds
513+
514+
def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None):
    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings)
        Fields to use as metadata for each record in resulting table
    record_prefix : string, default None
        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
        path to records is ['foo', 'bar']
    meta_prefix : string, default None

    Examples
    --------
    data = [{'state': 'Florida',
             'shortname': 'FL',
             'info': {
                  'governor': 'Rick Scott'
             },
             'counties': [{'name': 'Dade', 'population': 12345},
                         {'name': 'Broward', 'population': 40000},
                         {'name': 'Palm Beach', 'population': 60000}]},
            {'state': 'Ohio',
             'shortname': 'OH',
             'info': {
                  'governor': 'John Kasich'
             },
             'counties': [{'name': 'Summit', 'population': 1234},
                          {'name': 'Cuyahoga', 'population': 1337}]}]

    result = json_normalize(data, 'counties', ['state', 'shortname',
                                              ['info', 'governor']])

    state    governor
    Florida  Rick Scott


    Returns
    -------
    frame : DataFrame
    """
    def _pull_field(js, spec):
        # walk a list-of-keys path, or do a single key lookup
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # guard data[0] against an empty input (would raise IndexError)
        if data and any(isinstance(x, dict) for x in data[0].values()):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # normalize each meta entry to a list path; build a fresh list so the
    # caller's `meta` argument is not mutated in place
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    meta_keys = ['.'.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if len(path) > 1:
            # descend one level, remembering any meta value that lives here
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # meta value was captured at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        meta_val = _pull_field(obj, val[level:])
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result.rename(columns=lambda x: record_prefix + x, inplace=True)

    # Data types, a problem
    # (.items() rather than py2-only .iteritems())
    for k, v in meta_vals.items():
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        result[k] = np.array(v).repeat(lengths)

    return result

0 commit comments

Comments
 (0)