Commit 4790b93

wesm authored and jreback committed
ENH: add functional (but pretty slow) json_normalize function for flattening nested records per #1067
1 parent bc6787a commit 4790b93

File tree: 3 files changed (+416, -3 lines)


pandas/io/json.py (+196, -3)
@@ -1,6 +1,7 @@
 # pylint: disable-msg=E1101,W0613,W0603
-import os
 
+import os
+from collections import defaultdict
 import numpy as np
 
 import pandas.json as _json
@@ -15,7 +16,6 @@
 dumps = _json.dumps
 ### interface to/from ###
 
-
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms'):
 
@@ -71,7 +71,6 @@ def write(self):
                      date_unit=self.date_unit,
                      iso_dates=self.date_format == 'iso')
 
-
 class SeriesWriter(Writer):
     _default_orient = 'index'
 
@@ -537,3 +536,197 @@ def is_ok(col):
             lambda col, c: self._try_convert_to_date(c),
             lambda col, c: ((self.keep_default_dates and is_ok(col))
                             or col in convert_dates))
+
+
+#----------------------------------------------------------------------
+# JSON normalization routines
+
+def nested_to_record(ds, prefix="", level=0):
+    """a simplified json_normalize
+
+    converts a nested dict into a flat dict ("record"); unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+
+    Returns
+    -------
+    d : dict or list of dicts, matching `ds`
+
+    Example:
+    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    for d in ds:
+        for k, v in d.items():  # modifying keys inside loop, not lazy
+            # each key gets renamed with prefix
+            if level == 0:
+                newkey = str(k)
+            else:
+                newkey = prefix + '.' + str(k)
+
+            # only dicts get recurse-flattened;
+            # only at level > 1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = d.pop(k)
+                    d[newkey] = v
+                continue
+            else:
+                v = d.pop(k)
+                d.update(nested_to_record(v, newkey, level + 1))
+
+    if singleton:
+        return ds[0]
+    return ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None):
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings)
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+
+    Examples
+    --------
+    data = [{'state': 'Florida',
+             'shortname': 'FL',
+             'info': {
+                  'governor': 'Rick Scott'
+             },
+             'counties': [{'name': 'Dade', 'population': 12345},
+                          {'name': 'Broward', 'population': 40000},
+                          {'name': 'Palm Beach', 'population': 60000}]},
+            {'state': 'Ohio',
+             'shortname': 'OH',
+             'info': {
+                  'governor': 'John Kasich'
+             },
+             'counties': [{'name': 'Summit', 'population': 1234},
+                          {'name': 'Cuyahoga', 'population': 1337}]}]
+
+    result = json_normalize(data, 'counties', ['state', 'shortname',
+                                               ['info', 'governor']])
+
+       state    governor
+    Florida   Rick Scott
+
+
+    Returns
+    -------
+    frame : DataFrame
+    """
+    def _pull_field(js, spec):
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any([isinstance(x, dict) for x in data[0].itervalues()]):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error reasonably
+            data = nested_to_record(data)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in meta_vals.iteritems():
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix ' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
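
For reference, a minimal usage sketch of the two functions introduced above, run against the sample data from the json_normalize docstring. This is illustrative only and assumes a Python 2-era checkout with this patch applied (the implementation relies on dict.itervalues/iteritems); the functions are imported from pandas.io.json, where this diff defines them, and the expected shapes noted in the comments follow from the code above.

    from pandas.io.json import json_normalize, nested_to_record

    # Flatten a nested dict into a single "record" (example from the docstring).
    flat = nested_to_record({'flat1': 1,
                             'dict1': {'c': 1, 'd': 2},
                             'nested': {'e': {'c': 1, 'd': 2}, 'd': 2}})
    # -> {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2,
    #     'nested.d': 2, 'nested.e.c': 1, 'nested.e.d': 2}

    # Build a flat table of county records, repeating per-state metadata.
    data = [{'state': 'Florida', 'shortname': 'FL',
             'info': {'governor': 'Rick Scott'},
             'counties': [{'name': 'Dade', 'population': 12345},
                          {'name': 'Broward', 'population': 40000},
                          {'name': 'Palm Beach', 'population': 60000}]},
            {'state': 'Ohio', 'shortname': 'OH',
             'info': {'governor': 'John Kasich'},
             'counties': [{'name': 'Summit', 'population': 1234},
                          {'name': 'Cuyahoga', 'population': 1337}]}]

    result = json_normalize(data, 'counties',
                            ['state', 'shortname', ['info', 'governor']])
    # One row per county; columns are the record fields ('name', 'population')
    # plus the meta fields 'state', 'shortname', and 'info.governor' (meta paths
    # are joined with '.'), each meta value repeated to match the number of
    # records pulled from its parent object.

Note that json_normalize(data) with no record_path simply passes the objects through nested_to_record and builds a DataFrame from the flattened records, which is the slow-but-functional path the commit message refers to.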
