@@ -1,6 +1,7 @@
-
 # pylint: disable-msg=E1101,W0613,W0603
-from StringIO import StringIO
+
+from collections import defaultdict
+
 import os
 
 from pandas import Series, DataFrame, to_datetime
@@ -11,17 +12,19 @@
 
 import numpy as np
 from pandas.tslib import iNaT
-import pandas.lib as lib
 
 ### interface to/from ###
 
-def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True):
-
+def to_json(path_or_buf, obj, orient=None, date_format='epoch',
+            double_precision=10, force_ascii=True):
+
     if isinstance(obj, Series):
-        s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
+        s = SeriesWriter(obj, orient=orient, date_format=date_format,
+                         double_precision=double_precision,
                          ensure_ascii=force_ascii).write()
     elif isinstance(obj, DataFrame):
-        s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision,
+        s = FrameWriter(obj, orient=orient, date_format=date_format,
+                        double_precision=double_precision,
                         ensure_ascii=force_ascii).write()
     else:
         raise NotImplementedError
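A minimal usage sketch of the rewrapped helper (assuming, as in released pandas of this era, that passing path_or_buf=None makes to_json return the JSON string instead of writing it out):

```python
from pandas import DataFrame

df = DataFrame({'a': [1, 2], 'b': [3, 4]})
s = to_json(None, df, orient='columns', double_precision=10)
print(s)  # e.g. '{"a":{"0":1,"1":2},"b":{"0":3,"1":4}}'
```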
@@ -36,12 +39,13 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision
 
 class Writer(object):
 
-    def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
+    def __init__(self, obj, orient, date_format, double_precision,
+                 ensure_ascii):
         self.obj = obj
 
         if orient is None:
             orient = self._default_orient
-
+
         self.orient = orient
         self.date_format = date_format
         self.double_precision = double_precision
@@ -64,15 +68,18 @@ def _format_to_date(self, data):
         if self._needs_to_date(data):
             return data.apply(lambda x: x.isoformat())
         return data
-
+
     def copy_if_needed(self):
         """ copy myself if necessary """
         if not self.is_copy:
             self.obj = self.obj.copy()
             self.is_copy = True
 
     def write(self):
-        return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii)
+        return dumps(self.obj, orient=self.orient,
+                     double_precision=self.double_precision,
+                     ensure_ascii=self.ensure_ascii)
+
 
 class SeriesWriter(Writer):
     _default_orient = 'index'
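For orientation, a hedged sketch of what write() hands back for a small Series, using the constructor rewrapped above; the exact strings come from the ujson-based dumps and may differ slightly by version:

```python
from pandas import Series

s = Series([1, 2], index=['a', 'b'])

w = SeriesWriter(s, orient=None, date_format='epoch',
                 double_precision=10, ensure_ascii=True)
print(w.write())   # default 'index' orient, e.g. '{"a":1,"b":2}'

w = SeriesWriter(s, orient='split', date_format='epoch',
                 double_precision=10, ensure_ascii=True)
print(w.write())   # e.g. '{"name":null,"index":["a","b"],"data":[1,2]}'
```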
@@ -186,13 +193,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     return obj
 
 class Parser(object):
-
+
     def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
         self.json = json
 
         if orient is None:
             orient = self._default_orient
-
+
         self.orient = orient
         self.dtype = dtype
 
@@ -207,7 +214,7 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr
 
     def parse(self):
 
-        # try numpy
+        # try numpy
         numpy = self.numpy
         if numpy:
             self._parse_numpy()
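The Parser subclasses are driven by read_json, whose signature appears in the hunk headers above. A hedged round-trip sketch, assuming read_json accepts a JSON string directly and that numpy=True selects the _parse_numpy branch shown here:

```python
from pandas import DataFrame

df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
s = to_json(None, df)

df2 = read_json(s, typ='frame', convert_axes=True)  # pure-Python parse
df3 = read_json(s, numpy=True)                      # numpy-backed parse
```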
@@ -269,7 +276,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
                 pass
 
         if data.dtype == 'float':
-
+
             # coerce floats to 64
             try:
                 data = data.astype('float64')
@@ -291,7 +298,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
 
         # coerce ints to 64
        if data.dtype == 'int':
-
+
             # coerce ints to 64
             try:
                 data = data.astype('int64')
@@ -322,7 +329,7 @@ def _try_convert_to_date(self, data):
         if issubclass(new_data.dtype.type, np.number):
             if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all():
                 return data, False
-
+
         try:
             new_data = to_datetime(new_data)
         except:
@@ -342,7 +349,7 @@ class SeriesParser(Parser):
     _default_orient = 'index'
 
     def _parse_no_numpy(self):
-
+
         json = self.json
         orient = self.orient
         if orient == "split":
@@ -446,3 +453,197 @@ def is_ok(col):
                 new_data, result = self._try_convert_to_date(self.obj[col])
                 if result:
                     self.obj[col] = new_data
+
+
+#----------------------------------------------------------------------
+# JSON normalization routines
+
+def nested_to_record(ds, prefix="", level=0):
+    """a simplified json_normalize
+
+    converts a nested dict into a flat dict ("record"); unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+
+    Returns
+    -------
+    d - dict or list of dicts, matching `ds`
+
+    Example:
+    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
+                                   nested=dict(e=dict(c=1, d=2), d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    for d in ds:
+        for k, v in d.items():  # modifying keys inside the loop, so not lazy
+            # each key gets renamed with prefix
+            if level == 0:
+                newkey = str(k)
+            else:
+                newkey = prefix + '.' + str(k)
+
+            # only dicts get recursively flattened
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = d.pop(k)
+                    d[newkey] = v
+                continue
+            else:
+                v = d.pop(k)
+                d.update(nested_to_record(v, newkey, level + 1))
+
+    if singleton:
+        return ds[0]
+    return ds
+
+
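To make the flattening and the singleton/list behavior concrete, a small sketch with invented values:

```python
# a single dict comes back as a single flat record
nested_to_record({'id': 1, 'geo': {'lat': 40.7, 'lon': -74.0}})
# {'id': 1, 'geo.lat': 40.7, 'geo.lon': -74.0}

# a list of dicts is flattened dict-by-dict (note: mutated in place)
nested_to_record([{'a': {'b': 1}}, {'a': {'b': 2}}])
# [{'a.b': 1}, {'a.b': 2}]
```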
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None):
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings)
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If not None, prefix record keys with the dotted path, e.g.
+        foo.bar.field if path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+
+    Examples
+    --------
+    data = [{'state': 'Florida',
+             'shortname': 'FL',
+             'info': {
+                  'governor': 'Rick Scott'
+             },
+             'counties': [{'name': 'Dade', 'population': 12345},
+                          {'name': 'Broward', 'population': 40000},
+                          {'name': 'Palm Beach', 'population': 60000}]},
+            {'state': 'Ohio',
+             'shortname': 'OH',
+             'info': {
+                  'governor': 'John Kasich'
+             },
+             'counties': [{'name': 'Summit', 'population': 1234},
+                          {'name': 'Cuyahoga', 'population': 1337}]}]
+
+    result = json_normalize(data, 'counties', ['state', 'shortname',
+                                               ['info', 'governor']])
+
+    state      governor
+    Florida    Rick Scott
+
+    Returns
+    -------
+    frame : DataFrame
+    """
+    def _pull_field(js, spec):
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any([isinstance(x, dict) for x in data[0].itervalues()]):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error
+            #       reasonably
+            data = nested_to_record(data)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in meta_vals.iteritems():
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
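And a hedged end-to-end sketch of the docstring's example (column order is not guaranteed, since the metadata columns are appended from a dict):

```python
data = [{'state': 'Florida', 'shortname': 'FL',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000}]},
        {'state': 'Ohio', 'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234}]}]

result = json_normalize(data, 'counties',
                        ['state', 'shortname', ['info', 'governor']])
# one row per county; the meta values are repeat()-ed to match the
# per-object record counts [2, 1]:
#       name  population    state shortname info.governor
# 0     Dade       12345  Florida        FL    Rick Scott
# 1  Broward       40000  Florida        FL    Rick Scott
# 2   Summit        1234     Ohio        OH   John Kasich
```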