@@ -1,6 +1,7 @@
 # pylint: disable-msg=E1101,W0613,W0603
-import os
 
+import os
+from collections import defaultdict
 import numpy as np
 
 import pandas.json as _json
@@ -15,7 +16,6 @@
 dumps = _json.dumps
 ### interface to/from ###
 
-
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms'):
 
@@ -71,7 +71,6 @@ def write(self):
                      date_unit=self.date_unit,
                      iso_dates=self.date_format == 'iso')
 
-
 class SeriesWriter(Writer):
     _default_orient = 'index'
 
@@ -537,3 +536,197 @@ def is_ok(col):
             lambda col, c: self._try_convert_to_date(c),
             lambda col, c: ((self.keep_default_dates and is_ok(col))
                             or col in convert_dates))
+
+
+#----------------------------------------------------------------------
+# JSON normalization routines
+
+def nested_to_record(ds, prefix="", level=0):
+    """a simplified json_normalize
+
+    converts a nested dict into a flat dict ("record"); unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+
+    Returns
+    -------
+    d : dict or list of dicts, matching `ds`
+
+    Example:
+    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    for d in ds:
+        for k, v in d.items():  # modifying keys inside the loop, so don't iterate lazily
+            # each key gets renamed with prefix
+            if level == 0:
+                newkey = str(k)
+            else:
+                newkey = prefix + '.' + str(k)
+
+            # only dicts get recursively flattened
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = d.pop(k)
+                    d[newkey] = v
+                continue
+            else:
+                v = d.pop(k)
+                d.update(nested_to_record(v, newkey, level + 1))
+
+    if singleton:
+        return ds[0]
+    return ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None):
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings)
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If not None, prefix the record keys with this string, e.g. a dotted
+        path such as 'foo.bar.' if the path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+
+    Examples
+    --------
+    data = [{'state': 'Florida',
+             'shortname': 'FL',
+             'info': {
+                 'governor': 'Rick Scott'
+             },
+             'counties': [{'name': 'Dade', 'population': 12345},
+                          {'name': 'Broward', 'population': 40000},
+                          {'name': 'Palm Beach', 'population': 60000}]},
+            {'state': 'Ohio',
+             'shortname': 'OH',
+             'info': {
+                 'governor': 'John Kasich'
+             },
+             'counties': [{'name': 'Summit', 'population': 1234},
+                          {'name': 'Cuyahoga', 'population': 1337}]}]
+
+    result = json_normalize(data, 'counties', ['state', 'shortname',
+                                               ['info', 'governor']])
+
+    state governor
+    Florida Rick Scott
+
+
+    Returns
+    -------
+    frame : DataFrame
+    """
+    def _pull_field(js, spec):
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any([isinstance(x, dict) for x in data[0].itervalues()]):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error reasonably
+            data = nested_to_record(data)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in meta_vals.iteritems():
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix ' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
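
For reference, here is a minimal usage sketch of the two routines added in this commit, exercising a trimmed version of the docstring's own example data. It is an illustration only, not part of the diff; the import line assumes these functions are exposed from pandas.io.json, the module shown above, and the printed results follow from the code as written.

from pandas.io.json import json_normalize, nested_to_record

# flatten a nested dict into a flat "record" with dotted keys
nested_to_record({'flat1': 1, 'dict1': {'c': 1, 'd': 2}})
# -> {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2}

data = [{'state': 'Florida', 'shortname': 'FL',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000}]},
        {'state': 'Ohio', 'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234}]}]

# one row per county record; the requested metadata fields are repeated
# alongside each record, giving columns: name, population, state,
# shortname, info.governor
result = json_normalize(data, 'counties',
                        ['state', 'shortname', ['info', 'governor']])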