|
1 | 1 | # pylint: disable-msg=E1101,W0613,W0603
2 | 2 |
3 | 3 | import os
4 |   | -import copy
5 |   | -from collections import defaultdict
6 | 4 | import numpy as np
7 | 5 |
8 | 6 | import pandas.json as _json

13 | 11 | from pandas.io.common import get_filepath_or_buffer, _get_handle
14 | 12 | from pandas.core.common import AbstractMethodError
15 | 13 | from pandas.formats.printing import pprint_thing
   | 14 | +from .normalize import _convert_to_line_delimits
16 | 15 |
17 | 16 | loads = _json.loads
18 | 17 | dumps = _json.dumps

@@ -641,246 +640,3 @@ def is_ok(col):
641 | 640 |             lambda col, c: self._try_convert_to_date(c),
642 | 641 |             lambda col, c: ((self.keep_default_dates and is_ok(col)) or
643 | 642 |                             col in convert_dates))
644 |     | -
645 |     | -# ---------------------------------------------------------------------
646 |     | -# JSON normalization routines
647 |     | -
648 |     | -
649 |     | -def _convert_to_line_delimits(s):
650 |     | -    """Helper function that converts json lists to line delimited json."""
651 |     | -
652 |     | -    # Determine we have a JSON list to turn to lines otherwise just return the
653 |     | -    # json object, only lists can
654 |     | -    if not s[0] == '[' and s[-1] == ']':
655 |     | -        return s
656 |     | -    s = s[1:-1]
657 |     | -
658 |     | -    from pandas.lib import convert_json_to_lines
659 |     | -    return convert_json_to_lines(s)
660 |     | -
661 |     | -
662 |     | -def nested_to_record(ds, prefix="", level=0):
663 |     | -    """a simplified json_normalize
664 |     | -
665 |     | -    converts a nested dict into a flat dict ("record"), unlike json_normalize,
666 |     | -    it does not attempt to extract a subset of the data.
667 |     | -
668 |     | -    Parameters
669 |     | -    ----------
670 |     | -    ds : dict or list of dicts
671 |     | -    prefix: the prefix, optional, default: ""
672 |     | -    level: the number of levels in the jason string, optional, default: 0
673 |     | -
674 |     | -    Returns
675 |     | -    -------
676 |     | -    d - dict or list of dicts, matching `ds`
677 |     | -
678 |     | -    Examples
679 |     | -    --------
680 |     | -
681 |     | -    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
682 |     | -                             nested=dict(e=dict(c=1,d=2),d=2)))
683 |     | -    Out[52]:
684 |     | -    {'dict1.c': 1,
685 |     | -     'dict1.d': 2,
686 |     | -     'flat1': 1,
687 |     | -     'nested.d': 2,
688 |     | -     'nested.e.c': 1,
689 |     | -     'nested.e.d': 2}
690 |     | -    """
691 |     | -    singleton = False
692 |     | -    if isinstance(ds, dict):
693 |     | -        ds = [ds]
694 |     | -        singleton = True
695 |     | -
696 |     | -    new_ds = []
697 |     | -    for d in ds:
698 |     | -
699 |     | -        new_d = copy.deepcopy(d)
700 |     | -        for k, v in d.items():
701 |     | -            # each key gets renamed with prefix
702 |     | -            if not isinstance(k, compat.string_types):
703 |     | -                k = str(k)
704 |     | -            if level == 0:
705 |     | -                newkey = k
706 |     | -            else:
707 |     | -                newkey = prefix + '.' + k
708 |     | -
709 |     | -            # only dicts gets recurse-flattend
710 |     | -            # only at level>1 do we rename the rest of the keys
711 |     | -            if not isinstance(v, dict):
712 |     | -                if level != 0:  # so we skip copying for top level, common case
713 |     | -                    v = new_d.pop(k)
714 |     | -                    new_d[newkey] = v
715 |     | -                continue
716 |     | -            else:
717 |     | -                v = new_d.pop(k)
718 |     | -                new_d.update(nested_to_record(v, newkey, level + 1))
719 |     | -        new_ds.append(new_d)
720 |     | -
721 |     | -    if singleton:
722 |     | -        return new_ds[0]
723 |     | -    return new_ds
724 |     | -
725 |     | -
726 |     | -def json_normalize(data, record_path=None, meta=None,
727 |     | -                   meta_prefix=None,
728 |     | -                   record_prefix=None,
729 |     | -                   errors='raise'):
730 |     | -
731 |     | -    """
732 |     | -    "Normalize" semi-structured JSON data into a flat table
733 |     | -
734 |     | -    Parameters
735 |     | -    ----------
736 |     | -    data : dict or list of dicts
737 |     | -        Unserialized JSON objects
738 |     | -    record_path : string or list of strings, default None
739 |     | -        Path in each object to list of records. If not passed, data will be
740 |     | -        assumed to be an array of records
741 |     | -    meta : list of paths (string or list of strings), default None
742 |     | -        Fields to use as metadata for each record in resulting table
743 |     | -    record_prefix : string, default None
744 |     | -        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
745 |     | -        path to records is ['foo', 'bar']
746 |     | -    meta_prefix : string, default None
747 |     | -    errors : {'raise', 'ignore'}, default 'raise'
748 |     | -
749 |     | -        * ignore : will ignore KeyError if keys listed in meta are not
750 |     | -          always present
751 |     | -        * raise : will raise KeyError if keys listed in meta are not
752 |     | -          always present
753 |     | -
754 |     | -        .. versionadded:: 0.20.0
755 |     | -
756 |     | -    Returns
757 |     | -    -------
758 |     | -    frame : DataFrame
759 |     | -
760 |     | -    Examples
761 |     | -    --------
762 |     | -
763 |     | -    >>> data = [{'state': 'Florida',
764 |     | -    ...          'shortname': 'FL',
765 |     | -    ...          'info': {
766 |     | -    ...               'governor': 'Rick Scott'
767 |     | -    ...          },
768 |     | -    ...          'counties': [{'name': 'Dade', 'population': 12345},
769 |     | -    ...                      {'name': 'Broward', 'population': 40000},
770 |     | -    ...                      {'name': 'Palm Beach', 'population': 60000}]},
771 |     | -    ...         {'state': 'Ohio',
772 |     | -    ...          'shortname': 'OH',
773 |     | -    ...          'info': {
774 |     | -    ...               'governor': 'John Kasich'
775 |     | -    ...          },
776 |     | -    ...          'counties': [{'name': 'Summit', 'population': 1234},
777 |     | -    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
778 |     | -    >>> from pandas.io.json import json_normalize
779 |     | -    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
780 |     | -    ...                                           ['info', 'governor']])
781 |     | -    >>> result
782 |     | -             name  population info.governor    state shortname
783 |     | -    0        Dade       12345    Rick Scott  Florida        FL
784 |     | -    1     Broward       40000    Rick Scott  Florida        FL
785 |     | -    2  Palm Beach       60000    Rick Scott  Florida        FL
786 |     | -    3      Summit        1234   John Kasich     Ohio        OH
787 |     | -    4    Cuyahoga        1337   John Kasich     Ohio        OH
788 |     | -
789 |     | -    """
790 |     | -    def _pull_field(js, spec):
791 |     | -        result = js
792 |     | -        if isinstance(spec, list):
793 |     | -            for field in spec:
794 |     | -                result = result[field]
795 |     | -        else:
796 |     | -            result = result[spec]
797 |     | -
798 |     | -        return result
799 |     | -
800 |     | -    # A bit of a hackjob
801 |     | -    if isinstance(data, dict):
802 |     | -        data = [data]
803 |     | -
804 |     | -    if record_path is None:
805 |     | -        if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
806 |     | -            # naive normalization, this is idempotent for flat records
807 |     | -            # and potentially will inflate the data considerably for
808 |     | -            # deeply nested structures:
809 |     | -            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
810 |     | -            #
811 |     | -            # TODO: handle record value which are lists, at least error
812 |     | -            #       reasonably
813 |     | -            data = nested_to_record(data)
814 |     | -        return DataFrame(data)
815 |     | -    elif not isinstance(record_path, list):
816 |     | -        record_path = [record_path]
817 |     | -
818 |     | -    if meta is None:
819 |     | -        meta = []
820 |     | -    elif not isinstance(meta, list):
821 |     | -        meta = [meta]
822 |     | -
823 |     | -    for i, x in enumerate(meta):
824 |     | -        if not isinstance(x, list):
825 |     | -            meta[i] = [x]
826 |     | -
827 |     | -    # Disastrously inefficient for now
828 |     | -    records = []
829 |     | -    lengths = []
830 |     | -
831 |     | -    meta_vals = defaultdict(list)
832 |     | -    meta_keys = ['.'.join(val) for val in meta]
833 |     | -
834 |     | -    def _recursive_extract(data, path, seen_meta, level=0):
835 |     | -        if len(path) > 1:
836 |     | -            for obj in data:
837 |     | -                for val, key in zip(meta, meta_keys):
838 |     | -                    if level + 1 == len(val):
839 |     | -                        seen_meta[key] = _pull_field(obj, val[-1])
840 |     | -
841 |     | -                _recursive_extract(obj[path[0]], path[1:],
842 |     | -                                   seen_meta, level=level + 1)
843 |     | -        else:
844 |     | -            for obj in data:
845 |     | -                recs = _pull_field(obj, path[0])
846 |     | -
847 |     | -                # For repeating the metadata later
848 |     | -                lengths.append(len(recs))
849 |     | -
850 |     | -                for val, key in zip(meta, meta_keys):
851 |     | -                    if level + 1 > len(val):
852 |     | -                        meta_val = seen_meta[key]
853 |     | -                    else:
854 |     | -                        try:
855 |     | -                            meta_val = _pull_field(obj, val[level:])
856 |     | -                        except KeyError as e:
857 |     | -                            if errors == 'ignore':
858 |     | -                                meta_val = np.nan
859 |     | -                            else:
860 |     | -                                raise \
861 |     | -                                    KeyError("Try running with "
862 |     | -                                             "errors='ignore' as key "
863 |     | -                                             "%s is not always present", e)
864 |     | -                    meta_vals[key].append(meta_val)
865 |     | -
866 |     | -                records.extend(recs)
867 |     | -
868 |     | -    _recursive_extract(data, record_path, {}, level=0)
869 |     | -
870 |     | -    result = DataFrame(records)
871 |     | -
872 |     | -    if record_prefix is not None:
873 |     | -        result.rename(columns=lambda x: record_prefix + x, inplace=True)
874 |     | -
875 |     | -    # Data types, a problem
876 |     | -    for k, v in compat.iteritems(meta_vals):
877 |     | -        if meta_prefix is not None:
878 |     | -            k = meta_prefix + k
879 |     | -
880 |     | -        if k in result:
881 |     | -            raise ValueError('Conflicting metadata name %s, '
882 |     | -                             'need distinguishing prefix ' % k)
883 |     | -
884 |     | -        result[k] = np.array(v).repeat(lengths)
885 |     | -
886 |     | -    return result
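The `_convert_to_line_delimits` helper removed above (and now imported from `.normalize`) turns a serialized JSON list into line-delimited JSON: it strips the outer brackets and defers the actual splitting to the C routine `pandas.lib.convert_json_to_lines`. A minimal pure-Python sketch of the same transformation, using a hypothetical `to_line_delimited` name and a full `json.loads` round-trip instead of the bracket-stripping fast path:

    import json

    def to_line_delimited(s):
        # Illustration only, not the pandas implementation: parse the JSON
        # array and re-serialize one record per line.
        if not (s.startswith('[') and s.endswith(']')):
            return s  # not a JSON list; nothing to split
        return '\n'.join(json.dumps(rec) for rec in json.loads(s))

    print(to_line_delimited('[{"a": 1}, {"a": 2}]'))
    # {"a": 1}
    # {"a": 2}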
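`nested_to_record`, also deleted here, flattens nested dicts by joining key paths with dots, as its docstring example shows. A minimal sketch of that flattening under an assumed helper name `flatten` (it omits the `level` bookkeeping and the deep copies of the real function):

    def flatten(d, prefix=''):
        # Join nested keys with '.'; copy non-dict values through unchanged.
        out = {}
        for k, v in d.items():
            key = str(k) if not prefix else prefix + '.' + str(k)
            if isinstance(v, dict):
                out.update(flatten(v, key))
            else:
                out[key] = v
        return out

    flatten({'flat1': 1, 'dict1': {'c': 1, 'd': 2},
             'nested': {'e': {'c': 1, 'd': 2}, 'd': 2}})
    # {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2,
    #  'nested.e.c': 1, 'nested.e.d': 2, 'nested.d': 2}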