Skip to content

Commit 34c6bd0

Browse files
committed
ENH: GH14883: json_normalize now takes a user-specified separator
closes #14883 Author: Jeff Reback <[email protected]> Author: John Owens <[email protected]> Closes #14950 from jowens/json_normalize-separator and squashes the following commits: 0327dd1 [Jeff Reback] compare sorted columns bc5aae8 [Jeff Reback] CLN: fixup json_normalize with sep 8edc40e [John Owens] ENH: json_normalize now takes a user-specified separator
1 parent ec84ae3 commit 34c6bd0

File tree

3 files changed

+114
-72
lines changed

3 files changed

+114
-72
lines changed

doc/source/whatsnew/v0.20.0.txt

+6-2
Original file line numberDiff line numberDiff line change
@@ -300,9 +300,9 @@ Other Enhancements
300300
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`)
301301
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
302302
- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)
303-
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
304303
- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`)
305304
- The ``.to_latex()`` method will now accept ``multicolumn`` and ``multirow`` arguments to use the accompanying LaTeX enhancements
305+
306306
- ``pd.merge_asof()`` gained the option ``direction='backward'|'forward'|'nearest'`` (:issue:`14887`)
307307
- ``Series/DataFrame.asfreq()`` have gained a ``fill_value`` parameter, to fill missing values (:issue:`3715`).
308308
- ``Series/DataFrame.resample.asfreq`` have gained a ``fill_value`` parameter, to fill missing values during resampling (:issue:`3715`).
@@ -313,11 +313,15 @@ Other Enhancements
313313

314314
- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
315315
- ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
316-
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
317316
- ``pd.DataFrame.to_latex`` and ``pd.DataFrame.to_string`` now allow optional header aliases. (:issue:`15536`)
318317
- Re-enable the ``parse_dates`` keyword of ``read_excel`` to parse string columns as dates (:issue:`14326`)
319318
- Added ``.empty`` property to subclasses of ``Index``. (:issue:`15270`)
320319

320+
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
321+
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
322+
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
323+
324+
321325
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
322326

323327

pandas/io/json/normalize.py

+27-10
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def _convert_to_line_delimits(s):
2121
return convert_json_to_lines(s)
2222

2323

24-
def nested_to_record(ds, prefix="", level=0):
24+
def nested_to_record(ds, prefix="", sep=".", level=0):
2525
"""a simplified json_normalize
2626
2727
converts a nested dict into a flat dict ("record"), unlike json_normalize,
@@ -31,6 +31,12 @@ def nested_to_record(ds, prefix="", level=0):
3131
----------
3232
ds : dict or list of dicts
3333
prefix: the prefix, optional, default: ""
34+
sep : string, default '.'
35+
Nested records will generate names separated by sep,
36+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
37+
38+
.. versionadded:: 0.20.0
39+
3440
level: the number of levels in the json string, optional, default: 0
3541
3642
Returns
@@ -66,7 +72,7 @@ def nested_to_record(ds, prefix="", level=0):
6672
if level == 0:
6773
newkey = k
6874
else:
69-
newkey = prefix + '.' + k
75+
newkey = prefix + sep + k
7076

7177
# only dicts gets recurse-flattend
7278
# only at level>1 do we rename the rest of the keys
@@ -77,7 +83,7 @@ def nested_to_record(ds, prefix="", level=0):
7783
continue
7884
else:
7985
v = new_d.pop(k)
80-
new_d.update(nested_to_record(v, newkey, level + 1))
86+
new_d.update(nested_to_record(v, newkey, sep, level + 1))
8187
new_ds.append(new_d)
8288

8389
if singleton:
@@ -88,7 +94,8 @@ def nested_to_record(ds, prefix="", level=0):
8894
def json_normalize(data, record_path=None, meta=None,
8995
meta_prefix=None,
9096
record_prefix=None,
91-
errors='raise'):
97+
errors='raise',
98+
sep='.'):
9299
"""
93100
"Normalize" semi-structured JSON data into a flat table
94101
@@ -106,13 +113,21 @@ def json_normalize(data, record_path=None, meta=None,
106113
path to records is ['foo', 'bar']
107114
meta_prefix : string, default None
108115
errors : {'raise', 'ignore'}, default 'raise'
109-
* 'ignore' : will ignore KeyError if keys listed in meta are not
110-
always present
111-
* 'raise' : will raise KeyError if keys listed in meta are not
112-
always present
116+
117+
* ignore : will ignore KeyError if keys listed in meta are not
118+
always present
119+
* raise : will raise KeyError if keys listed in meta are not
120+
always present
113121
114122
.. versionadded:: 0.20.0
115123
124+
sep : string, default '.'
125+
Nested records will generate names separated by sep,
126+
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
127+
128+
.. versionadded:: 0.20.0
129+
130+
116131
Returns
117132
-------
118133
frame : DataFrame
@@ -173,7 +188,7 @@ def _pull_field(js, spec):
173188
#
174189
# TODO: handle record value which are lists, at least error
175190
# reasonably
176-
data = nested_to_record(data)
191+
data = nested_to_record(data, sep=sep)
177192
return DataFrame(data)
178193
elif not isinstance(record_path, list):
179194
record_path = [record_path]
@@ -192,7 +207,9 @@ def _pull_field(js, spec):
192207
lengths = []
193208

194209
meta_vals = defaultdict(list)
195-
meta_keys = ['.'.join(val) for val in meta]
210+
if not isinstance(sep, compat.string_types):
211+
sep = str(sep)
212+
meta_keys = [sep.join(val) for val in meta]
196213

197214
def _recursive_extract(data, path, seen_meta, level=0):
198215
if len(path) > 1:

pandas/tests/io/json/test_normalize.py

+81-60
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,60 @@
1-
from pandas import DataFrame
1+
import pytest
22
import numpy as np
33
import json
44

55
import pandas.util.testing as tm
6-
from pandas import compat
6+
from pandas import compat, Index, DataFrame
77

88
from pandas.io.json import json_normalize
99
from pandas.io.json.normalize import nested_to_record
1010

1111

12-
def _assert_equal_data(left, right):
13-
if not left.columns.equals(right.columns):
14-
left = left.reindex(columns=right.columns)
12+
@pytest.fixture
13+
def deep_nested():
14+
# deeply nested data
15+
return [{'country': 'USA',
16+
'states': [{'name': 'California',
17+
'cities': [{'name': 'San Francisco',
18+
'pop': 12345},
19+
{'name': 'Los Angeles',
20+
'pop': 12346}]
21+
},
22+
{'name': 'Ohio',
23+
'cities': [{'name': 'Columbus',
24+
'pop': 1234},
25+
{'name': 'Cleveland',
26+
'pop': 1236}]}
27+
]
28+
},
29+
{'country': 'Germany',
30+
'states': [{'name': 'Bayern',
31+
'cities': [{'name': 'Munich', 'pop': 12347}]
32+
},
33+
{'name': 'Nordrhein-Westfalen',
34+
'cities': [{'name': 'Duesseldorf', 'pop': 1238},
35+
{'name': 'Koeln', 'pop': 1239}]}
36+
]
37+
}
38+
]
1539

16-
tm.assert_frame_equal(left, right)
1740

41+
@pytest.fixture
42+
def state_data():
43+
return [
44+
{'counties': [{'name': 'Dade', 'population': 12345},
45+
{'name': 'Broward', 'population': 40000},
46+
{'name': 'Palm Beach', 'population': 60000}],
47+
'info': {'governor': 'Rick Scott'},
48+
'shortname': 'FL',
49+
'state': 'Florida'},
50+
{'counties': [{'name': 'Summit', 'population': 1234},
51+
{'name': 'Cuyahoga', 'population': 1337}],
52+
'info': {'governor': 'John Kasich'},
53+
'shortname': 'OH',
54+
'state': 'Ohio'}]
1855

19-
class TestJSONNormalize(tm.TestCase):
2056

21-
def setUp(self):
22-
self.state_data = [
23-
{'counties': [{'name': 'Dade', 'population': 12345},
24-
{'name': 'Broward', 'population': 40000},
25-
{'name': 'Palm Beach', 'population': 60000}],
26-
'info': {'governor': 'Rick Scott'},
27-
'shortname': 'FL',
28-
'state': 'Florida'},
29-
{'counties': [{'name': 'Summit', 'population': 1234},
30-
{'name': 'Cuyahoga', 'population': 1337}],
31-
'info': {'governor': 'John Kasich'},
32-
'shortname': 'OH',
33-
'state': 'Ohio'}]
57+
class TestJSONNormalize(object):
3458

3559
def test_simple_records(self):
3660
recs = [{'a': 1, 'b': 2, 'c': 3},
@@ -43,21 +67,21 @@ def test_simple_records(self):
4367

4468
tm.assert_frame_equal(result, expected)
4569

46-
def test_simple_normalize(self):
47-
result = json_normalize(self.state_data[0], 'counties')
48-
expected = DataFrame(self.state_data[0]['counties'])
70+
def test_simple_normalize(self, state_data):
71+
result = json_normalize(state_data[0], 'counties')
72+
expected = DataFrame(state_data[0]['counties'])
4973
tm.assert_frame_equal(result, expected)
5074

51-
result = json_normalize(self.state_data, 'counties')
75+
result = json_normalize(state_data, 'counties')
5276

5377
expected = []
54-
for rec in self.state_data:
78+
for rec in state_data:
5579
expected.extend(rec['counties'])
5680
expected = DataFrame(expected)
5781

5882
tm.assert_frame_equal(result, expected)
5983

60-
result = json_normalize(self.state_data, 'counties', meta='state')
84+
result = json_normalize(state_data, 'counties', meta='state')
6185
expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
6286

6387
tm.assert_frame_equal(result, expected)
@@ -67,33 +91,30 @@ def test_empty_array(self):
6791
expected = DataFrame()
6892
tm.assert_frame_equal(result, expected)
6993

70-
def test_more_deeply_nested(self):
71-
data = [{'country': 'USA',
72-
'states': [{'name': 'California',
73-
'cities': [{'name': 'San Francisco',
74-
'pop': 12345},
75-
{'name': 'Los Angeles',
76-
'pop': 12346}]
77-
},
78-
{'name': 'Ohio',
79-
'cities': [{'name': 'Columbus',
80-
'pop': 1234},
81-
{'name': 'Cleveland',
82-
'pop': 1236}]}
83-
]
84-
},
85-
{'country': 'Germany',
86-
'states': [{'name': 'Bayern',
87-
'cities': [{'name': 'Munich', 'pop': 12347}]
88-
},
89-
{'name': 'Nordrhein-Westfalen',
90-
'cities': [{'name': 'Duesseldorf', 'pop': 1238},
91-
{'name': 'Koeln', 'pop': 1239}]}
92-
]
93-
}
94-
]
94+
def test_simple_normalize_with_separator(self, deep_nested):
95+
# GH 14883
96+
result = json_normalize({'A': {'A': 1, 'B': 2}})
97+
expected = DataFrame([[1, 2]], columns=['A.A', 'A.B'])
98+
tm.assert_frame_equal(result.reindex_like(expected), expected)
99+
100+
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_')
101+
expected = DataFrame([[1, 2]], columns=['A_A', 'A_B'])
102+
tm.assert_frame_equal(result.reindex_like(expected), expected)
103+
104+
result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3')
105+
expected = DataFrame([[1, 2]], columns=[u'A\u03c3A', u'A\u03c3B'])
106+
tm.assert_frame_equal(result.reindex_like(expected), expected)
107+
108+
result = json_normalize(deep_nested, ['states', 'cities'],
109+
meta=['country', ['states', 'name']],
110+
sep='_')
111+
expected = Index(['name', 'pop',
112+
'country', 'states_name']).sort_values()
113+
assert result.columns.sort_values().equals(expected)
114+
115+
def test_more_deeply_nested(self, deep_nested):
95116

96-
result = json_normalize(data, ['states', 'cities'],
117+
result = json_normalize(deep_nested, ['states', 'cities'],
97118
meta=['country', ['states', 'name']])
98119
# meta_prefix={'states': 'state_'})
99120

@@ -143,26 +164,26 @@ def test_meta_name_conflict(self):
143164
'data': [{'foo': 'something', 'bar': 'else'},
144165
{'foo': 'something2', 'bar': 'else2'}]}]
145166

146-
self.assertRaises(ValueError, json_normalize, data,
147-
'data', meta=['foo', 'bar'])
167+
with pytest.raises(ValueError):
168+
json_normalize(data, 'data', meta=['foo', 'bar'])
148169

149170
result = json_normalize(data, 'data', meta=['foo', 'bar'],
150171
meta_prefix='meta')
151172

152173
for val in ['metafoo', 'metabar', 'foo', 'bar']:
153-
self.assertTrue(val in result)
174+
assert val in result
154175

155-
def test_record_prefix(self):
156-
result = json_normalize(self.state_data[0], 'counties')
157-
expected = DataFrame(self.state_data[0]['counties'])
176+
def test_record_prefix(self, state_data):
177+
result = json_normalize(state_data[0], 'counties')
178+
expected = DataFrame(state_data[0]['counties'])
158179
tm.assert_frame_equal(result, expected)
159180

160-
result = json_normalize(self.state_data, 'counties',
181+
result = json_normalize(state_data, 'counties',
161182
meta='state',
162183
record_prefix='county_')
163184

164185
expected = []
165-
for rec in self.state_data:
186+
for rec in state_data:
166187
expected.extend(rec['counties'])
167188
expected = DataFrame(expected)
168189
expected = expected.rename(columns=lambda x: 'county_' + x)

0 commit comments

Comments
 (0)