BUG: Fix #13213 json_normalize() and non-ascii characters in keys

Felix Marczinowski · jreback · commit eeccd058a519 · 2016-05-19T09:14:22.000-04:00
closes #13213 Author: Felix Marczinowski <felix.marczinowski@blue-yonder.com> Closes #13214 from fmarczin/13213-unicode-json_normalize and squashes the following commits: 22e01b2 [Felix Marczinowski] fix linter warnings 44745ca [Felix Marczinowski] fix tests for py3 25fd0f8 [Felix Marczinowski] move test, fix py3 issue 7a38110 [Felix Marczinowski] add whatsnew note dd7302c [Felix Marczinowski] remove encoding signature from test 4dcd2c5 [Felix Marczinowski] fix for #13213 b9751e9 [Felix Marczinowski] add test for #13213
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -113,6 +113,7 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~
 
+- Bug in ``io.json.json_normalize()``, where non-ASCII keys raised an exception (:issue:`13213`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
 - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -614,10 +614,12 @@ def nested_to_record(ds, prefix="", level=0):
         new_d = copy.deepcopy(d)
         for k, v in d.items():
             # each key gets renamed with prefix
+            if not isinstance(k, compat.string_types):
+                k = str(k)
             if level == 0:
-                newkey = str(k)
+                newkey = k
             else:
-                newkey = prefix + '.' + str(k)
+                newkey = prefix + '.' + k
 
             # only dicts gets recurse-flattend
             # only at level>1 do we rename the rest of the keys
diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
@@ -2,8 +2,10 @@
 
 from pandas import DataFrame
 import numpy as np
+import json
 
 import pandas.util.testing as tm
+from pandas import compat
 
 from pandas.io.json import json_normalize, nested_to_record
 
@@ -164,6 +166,26 @@ def test_record_prefix(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_non_ascii_key(self):
+        if compat.PY3:
+            testjson = (
+                b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
+                b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
+            ).decode('utf8')
+        else:
+            testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
+                        '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
+
+        testdata = {
+            u'sub.A': [1, 3],
+            u'sub.B': [2, 4],
+            b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
+        }
+        expected = DataFrame(testdata)
+
+        result = json_normalize(json.loads(testjson))
+        tm.assert_frame_equal(result, expected)
+
 
 class TestNestedToRecord(tm.TestCase):