Merge pull request #4593 from Komnomnomnom/ujson-slash-happy

jreback · jreback · commit 36443700675a · 2013-08-20T19:01:54.000-07:00
BUG: ujson labels are encoded twice
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -212,6 +212,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
   - In ``to_json``, fix date handling so milliseconds are the default timestamp
     as the docstring says (:issue:`4362`). 
   - JSON NaT handling fixed, NaTs are now serialised to `null` (:issue:`4498`)
+  - Fixed JSON handling of escapable characters in JSON object keys (:issue:`4593`)
   - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
   - Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed
     dtypes, surfaced in (:issue:`4377`)
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
@@ -14,7 +14,7 @@
 
 from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
                                  assert_series_equal, network,
-                                 ensure_clean)
+                                 ensure_clean, assert_index_equal)
 import pandas.util.testing as tm
 from numpy.testing.decorators import slow
 
@@ -53,6 +53,21 @@ def setUp(self):
         self.tsframe = _tsframe.copy()
         self.mixed_frame = _mixed_frame.copy()
 
+    def test_frame_double_encoded_labels(self):
+        # Round-trip a frame through to_json/read_json when its row and
+        # column labels contain a double quote, a forward slash and a
+        # backslash ('a \\ b' is the three-token label: a, backslash, b).
+        df = DataFrame([['a', 'b'], ['c', 'd']],
+                       index=['index " 1', 'index / 2'],
+                       columns=['a \\ b', 'y / z'])
+
+        # For these three orients the whole frame (labels and values) must
+        # come back equal to the original.
+        assert_frame_equal(
+            df, read_json(df.to_json(orient='split'), orient='split'))
+        assert_frame_equal(
+            df, read_json(df.to_json(orient='columns'), orient='columns'))
+        assert_frame_equal(
+            df, read_json(df.to_json(orient='index'), orient='index'))
+        # For orient='records' only the column labels and the values are
+        # compared; the index is deliberately left out of the check.
+        # NOTE(review): presumably because the records layout does not carry
+        # the row index -- confirm against the to_json documentation.
+        df_unser = read_json(df.to_json(orient='records'), orient='records')
+        assert_index_equal(df.columns, df_unser.columns)
+        np.testing.assert_equal(df.values, df_unser.values)
+
     def test_frame_non_unique_index(self):
         df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
                        columns=['x', 'y'])
diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c
@@ -488,6 +488,7 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc)
 
 char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
 {
+  JSONObjectEncoder* enc = (JSONObjectEncoder*) tc->encoder;
   NpyArrContext* npyarr;
   npy_intp idx;
   PRINTMARK();
@@ -496,13 +497,19 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
   {
     idx = npyarr->index[npyarr->stridedim] - 1;
     *outLen = strlen(npyarr->columnLabels[idx]);
-    return npyarr->columnLabels[idx];
+    memcpy(enc->offset, npyarr->columnLabels[idx], sizeof(char)*(*outLen));
+    enc->offset += *outLen;
+    *outLen = 0;
+    return NULL;
   }
   else
   {
     idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
     *outLen = strlen(npyarr->rowLabels[idx]);
-    return npyarr->rowLabels[idx];
+    memcpy(enc->offset, npyarr->rowLabels[idx], sizeof(char)*(*outLen));
+    enc->offset += *outLen;
+    *outLen = 0;
+    return NULL;
   }
 }
 
@@ -1064,7 +1071,7 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in
     // NOTE this function steals a reference to labels.
     PyArrayObject* labelsTmp = NULL;
     PyObject* item = NULL;
-    npy_intp i, stride, len;
+    npy_intp i, stride, len, need_quotes;
     char** ret;
     char *dataptr, *cLabel, *origend, *origst, *origoffset;
     char labelBuffer[NPY_JSON_BUFSIZE];
@@ -1117,15 +1124,8 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in
             break;
         }
 
-        // trim off any quotes surrounding the result
-        if (*cLabel == '\"')
-        {
-            cLabel++;
-            enc->offset -= 2;
-            *(enc->offset) = '\0';
-        }
-
-        len = enc->offset - cLabel + 1;
+        need_quotes = ((*cLabel) != '"');
+        len = enc->offset - cLabel + 1 + 2 * need_quotes;
         ret[i] = PyObject_Malloc(sizeof(char)*len);
 
         if (!ret[i])
@@ -1135,7 +1135,18 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in
             break;
         }
 
-        memcpy(ret[i], cLabel, sizeof(char)*len);
+        if (need_quotes)
+        {
+          ret[i][0] = '"';
+          memcpy(ret[i]+1, cLabel, sizeof(char)*(len-4));
+          ret[i][len-3] = '"';
+        }
+        else
+        {
+          memcpy(ret[i], cLabel, sizeof(char)*(len-2));
+        }
+        ret[i][len-2] = ':';
+        ret[i][len-1] = '\0';
         dataptr += stride;
     }