From c293fd7bd0cfe56bf20fbfe3130b2cdf3e377b45 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Tue, 9 Jun 2015 20:34:02 -0400
Subject: [PATCH] BUG: Bug in to_json with certain orients and a
 CategoricalIndex would segfault #10307

---
 doc/source/whatsnew/v0.16.2.txt          |  2 +-
 pandas/io/json.py                        | 46 +++++++++++--
 pandas/io/tests/test_json/test_pandas.py | 82 ++++++++++++++++--------
 3 files changed, 96 insertions(+), 34 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
index feccc19d8f70b..2c954f33e26b7 100644
--- a/doc/source/whatsnew/v0.16.2.txt
+++ b/doc/source/whatsnew/v0.16.2.txt
@@ -120,7 +120,7 @@ Bug Fixes
 - Bug where read_hdf store.select modifies the passed columns list when
   multi-indexed (:issue:`7212`)
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
-
+- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10307`)
 - Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)
 
 - Bug in ``DataFrame.quantile`` on checking that a valid axis was passed (:issue:`9543`)
diff --git a/pandas/io/json.py b/pandas/io/json.py
index 0659e34c3f27b..4291c4544a074 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -11,7 +11,7 @@
 from pandas import compat, isnull
 from pandas import Series, DataFrame, to_datetime
 from pandas.io.common import get_filepath_or_buffer
-from pandas.core.common import AbstractMethodError
+from pandas.core.common import AbstractMethodError, is_categorical_dtype
 import pandas.core.common as com
 
 loads = _json.loads
@@ -60,11 +60,32 @@ def __init__(self, obj, orient, date_format, double_precision,
         self.ensure_ascii = ensure_ascii
         self.date_unit = date_unit
         self.default_handler = default_handler
+        self._coerce_axes()
+        self._coerce_data()
 
-        self.is_copy = None
-        self._format_axes()
+    def _coerce_axes(self):
+        for i in range(self.obj._AXIS_LEN):
+            self._coerce_axis(i)
 
-    def _format_axes(self):
+    def _coerce_axis(self, axis):
+        """
+        Parameters
+        ----------
+        axis : axis number
+
+        If the axis needs coercion, copy self.obj and replace
+        that axis on the copy (the caller's object is untouched).
+
+        """
+
+        # GH 10307
+        # coerce CategoricalIndexes to Index dtypes
+        ax = self.obj._get_axis(axis)
+        if is_categorical_dtype(ax):
+            self.obj = self.obj.copy()
+            self.obj.set_axis(axis, np.array(ax))
+
+    def _coerce_data(self):
         raise AbstractMethodError(self)
 
     def write(self):
@@ -81,16 +102,20 @@ def write(self):
 class SeriesWriter(Writer):
     _default_orient = 'index'
 
-    def _format_axes(self):
+    def _coerce_axes(self):
         if not self.obj.index.is_unique and self.orient == 'index':
             raise ValueError("Series index must be unique for orient="
                              "'%s'" % self.orient)
+        super(SeriesWriter, self)._coerce_axes()
 
+    def _coerce_data(self):
+        if is_categorical_dtype(self.obj):
+            self.obj = np.array(self.obj)
 
 class FrameWriter(Writer):
     _default_orient = 'columns'
 
-    def _format_axes(self):
+    def _coerce_axes(self):
         """ try to axes if they are datelike """
         if not self.obj.index.is_unique and self.orient in (
                 'index', 'columns'):
@@ -100,7 +125,16 @@ def _format_axes(self):
                 'index', 'columns', 'records'):
             raise ValueError("DataFrame columns must be unique for orient="
                              "'%s'." % self.orient)
+        super(FrameWriter, self)._coerce_axes()
+
+    def _coerce_data(self):
 
+        is_copy = False
+        for c, col in self.obj.iteritems():
+            if is_categorical_dtype(col):
+                if not is_copy:
+                    is_copy, self.obj = True, self.obj.copy()
+                self.obj[c] = np.array(col)
 
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index be9e0eccda8a1..bb0ad58a47d88 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -4,7 +4,7 @@
 import os
 
 import numpy as np
-from pandas import Series, DataFrame, DatetimeIndex, Timestamp
+from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex
 from datetime import timedelta
 import pandas as pd
 read_json = pd.read_json
@@ -23,6 +23,11 @@
                            for k, v in compat.iteritems(_seriesd)))
 
 _tsframe = DataFrame(_tsd)
+_cat_frame = _frame.copy()
+cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15)
+_cat_frame.index = pd.CategoricalIndex(cat,name='E')
+_cat_frame['E'] = list(reversed(cat))
+_cat_frame['sort'] = np.arange(len(_cat_frame))
 
 _mixed_frame = _frame.copy()
 
@@ -48,6 +53,7 @@ def setUp(self):
         self.intframe = _intframe.copy()
         self.tsframe = _tsframe.copy()
         self.mixed_frame = _mixed_frame.copy()
+        self.categorical = _cat_frame.copy()
 
     def tearDown(self):
         del self.dirpath
@@ -128,8 +134,22 @@ def _check(df):
 
     def test_frame_from_json_to_json(self):
         def _check_orient(df, orient, dtype=None, numpy=False,
-                          convert_axes=True, check_dtype=True, raise_ok=None):
-            df = df.sort()
+                          convert_axes=True, check_dtype=True, raise_ok=None,
+                          sort=None):
+            if sort is not None:
+                df = df.sort(sort)
+            else:
+                df = df.sort()
+
+            # if we are not unique, then check that we are raising ValueError
+            # for the appropriate orients
+            if not df.index.is_unique and orient in ['index','columns']:
+                self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
+                return
+            if not df.columns.is_unique and orient in ['index','columns','records']:
+                self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
+                return
+
             dfjson = df.to_json(orient=orient)
 
             try:
@@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False,
                         return
                     raise
 
-            unser = unser.sort()
+            if sort is not None and sort in unser.columns:
+                unser = unser.sort(sort)
+            else:
+                unser = unser.sort()
 
             if dtype is False:
                 check_dtype=False
@@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False,
                 # index and col labels might not be strings
                 unser.index = [str(i) for i in unser.index]
                 unser.columns = [str(i) for i in unser.columns]
-                unser = unser.sort()
+
+                if sort is None:
+                    unser = unser.sort()
                 assert_almost_equal(df.values, unser.values)
             else:
                 if convert_axes:
@@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False,
                     assert_frame_equal(df, unser, check_less_precise=False,
                                        check_dtype=check_dtype)
 
-        def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
+        def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None):
 
             # numpy=False
             if convert_axes:
-                _check_orient(df, "columns", dtype=dtype)
-                _check_orient(df, "records", dtype=dtype)
-                _check_orient(df, "split", dtype=dtype)
-                _check_orient(df, "index", dtype=dtype)
-                _check_orient(df, "values", dtype=dtype)
-
-            _check_orient(df, "columns", dtype=dtype, convert_axes=False)
-            _check_orient(df, "records", dtype=dtype, convert_axes=False)
-            _check_orient(df, "split", dtype=dtype, convert_axes=False)
-            _check_orient(df, "index", dtype=dtype, convert_axes=False)
-            _check_orient(df, "values", dtype=dtype ,convert_axes=False)
+                _check_orient(df, "columns", dtype=dtype, sort=sort)
+                _check_orient(df, "records", dtype=dtype, sort=sort)
+                _check_orient(df, "split", dtype=dtype, sort=sort)
+                _check_orient(df, "index", dtype=dtype, sort=sort)
+                _check_orient(df, "values", dtype=dtype, sort=sort)
+
+            _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
+            _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
+            _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
+            _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
+            _check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort)
 
             # numpy=True and raise_ok might be not None, so ignore the error
             if convert_axes:
                 _check_orient(df, "columns", dtype=dtype, numpy=True,
-                              raise_ok=raise_ok)
+                              raise_ok=raise_ok, sort=sort)
                 _check_orient(df, "records", dtype=dtype, numpy=True,
-                              raise_ok=raise_ok)
+                              raise_ok=raise_ok, sort=sort)
                 _check_orient(df, "split", dtype=dtype, numpy=True,
-                              raise_ok=raise_ok)
+                              raise_ok=raise_ok, sort=sort)
                 _check_orient(df, "index", dtype=dtype, numpy=True,
-                              raise_ok=raise_ok)
+                              raise_ok=raise_ok, sort=sort)
                 _check_orient(df, "values", dtype=dtype, numpy=True,
-                              raise_ok=raise_ok)
+                              raise_ok=raise_ok, sort=sort)
 
             _check_orient(df, "columns", dtype=dtype, numpy=True,
-                          convert_axes=False, raise_ok=raise_ok)
+                          convert_axes=False, raise_ok=raise_ok, sort=sort)
             _check_orient(df, "records", dtype=dtype, numpy=True,
-                          convert_axes=False, raise_ok=raise_ok)
+                          convert_axes=False, raise_ok=raise_ok, sort=sort)
             _check_orient(df, "split", dtype=dtype, numpy=True,
-                          convert_axes=False, raise_ok=raise_ok)
+                          convert_axes=False, raise_ok=raise_ok, sort=sort)
             _check_orient(df, "index", dtype=dtype, numpy=True,
-                          convert_axes=False, raise_ok=raise_ok)
+                          convert_axes=False, raise_ok=raise_ok, sort=sort)
             _check_orient(df, "values", dtype=dtype, numpy=True,
-                          convert_axes=False, raise_ok=raise_ok)
+                          convert_axes=False, raise_ok=raise_ok, sort=sort)
 
         # basic
         _check_all_orients(self.frame)
@@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
         _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
                            convert_axes=False, raise_ok=ValueError)
 
+        # categorical
+        _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)
+
         # empty
         _check_all_orients(self.empty_frame)