From c293fd7bd0cfe56bf20fbfe3130b2cdf3e377b45 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 Jun 2015 20:34:02 -0400 Subject: [PATCH] BUG: Bug in to_json with certain orients and a CategoricalIndex would segfault #10307 --- doc/source/whatsnew/v0.16.2.txt | 2 +- pandas/io/json.py | 46 +++++++++++-- pandas/io/tests/test_json/test_pandas.py | 82 ++++++++++++++++-------- 3 files changed, 96 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index feccc19d8f70b..2c954f33e26b7 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -120,7 +120,7 @@ Bug Fixes - Bug where read_hdf store.select modifies the passed columns list when multi-indexed (:issue:`7212`) - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) - +- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10307`) - Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`) - Bug in ``DataFrame.quantile`` on checking that a valid axis was passed (:issue:`9543`) diff --git a/pandas/io/json.py b/pandas/io/json.py index 0659e34c3f27b..4291c4544a074 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -11,7 +11,7 @@ from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime from pandas.io.common import get_filepath_or_buffer -from pandas.core.common import AbstractMethodError +from pandas.core.common import AbstractMethodError, is_categorical_dtype import pandas.core.common as com loads = _json.loads @@ -60,11 +60,32 @@ def __init__(self, obj, orient, date_format, double_precision, self.ensure_ascii = ensure_ascii self.date_unit = date_unit self.default_handler = default_handler + self._coerce_axes() + self._coerce_data() - self.is_copy = None - self._format_axes() + def _coerce_axes(self): + for i in range(self.obj._AXIS_LEN): + self._coerce_axis(i) - def _format_axes(self): + def 
_coerce_axis(self, axis): + """ + Parameters + ---------- + axis : axis number + + if the axis needs coercion, then copy the .obj + and set the index + + """ + + # GH 10307 + # coerce CategoricalIndexes to Index dtypes + ax = self.obj._get_axis(axis) + if is_categorical_dtype(ax): + self.obj = self.obj.copy() + self.obj.set_axis(axis, np.array(ax)) + + def _coerce_data(self): raise AbstractMethodError(self) def write(self): @@ -81,16 +102,20 @@ def write(self): class SeriesWriter(Writer): _default_orient = 'index' - def _format_axes(self): + def _coerce_axes(self): if not self.obj.index.is_unique and self.orient == 'index': raise ValueError("Series index must be unique for orient=" "'%s'" % self.orient) + super(SeriesWriter, self)._coerce_axes() + def _coerce_data(self): + if is_categorical_dtype(self.obj): + self.obj = np.array(self.obj) class FrameWriter(Writer): _default_orient = 'columns' - def _format_axes(self): + def _coerce_axes(self): """ try to axes if they are datelike """ if not self.obj.index.is_unique and self.orient in ( 'index', 'columns'): @@ -100,7 +125,16 @@ def _format_axes(self): 'index', 'columns', 'records'): raise ValueError("DataFrame columns must be unique for orient=" "'%s'." 
% self.orient) + super(FrameWriter, self)._coerce_axes() + + def _coerce_data(self): + is_copy = False + for c, col in self.obj.iteritems(): + if is_categorical_dtype(col): + if not is_copy: + is_copy, self.obj = True, self.obj.copy() + self.obj[c] = np.array(col) def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index be9e0eccda8a1..bb0ad58a47d88 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -4,7 +4,7 @@ import os import numpy as np -from pandas import Series, DataFrame, DatetimeIndex, Timestamp +from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex from datetime import timedelta import pandas as pd read_json = pd.read_json @@ -23,6 +23,11 @@ for k, v in compat.iteritems(_seriesd))) _tsframe = DataFrame(_tsd) +_cat_frame = _frame.copy() +cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15) +_cat_frame.index = pd.CategoricalIndex(cat,name='E') +_cat_frame['E'] = list(reversed(cat)) +_cat_frame['sort'] = np.arange(len(_cat_frame)) _mixed_frame = _frame.copy() @@ -48,6 +53,7 @@ def setUp(self): self.intframe = _intframe.copy() self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() + self.categorical = _cat_frame.copy() def tearDown(self): del self.dirpath @@ -128,8 +134,22 @@ def _check(df): def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=False, - convert_axes=True, check_dtype=True, raise_ok=None): - df = df.sort() + convert_axes=True, check_dtype=True, raise_ok=None, + sort=None): + if sort is not None: + df = df.sort(sort) + else: + df = df.sort() + + # if we are not unique, then check that we are raising ValueError + # for the appropriate orients + if not df.index.is_unique and orient in ['index','columns']: + 
self.assertRaises(ValueError, lambda : df.to_json(orient=orient)) + return + if not df.columns.is_unique and orient in ['index','columns','records']: + self.assertRaises(ValueError, lambda : df.to_json(orient=orient)) + return + dfjson = df.to_json(orient=orient) try: @@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False, return raise - unser = unser.sort() + if sort is not None and sort in unser.columns: + unser = unser.sort(sort) + else: + unser = unser.sort() if dtype is False: check_dtype=False @@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False, # index and col labels might not be strings unser.index = [str(i) for i in unser.index] unser.columns = [str(i) for i in unser.columns] - unser = unser.sort() + + if sort is None: + unser = unser.sort() assert_almost_equal(df.values, unser.values) else: if convert_axes: @@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False, assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) - def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None): + def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None): # numpy=False if convert_axes: - _check_orient(df, "columns", dtype=dtype) - _check_orient(df, "records", dtype=dtype) - _check_orient(df, "split", dtype=dtype) - _check_orient(df, "index", dtype=dtype) - _check_orient(df, "values", dtype=dtype) - - _check_orient(df, "columns", dtype=dtype, convert_axes=False) - _check_orient(df, "records", dtype=dtype, convert_axes=False) - _check_orient(df, "split", dtype=dtype, convert_axes=False) - _check_orient(df, "index", dtype=dtype, convert_axes=False) - _check_orient(df, "values", dtype=dtype ,convert_axes=False) + _check_orient(df, "columns", dtype=dtype, sort=sort) + _check_orient(df, "records", dtype=dtype, sort=sort) + _check_orient(df, "split", dtype=dtype, sort=sort) + _check_orient(df, "index", dtype=dtype, sort=sort) + _check_orient(df, 
"values", dtype=dtype, sort=sort) + + _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort) + _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort) + _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort) + _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort) + _check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort) # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: _check_orient(df, "columns", dtype=dtype, numpy=True, - raise_ok=raise_ok) + raise_ok=raise_ok, sort=sort) _check_orient(df, "records", dtype=dtype, numpy=True, - raise_ok=raise_ok) + raise_ok=raise_ok, sort=sort) _check_orient(df, "split", dtype=dtype, numpy=True, - raise_ok=raise_ok) + raise_ok=raise_ok, sort=sort) _check_orient(df, "index", dtype=dtype, numpy=True, - raise_ok=raise_ok) + raise_ok=raise_ok, sort=sort) _check_orient(df, "values", dtype=dtype, numpy=True, - raise_ok=raise_ok) + raise_ok=raise_ok, sort=sort) _check_orient(df, "columns", dtype=dtype, numpy=True, - convert_axes=False, raise_ok=raise_ok) + convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "records", dtype=dtype, numpy=True, - convert_axes=False, raise_ok=raise_ok) + convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "split", dtype=dtype, numpy=True, - convert_axes=False, raise_ok=raise_ok) + convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "index", dtype=dtype, numpy=True, - convert_axes=False, raise_ok=raise_ok) + convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "values", dtype=dtype, numpy=True, - convert_axes=False, raise_ok=raise_ok) + convert_axes=False, raise_ok=raise_ok, sort=sort) # basic _check_all_orients(self.frame) @@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None): _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', convert_axes=False, raise_ok=ValueError) + # categorical 
+ _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError) + # empty _check_all_orients(self.empty_frame)