BUG: Bug in to_json with certain orients and a CategoricalIndex would segfault, closes #10317 #10321

2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.16.2.txt
@@ -120,7 +120,7 @@ Bug Fixes
- Bug where read_hdf store.select modifies the passed columns list when
multi-indexed (:issue:`7212`)
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)

- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10317`)
- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)

- Bug in ``DataFrame.quantile`` on checking that a valid axis was passed (:issue:`9543`)
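A minimal reproduction of the scenario the whatsnew entry describes (illustrative only, not part of the patch): a frame indexed by a CategoricalIndex, serialized with each supported orient, which previously crashed the C serializer.

    import pandas as pd

    # Frame indexed by a CategoricalIndex -- the case that used to segfault.
    df = pd.DataFrame({'a': [1, 2, 3]},
                      index=pd.CategoricalIndex(['x', 'y', 'z'], name='cat'))

    # With the fix, every orient serializes cleanly; the categorical index is
    # written out as its plain values.
    for orient in ('split', 'records', 'index', 'columns', 'values'):
        print(orient, df.to_json(orient=orient))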
46 changes: 40 additions & 6 deletions pandas/io/json.py
@@ -11,7 +11,7 @@
from pandas import compat, isnull
from pandas import Series, DataFrame, to_datetime
from pandas.io.common import get_filepath_or_buffer
from pandas.core.common import AbstractMethodError
from pandas.core.common import AbstractMethodError, is_categorical_dtype
import pandas.core.common as com

loads = _json.loads
@@ -60,11 +60,32 @@ def __init__(self, obj, orient, date_format, double_precision,
self.ensure_ascii = ensure_ascii
self.date_unit = date_unit
self.default_handler = default_handler
self._coerce_axes()
self._coerce_data()

self.is_copy = None
self._format_axes()
def _coerce_axes(self):
for i in range(self.obj._AXIS_LEN):
self._coerce_axis(i)

def _format_axes(self):
def _coerce_axis(self, axis):
"""
Parameters
----------
axis : axis number

if the axis needs coercion, then copy the .obj
and set the index

"""

# GH 10317
# coerce CategoricalIndexes to Index dtypes
ax = self.obj._get_axis(axis)
if is_categorical_dtype(ax):
self.obj = self.obj.copy()
self.obj.set_axis(axis, np.array(ax))

def _coerce_data(self):
raise AbstractMethodError(self)

def write(self):
@@ -81,16 +102,20 @@ def write(self):
class SeriesWriter(Writer):
_default_orient = 'index'

def _format_axes(self):
def _coerce_axes(self):
if not self.obj.index.is_unique and self.orient == 'index':
raise ValueError("Series index must be unique for orient="
"'%s'" % self.orient)
super(SeriesWriter, self)._coerce_axes()

def _coerce_data(self):
if is_categorical_dtype(self.obj):
self.obj = np.array(self.obj)

class FrameWriter(Writer):
_default_orient = 'columns'

def _format_axes(self):
def _coerce_axes(self):
""" try to axes if they are datelike """
if not self.obj.index.is_unique and self.orient in (
'index', 'columns'):
@@ -100,7 +125,16 @@ def _format_axes(self):
'index', 'columns', 'records'):
raise ValueError("DataFrame columns must be unique for orient="
"'%s'." % self.orient)
super(FrameWriter, self)._coerce_axes()

def _coerce_data(self):

is_copy = False
for c, col in self.obj.iteritems():
if is_categorical_dtype(col):
if not is_copy:
is_copy, self.obj = True, self.obj.copy()
self.obj[c] = np.array(col)

def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
convert_axes=True, convert_dates=True, keep_default_dates=True,
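For reference, a standalone sketch of what the new _coerce_axes/_coerce_data steps amount to (coerce_for_json is a hypothetical helper, not the writer's actual method, and it assumes a DataFrame): any CategoricalIndex axis and any categorical column is replaced with a plain ndarray before the object reaches the C serializer. The patch copies lazily, only when a coercion is actually needed; the sketch copies up front for brevity.

    import numpy as np
    import pandas as pd

    def coerce_for_json(df):
        # Copy up front so the caller's frame is left untouched (the patch
        # itself only copies when something needs coercion).
        out = df.copy()

        # Coerce categorical axes (GH 10317): CategoricalIndex -> plain values.
        if isinstance(out.index, pd.CategoricalIndex):
            out.index = np.asarray(out.index)
        if isinstance(out.columns, pd.CategoricalIndex):
            out.columns = np.asarray(out.columns)

        # Coerce categorical columns to plain ndarrays as well.
        for c in out.columns:
            if str(out[c].dtype) == 'category':
                out[c] = np.asarray(out[c])
        return out

A Series goes through the same idea via SeriesWriter._coerce_data: a categorical Series is converted with np.array before writing.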
82 changes: 55 additions & 27 deletions pandas/io/tests/test_json/test_pandas.py
@@ -4,7 +4,7 @@
import os

import numpy as np
from pandas import Series, DataFrame, DatetimeIndex, Timestamp
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex
from datetime import timedelta
import pandas as pd
read_json = pd.read_json
@@ -23,6 +23,11 @@
for k, v in compat.iteritems(_seriesd)))

_tsframe = DataFrame(_tsd)
_cat_frame = _frame.copy()
cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15)
_cat_frame.index = pd.CategoricalIndex(cat,name='E')
_cat_frame['E'] = list(reversed(cat))
_cat_frame['sort'] = np.arange(len(_cat_frame))

_mixed_frame = _frame.copy()

@@ -48,6 +53,7 @@ def setUp(self):
self.intframe = _intframe.copy()
self.tsframe = _tsframe.copy()
self.mixed_frame = _mixed_frame.copy()
self.categorical = _cat_frame.copy()

def tearDown(self):
del self.dirpath
@@ -128,8 +134,22 @@ def _check(df):

def test_frame_from_json_to_json(self):
def _check_orient(df, orient, dtype=None, numpy=False,
convert_axes=True, check_dtype=True, raise_ok=None):
df = df.sort()
convert_axes=True, check_dtype=True, raise_ok=None,
sort=None):
if sort is not None:
df = df.sort(sort)
else:
df = df.sort()

# if we are not unique, then check that we are raising ValueError
# for the appropriate orients
if not df.index.is_unique and orient in ['index','columns']:
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
return
if not df.columns.is_unique and orient in ['index','columns','records']:
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
return

dfjson = df.to_json(orient=orient)

try:
@@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False,
return
raise

unser = unser.sort()
if sort is not None and sort in unser.columns:
unser = unser.sort(sort)
else:
unser = unser.sort()

if dtype is False:
check_dtype=False
@@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False,
# index and col labels might not be strings
unser.index = [str(i) for i in unser.index]
unser.columns = [str(i) for i in unser.columns]
unser = unser.sort()

if sort is None:
unser = unser.sort()
assert_almost_equal(df.values, unser.values)
else:
if convert_axes:
@@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False,
assert_frame_equal(df, unser, check_less_precise=False,
check_dtype=check_dtype)

def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None):

# numpy=False
if convert_axes:
_check_orient(df, "columns", dtype=dtype)
_check_orient(df, "records", dtype=dtype)
_check_orient(df, "split", dtype=dtype)
_check_orient(df, "index", dtype=dtype)
_check_orient(df, "values", dtype=dtype)

_check_orient(df, "columns", dtype=dtype, convert_axes=False)
_check_orient(df, "records", dtype=dtype, convert_axes=False)
_check_orient(df, "split", dtype=dtype, convert_axes=False)
_check_orient(df, "index", dtype=dtype, convert_axes=False)
_check_orient(df, "values", dtype=dtype ,convert_axes=False)
_check_orient(df, "columns", dtype=dtype, sort=sort)
_check_orient(df, "records", dtype=dtype, sort=sort)
_check_orient(df, "split", dtype=dtype, sort=sort)
_check_orient(df, "index", dtype=dtype, sort=sort)
_check_orient(df, "values", dtype=dtype, sort=sort)

_check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort)

# numpy=True and raise_ok might be not None, so ignore the error
if convert_axes:
_check_orient(df, "columns", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)

_check_orient(df, "columns", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)

# basic
_check_all_orients(self.frame)
@@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
convert_axes=False, raise_ok=ValueError)

# categorical
_check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

# empty
_check_all_orients(self.empty_frame)

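In the same spirit as the new _cat_frame fixture and the sort parameter above, a small round-trip check (illustrative; the column names and values here are made up): serialize a categorically-indexed frame, read it back, and compare after sorting on a dedicated column, since orients such as 'records' do not carry the index.

    from io import StringIO

    import numpy as np
    import pandas as pd

    idx = pd.CategoricalIndex(['a', 'a', 'b'], name='E')
    df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'sort': np.arange(3)}, index=idx)

    out = pd.read_json(StringIO(df.to_json(orient='records')), orient='records')

    # 'records' drops the index, so compare values after sorting on 'sort';
    # dtypes are not checked because categorical data comes back as plain values.
    pd.testing.assert_frame_equal(
        df.reset_index(drop=True).sort_values('sort'),
        out.sort_values('sort'),
        check_dtype=False,
    )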