Skip to content

Bug in to_json causing segfault with a CategoricalIndex (GH #10317) #10322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit on Jun 10, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ Bug Fixes
- Bug where read_hdf store.select modifies the passed columns list when
multi-indexed (:issue:`7212`)
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
- Bug in ``to_json`` with certain orients and a ``CategoricalIndex`` would segfault (:issue:`10317`)

- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`)

Expand Down
82 changes: 55 additions & 27 deletions pandas/io/tests/test_json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os

import numpy as np
from pandas import Series, DataFrame, DatetimeIndex, Timestamp
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex
from datetime import timedelta
import pandas as pd
read_json = pd.read_json
Expand All @@ -23,6 +23,11 @@
for k, v in compat.iteritems(_seriesd)))

_tsframe = DataFrame(_tsd)
_cat_frame = _frame.copy()
cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15)
_cat_frame.index = pd.CategoricalIndex(cat,name='E')
_cat_frame['E'] = list(reversed(cat))
_cat_frame['sort'] = np.arange(len(_cat_frame))

_mixed_frame = _frame.copy()

Expand All @@ -48,6 +53,7 @@ def setUp(self):
self.intframe = _intframe.copy()
self.tsframe = _tsframe.copy()
self.mixed_frame = _mixed_frame.copy()
self.categorical = _cat_frame.copy()

def tearDown(self):
del self.dirpath
Expand Down Expand Up @@ -128,8 +134,22 @@ def _check(df):

def test_frame_from_json_to_json(self):
def _check_orient(df, orient, dtype=None, numpy=False,
convert_axes=True, check_dtype=True, raise_ok=None):
df = df.sort()
convert_axes=True, check_dtype=True, raise_ok=None,
sort=None):
if sort is not None:
df = df.sort(sort)
else:
df = df.sort()

# if we are not unique, then check that we are raising ValueError
# for the appropriate orients
if not df.index.is_unique and orient in ['index','columns']:
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
return
if not df.columns.is_unique and orient in ['index','columns','records']:
self.assertRaises(ValueError, lambda : df.to_json(orient=orient))
return

dfjson = df.to_json(orient=orient)

try:
Expand All @@ -141,7 +161,10 @@ def _check_orient(df, orient, dtype=None, numpy=False,
return
raise

unser = unser.sort()
if sort is not None and sort in unser.columns:
unser = unser.sort(sort)
else:
unser = unser.sort()

if dtype is False:
check_dtype=False
Expand All @@ -160,7 +183,9 @@ def _check_orient(df, orient, dtype=None, numpy=False,
# index and col labels might not be strings
unser.index = [str(i) for i in unser.index]
unser.columns = [str(i) for i in unser.columns]
unser = unser.sort()

if sort is None:
unser = unser.sort()
assert_almost_equal(df.values, unser.values)
else:
if convert_axes:
Expand All @@ -169,45 +194,45 @@ def _check_orient(df, orient, dtype=None, numpy=False,
assert_frame_equal(df, unser, check_less_precise=False,
check_dtype=check_dtype)

def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None):

# numpy=False
if convert_axes:
_check_orient(df, "columns", dtype=dtype)
_check_orient(df, "records", dtype=dtype)
_check_orient(df, "split", dtype=dtype)
_check_orient(df, "index", dtype=dtype)
_check_orient(df, "values", dtype=dtype)

_check_orient(df, "columns", dtype=dtype, convert_axes=False)
_check_orient(df, "records", dtype=dtype, convert_axes=False)
_check_orient(df, "split", dtype=dtype, convert_axes=False)
_check_orient(df, "index", dtype=dtype, convert_axes=False)
_check_orient(df, "values", dtype=dtype ,convert_axes=False)
_check_orient(df, "columns", dtype=dtype, sort=sort)
_check_orient(df, "records", dtype=dtype, sort=sort)
_check_orient(df, "split", dtype=dtype, sort=sort)
_check_orient(df, "index", dtype=dtype, sort=sort)
_check_orient(df, "values", dtype=dtype, sort=sort)

_check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort)
_check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort)

# numpy=True and raise_ok might be not None, so ignore the error
if convert_axes:
_check_orient(df, "columns", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
raise_ok=raise_ok)
raise_ok=raise_ok, sort=sort)

_check_orient(df, "columns", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok)
convert_axes=False, raise_ok=raise_ok, sort=sort)

# basic
_check_all_orients(self.frame)
Expand All @@ -233,6 +258,9 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
convert_axes=False, raise_ok=ValueError)

# categorical
_check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

# empty
_check_all_orients(self.empty_frame)

Expand Down
29 changes: 20 additions & 9 deletions pandas/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -1814,7 +1814,7 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in

void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
{
PyObject *obj, *exc, *toDictFunc, *tmpObj;
PyObject *obj, *exc, *toDictFunc, *tmpObj, *getValuesFunc;
TypeContext *pc;
PyObjectEncoder *enc;
double val;
Expand Down Expand Up @@ -2082,14 +2082,25 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
return;
}

PRINTMARK();
tc->type = JT_ARRAY;
pc->newObj = PyObject_GetAttrString(obj, "values");
pc->iterBegin = NpyArr_iterBegin;
pc->iterEnd = NpyArr_iterEnd;
pc->iterNext = NpyArr_iterNext;
pc->iterGetValue = NpyArr_iterGetValue;
pc->iterGetName = NpyArr_iterGetName;
PyObject* getValuesFunc = PyObject_GetAttrString(obj, "get_values");
if (getValuesFunc)
{
PRINTMARK();
tc->type = JT_ARRAY;
pc->newObj = PyObject_CallObject(getValuesFunc, NULL);
pc->iterBegin = NpyArr_iterBegin;
pc->iterEnd = NpyArr_iterEnd;
pc->iterNext = NpyArr_iterNext;
pc->iterGetValue = NpyArr_iterGetValue;
pc->iterGetName = NpyArr_iterGetName;

Py_DECREF(getValuesFunc);
}
else
{
goto INVALID;
}

return;
}
else
Expand Down