diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 5b3e607956f7a..c908d29716a7d 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -81,6 +81,7 @@ Bug Fixes
 
 **Categorical**
 
+- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`)
 -
 
 **Timezones**
diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
index c6f182ac5003f..4489847518a1d 100644
--- a/pandas/_libs/hashing.pyx
+++ b/pandas/_libs/hashing.pyx
@@ -8,8 +8,7 @@ import numpy as np
 from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
 
 from util cimport _checknull
-from cpython cimport (PyString_Check,
-                      PyBytes_Check,
+from cpython cimport (PyBytes_Check,
                       PyUnicode_Check)
 from libc.stdlib cimport malloc, free
 
@@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
     cdef list datas = []
     for i in range(n):
         val = arr[i]
-        if PyString_Check(val):
-            data = val.encode(encoding)
-        elif PyBytes_Check(val):
+        if PyBytes_Check(val):
             data = val
         elif PyUnicode_Check(val):
             data = val.encode(encoding)
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py
index 97236f028b1c4..730c2b7865f1f 100644
--- a/pandas/tests/series/test_repr.py
+++ b/pandas/tests/series/test_repr.py
@@ -11,6 +11,7 @@
 from pandas import (Index, Series, DataFrame, date_range, option_context,
                     Categorical, period_range, timedelta_range)
 from pandas.core.index import MultiIndex
+from pandas.core.base import StringMixin
 
 from pandas.compat import lrange, range, u
 from pandas import compat
@@ -202,6 +203,35 @@ def test_latex_repr(self):
 
 class TestCategoricalRepr(object):
 
+    def test_categorical_repr_unicode(self):
+        # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii',
+        # and we are working in PY2, then rendering a Categorical could raise
+        # UnicodeDecodeError by trying to decode when it shouldn't
+
+        class County(StringMixin):
+            name = u'San Sebastián'
+            state = u'PR'
+
+            def __unicode__(self):
+                return self.name + u', ' + self.state
+
+        cat = pd.Categorical([County() for n in range(61)])
+        idx = pd.Index(cat)
+        ser = idx.to_series()
+
+        if compat.PY3:
+            # no reloading of sys, just check that the default (utf8) works
+            # as expected
+            repr(ser)
+            str(ser)
+
+        else:
+            # set sys.defaultencoding to ascii, then change it back after
+            # the test
+            with tm.set_defaultencoding('ascii'):
+                repr(ser)
+                str(ser)
+
     def test_categorical_repr(self):
         a = Series(Categorical([1, 2, 3, 4]))
         exp = u("0    1\n1    2\n2    3\n3    4\n" +
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index d26a2116fb3ce..b9e53dfc80020 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -553,6 +553,28 @@ def _valid_locales(locales, normalize):
 # Stdout / stderr decorators
 
 
+@contextmanager
+def set_defaultencoding(encoding):
+    """
+    Set default encoding (as given by sys.getdefaultencoding()) to the given
+    encoding; restore on exit.
+
+    Parameters
+    ----------
+    encoding : str
+    """
+    if not PY2:
+        raise ValueError("set_defaultencoding context is only available "
+                         "in Python 2.")
+    orig = sys.getdefaultencoding()
+    reload(sys)  # noqa:F821
+    sys.setdefaultencoding(encoding)
+    try:
+        yield
+    finally:
+        sys.setdefaultencoding(orig)
+
+
 def capture_stdout(f):
     """
     Decorator to capture stdout in a buffer so that it can be checked