Skip to content

fix hashing string-casting error #21187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 21, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ Bug Fixes

**Categorical**

- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`)
-

**Timezones**
Expand Down
7 changes: 2 additions & 5 deletions pandas/_libs/hashing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from util cimport _checknull
from cpython cimport (PyString_Check,
PyBytes_Check,
from cpython cimport (PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free

Expand Down Expand Up @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
cdef list datas = []
for i in range(n):
val = arr[i]
if PyString_Check(val):
data = <bytes>val.encode(encoding)
elif PyBytes_Check(val):
if PyBytes_Check(val):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/series/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas import (Index, Series, DataFrame, date_range, option_context,
Categorical, period_range, timedelta_range)
from pandas.core.index import MultiIndex
from pandas.core.base import StringMixin

from pandas.compat import lrange, range, u
from pandas import compat
Expand Down Expand Up @@ -202,6 +203,35 @@ def test_latex_repr(self):

class TestCategoricalRepr(object):

def test_categorical_repr_unicode(self):
# GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii',
# and we are working in PY2, then rendering a Categorical could raise
# UnicodeDecodeError by trying to decode when it shouldn't

# objects whose unicode form contains a non-ASCII character (the
# accented letter in ``name``)
class County(StringMixin):
name = u'San Sebastián'
state = u'PR'

def __unicode__(self):
return self.name + u', ' + self.state

# 61 entries puts len(index) over the 60 threshold described above
cat = pd.Categorical([County() for n in range(61)])
idx = pd.Index(cat)
ser = idx.to_series()

if compat.PY3:
# no reloading of sys, just check that the default (utf8) works
# as expected
repr(ser)
str(ser)

else:
# set sys.defaultencoding to ascii, then change it back after
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this into a context manager in pandas.util.testing

# the test
with tm.set_defaultencoding('ascii'):
repr(ser)
str(ser)

def test_categorical_repr(self):
a = Series(Categorical([1, 2, 3, 4]))
exp = u("0 1\n1 2\n2 3\n3 4\n" +
Expand Down
22 changes: 22 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize):
# Stdout / stderr decorators


@contextmanager
def set_defaultencoding(encoding):
    """
    Set default encoding (as given by sys.getdefaultencoding()) to the given
    encoding; restore on exit.

    Parameters
    ----------
    encoding : str
        Name of the encoding to install as the interpreter-wide default
        for the duration of the ``with`` block.

    Raises
    ------
    ValueError
        If used on an interpreter other than Python 2.

    Notes
    -----
    ``sys.setdefaultencoding`` is deleted from the ``sys`` namespace during
    interpreter start-up, so ``sys`` has to be reloaded to get it back.
    Reloading also rebinds ``sys.stdin``, ``sys.stdout`` and ``sys.stderr``
    to the interpreter's start-up streams; the streams that were current on
    entry are therefore put back right after the reload, so that any
    redirection already in place (e.g. a test runner capturing output)
    is not lost.
    """
    if not PY2:
        raise ValueError("set_defaultencoding context is only available "
                         "in Python 2.")
    orig = sys.getdefaultencoding()
    # Remember the active standard streams: reload(sys) would otherwise
    # replace them with the ones the interpreter was started with.
    streams = (sys.stdin, sys.stdout, sys.stderr)
    reload(sys)  # noqa:F821
    sys.stdin, sys.stdout, sys.stderr = streams
    sys.setdefaultencoding(encoding)
    try:
        yield
    finally:
        # always put the original default encoding back, even if the body
        # of the ``with`` block raised
        sys.setdefaultencoding(orig)


def capture_stdout(f):
"""
Decorator to capture stdout in a buffer so that it can be checked
Expand Down