Skip to content

Commit f4ef546

Browse files
jbrockmendel authored and victor committed
fix hashing string-casting error (pandas-dev#21187)
1 parent 82be391 commit f4ef546

File tree

4 files changed

+55
-5
lines changed

4 files changed

+55
-5
lines changed

doc/source/whatsnew/v0.23.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ Bug Fixes
8686

8787
**Categorical**
8888

89+
- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`)
8990
-
9091

9192
**Timezones**

pandas/_libs/hashing.pyx

+2-5
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ import numpy as np
88
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
99

1010
from util cimport _checknull
11-
from cpython cimport (PyString_Check,
12-
PyBytes_Check,
11+
from cpython cimport (PyBytes_Check,
1312
PyUnicode_Check)
1413
from libc.stdlib cimport malloc, free
1514

@@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
6261
cdef list datas = []
6362
for i in range(n):
6463
val = arr[i]
65-
if PyString_Check(val):
66-
data = <bytes>val.encode(encoding)
67-
elif PyBytes_Check(val):
64+
if PyBytes_Check(val):
6865
data = <bytes>val
6966
elif PyUnicode_Check(val):
7067
data = <bytes>val.encode(encoding)

pandas/tests/series/test_repr.py

+30
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pandas import (Index, Series, DataFrame, date_range, option_context,
1212
Categorical, period_range, timedelta_range)
1313
from pandas.core.index import MultiIndex
14+
from pandas.core.base import StringMixin
1415

1516
from pandas.compat import lrange, range, u
1617
from pandas import compat
@@ -202,6 +203,35 @@ def test_latex_repr(self):
202203

203204
class TestCategoricalRepr(object):
204205

206+
def test_categorical_repr_unicode(self):
    # GH#21002: under PY2, when sys.getdefaultencoding() == 'ascii' and
    # the index holds more than 60 entries, rendering a Categorical could
    # raise UnicodeDecodeError by attempting a decode it should not do.

    class County(StringMixin):
        name = u'San Sebastián'
        state = u'PR'

        def __unicode__(self):
            return self.name + u', ' + self.state

    # 61 elements: one more than the 60-entry threshold described above
    counties = [County() for _ in range(61)]
    ser = pd.Index(pd.Categorical(counties)).to_series()

    if not compat.PY3:
        # force the default encoding to ascii for the duration of the
        # rendering calls; the context manager restores it afterwards
        with tm.set_defaultencoding('ascii'):
            repr(ser)
            str(ser)
    else:
        # sys is not reloaded here; only confirm that rendering succeeds
        # with the default (utf8) encoding
        repr(ser)
        str(ser)
234+
205235
def test_categorical_repr(self):
206236
a = Series(Categorical([1, 2, 3, 4]))
207237
exp = u("0 1\n1 2\n2 3\n3 4\n" +

pandas/util/testing.py

+22
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,28 @@ def _valid_locales(locales, normalize):
553553
# Stdout / stderr decorators
554554

555555

556+
@contextmanager
def set_defaultencoding(encoding):
    """
    Set default encoding (as given by sys.getdefaultencoding()) to the given
    encoding; restore on exit.

    The standard streams (``sys.stdin``, ``sys.stdout``, ``sys.stderr``) that
    are in place when the context is entered are kept in place across the
    ``reload(sys)`` this function performs.

    Parameters
    ----------
    encoding : str
        Name of the encoding to install as the default.

    Raises
    ------
    ValueError
        On entering the context when not running under Python 2.
    """
    if not PY2:
        raise ValueError("set_defaultencoding context is only available "
                         "in Python 2.")
    orig = sys.getdefaultencoding()
    # reload(sys) is what makes sys.setdefaultencoding reachable again (the
    # site module removes it at startup), but reloading also rebinds the
    # standard streams to the interpreter's startup objects. Remember the
    # current ones so that redirection/capture stays intact.
    streams = (sys.stdin, sys.stdout, sys.stderr)
    reload(sys)  # noqa:F821
    sys.stdin, sys.stdout, sys.stderr = streams
    sys.setdefaultencoding(encoding)
    try:
        yield
    finally:
        # always put the original default encoding back, even on error
        sys.setdefaultencoding(orig)
576+
577+
556578
def capture_stdout(f):
557579
"""
558580
Decorator to capture stdout in a buffer so that it can be checked

0 commit comments

Comments
 (0)