Closed
Description
Calling repr() on a Series with a categorical dtype can raise UnicodeDecodeError under certain conditions. These conditions appear to include:
- The series must have length at least 61 (note: pd.get_option('max_rows') == 60)
- python2
- sys.getdefaultencoding() == 'ascii'
Reproduce with:
from pandas.core.base import StringMixin
class County(StringMixin):
name = u'San Sebastián'
state = u'PR'
def __unicode__(self):
return self.name + u', ' + self.state
cat = pd.Categorical([County() for n in range(61)])
idx = pd.Index(cat)
ser = idx.to_series()
>>> ser
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "pandas/core/base.py", line 82, in __repr__
return str(self)
File "pandas/core/base.py", line 62, in __str__
return self.__bytes__()
File "pandas/core/base.py", line 74, in __bytes__
return self.__unicode__().encode(encoding, 'replace')
File "pandas/core/series.py", line 1233, in __unicode__
max_rows=max_rows, length=show_dimensions)
File "pandas/core/series.py", line 1276, in to_string
max_rows=max_rows)
File "pandas/io/formats/format.py", line 187, in __init__
self._chk_truncate()
File "pandas/io/formats/format.py", line 201, in _chk_truncate
series.iloc[-row_num:]))
File "pandas/core/reshape/concat.py", line 225, in concat
copy=copy, sort=sort)
File "pandas/core/reshape/concat.py", line 378, in __init__
self.new_axes = self._get_new_axes()
File "pandas/core/reshape/concat.py", line 458, in _get_new_axes
new_axes[self.axis] = self._get_concat_axis()
File "pandas/core/reshape/concat.py", line 511, in _get_concat_axis
concat_axis = _concat_indexes(indexes)
File "pandas/core/reshape/concat.py", line 529, in _concat_indexes
return indexes[0].append(indexes[1:])
File "pandas/core/indexes/base.py", line 2126, in append
return self._concat(to_concat, name)
File "pandas/core/indexes/category.py", line 771, in _concat
return CategoricalIndex._concat_same_dtype(self, to_concat, name)
File "pandas/core/indexes/category.py", line 778, in _concat_same_dtype
to_concat = [self._is_dtype_compat(c) for c in to_concat]
File "pandas/core/indexes/category.py", line 232, in _is_dtype_compat
if not other.is_dtype_equal(self):
File "pandas/core/arrays/categorical.py", line 2242, in is_dtype_equal
return hash(self.dtype) == hash(other.dtype)
File "pandas/core/dtypes/dtypes.py", line 181, in __hash__
return int(self._hash_categories(self.categories, self.ordered))
File "pandas/core/dtypes/dtypes.py", line 250, in _hash_categories
cat_array = hash_array(np.asarray(categories), categorize=False)
File "pandas/core/util/hashing.py", line 296, in hash_array
hash_key, encoding)
File "pandas/_libs/hashing.pyx", line 66, in pandas._libs.hashing.hash_object_array
data = <bytes>val.encode(encoding)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 11: ordinal not in range(128)
It tentatively looks like the issue is in _libs.hashing.hash_object_array:
if PyString_Check(val):
data = <bytes>val.encode(encoding)
elif PyBytes_Check(val):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
When we get here, val is already a str in both py2 and py3, so we go down the `if PyString_Check(val):` branch. But in py2, calling encode on a str (a byte string) first implicitly decodes it using sys.getdefaultencoding(), and that decode step is what raises.
So my best guess is that the PyString_Check branch just doesn't belong.
I'll take a look for related issues.