Skip to content

Commit 3caed45

Browse files
Scorpilvictor
authored and
victor
committed
BUG: Fix get dummies unicode error (pandas-dev#22131)
1 parent fc9de76 commit 3caed45

File tree

3 files changed

+36
-7
lines changed

3 files changed

+36
-7
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ Reshaping
660660
- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
661661
- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`)
662662
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
663+
- Bug in :func:`get_dummies` when column names, values, ``prefix`` or ``prefix_sep`` contain Unicode characters in Python 2 (:issue:`22084`)
663664
-
664665

665666
Build Changes

pandas/core/reshape/reshape.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# pylint: disable=E1101,E1103
22
# pylint: disable=W0703,W0622,W0613,W0201
3-
from pandas.compat import range, text_type, zip
3+
from pandas.compat import range, text_type, zip, u, PY2
44
from pandas import compat
55
from functools import partial
66
import itertools
@@ -923,13 +923,23 @@ def get_empty_Frame(data, sparse):
923923

924924
number_of_cols = len(levels)
925925

926-
if prefix is not None:
927-
dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
928-
else '{prefix}{sep}{level}' for v in levels]
929-
dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
930-
for dummy_str, v in zip(dummy_strs, levels)]
931-
else:
926+
if prefix is None:
932927
dummy_cols = levels
928+
else:
929+
930+
# In PY2 the format string must itself be unicode if any part is unicode, gh-22084
931+
def _make_col_name(prefix, prefix_sep, level):
932+
fstr = '{prefix}{prefix_sep}{level}'
933+
if PY2 and (isinstance(prefix, text_type) or
934+
isinstance(prefix_sep, text_type) or
935+
isinstance(level, text_type)):
936+
fstr = u(fstr)
937+
return fstr.format(prefix=prefix,
938+
prefix_sep=prefix_sep,
939+
level=level)
940+
941+
dummy_cols = [_make_col_name(prefix, prefix_sep, level)
942+
for level in levels]
933943

934944
if isinstance(data, Series):
935945
index = data.index

pandas/tests/reshape/test_reshape.py

+18
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,24 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
302302
expected.sort_index(axis=1)
303303
assert_frame_equal(result, expected)
304304

305+
@pytest.mark.parametrize('get_dummies_kwargs,expected', [
306+
({'data': pd.DataFrame(({u'ä': ['a']}))},
307+
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
308+
309+
({'data': pd.DataFrame({'x': [u'ä']})},
310+
pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),
311+
312+
({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
313+
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
314+
315+
({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
316+
pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
317+
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
318+
# GH22084 pd.get_dummies incorrectly encodes unicode characters
319+
# in dataframe column names
320+
result = get_dummies(**get_dummies_kwargs)
321+
assert_frame_equal(result, expected)
322+
305323
def test_basic_drop_first(self, sparse):
306324
# GH12402 Add a new parameter `drop_first` to avoid collinearity
307325
# Basic case

0 commit comments

Comments
 (0)