Skip to content

Commit c2d3cae

Browse files
committed
resolve comments and add whatsnew entry
1 parent 3b8086a commit c2d3cae

File tree

2 files changed

+12
-4
lines changed

2 files changed

+12
-4
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,7 @@ Reshaping
13561356
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
13571357
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
13581358
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
1359+
- Bug in :func:`get_dummies` and :meth:`DataFrame.select_dtypes` where duplicate column names caused incorrect behavior (:issue:`20848`)
13591360

13601361
Other
13611362
^^^^^

pandas/core/reshape/reshape.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -821,12 +821,13 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
821821
from pandas.core.reshape.concat import concat
822822
from itertools import cycle
823823

824+
dtypes_to_encode = ['object', 'category']
825+
824826
if isinstance(data, DataFrame):
825827
# determine columns being encoded
826-
827828
if columns is None:
828829
data_to_encode = data.select_dtypes(
829-
include=['object', 'category'])
830+
include=dtypes_to_encode)
830831
else:
831832
data_to_encode = data[columns]
832833

@@ -844,6 +845,7 @@ def check_len(item, name):
844845

845846
check_len(prefix, 'prefix')
846847
check_len(prefix_sep, 'prefix_sep')
848+
847849
if isinstance(prefix, compat.string_types):
848850
prefix = cycle([prefix])
849851
if isinstance(prefix, dict):
@@ -859,15 +861,20 @@ def check_len(item, name):
859861
prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
860862

861863
if data_to_encode.shape == data.shape:
864+
# Encoding the entire df, do not prepend any dropped columns
862865
with_dummies = []
863866
elif columns is not None:
867+
# Encoding only cols specified in columns. Get all cols not in
868+
# columns to prepend to result.
864869
with_dummies = [data.drop(columns, axis=1)]
865870
else:
866-
with_dummies = [data.select_dtypes(exclude=['object', 'category'])]
871+
# Encoding only object and category dtype columns. Get remaining
872+
# columns to prepend to result.
873+
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
867874

868875
for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
869876
prefix_sep):
870-
877+
# col is (column_name, column), use just column data here
871878
dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
872879
dummy_na=dummy_na, sparse=sparse,
873880
drop_first=drop_first, dtype=dtype)

0 commit comments

Comments
 (0)