Skip to content

Commit da43864

Browse files
committed
Fix handling of duplicate column names in select_dtypes and get_dummies
1 parent 6d610a4 commit da43864

File tree

2 files changed

+22
-18
lines changed

2 files changed

+22
-18
lines changed

pandas/core/frame.py

+6-6
Original file line number | Diff line number | Diff line change
@@ -2991,15 +2991,15 @@ def select_dtypes(self, include=None, exclude=None):
29912991
include_these = Series(not bool(include), index=self.columns)
29922992
exclude_these = Series(not bool(exclude), index=self.columns)
29932993

2994-
def is_dtype_instance_mapper(column, dtype):
2995-
return column, functools.partial(issubclass, dtype.type)
2994+
def is_dtype_instance_mapper(idx, dtype):
2995+
return idx, functools.partial(issubclass, dtype.type)
29962996

2997-
for column, f in itertools.starmap(is_dtype_instance_mapper,
2998-
self.dtypes.iteritems()):
2997+
for idx, f in itertools.starmap(is_dtype_instance_mapper,
2998+
enumerate(self.dtypes)):
29992999
if include: # checks for the case of empty include or exclude
3000-
include_these[column] = any(map(f, include))
3000+
include_these.iloc[idx] = any(map(f, include))
30013001
if exclude:
3002-
exclude_these[column] = not any(map(f, exclude))
3002+
exclude_these.iloc[idx] = not any(map(f, exclude))
30033003

30043004
dtype_indexer = include_these & exclude_these
30053005
return self.loc[com._get_info_slice(self, dtype_indexer)]

pandas/core/reshape/reshape.py

+16-12
Original file line number | Diff line number | Diff line change
@@ -826,45 +826,49 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
826826

827827
if columns is None:
828828
columns_to_encode = data.select_dtypes(
829-
include=['object', 'category']).columns
829+
include=['object', 'category'])
830830
else:
831-
columns_to_encode = columns
831+
columns_to_encode = data[columns]
832832

833833
# validate prefixes and separator to avoid silently dropping cols
834834
def check_len(item, name):
835835
len_msg = ("Length of '{name}' ({len_item}) did not match the "
836836
"length of the columns being encoded ({len_enc}).")
837837

838838
if is_list_like(item):
839-
if not len(item) == len(columns_to_encode):
840-
len_msg = len_msg.format(name=name, len_item=len(item),
841-
len_enc=len(columns_to_encode))
839+
if not len(item) == columns_to_encode.shape[1]:
840+
len_msg = \
841+
len_msg.format(name=name, len_item=len(item),
842+
len_enc=columns_to_encode.shape[1])
842843
raise ValueError(len_msg)
843844

844845
check_len(prefix, 'prefix')
845846
check_len(prefix_sep, 'prefix_sep')
846847
if isinstance(prefix, compat.string_types):
847848
prefix = cycle([prefix])
848849
if isinstance(prefix, dict):
849-
prefix = [prefix[col] for col in columns_to_encode]
850+
prefix = [prefix[col] for col in columns_to_encode.columns]
850851

851852
if prefix is None:
852-
prefix = columns_to_encode
853+
prefix = columns_to_encode.columns
853854

854855
# validate separators
855856
if isinstance(prefix_sep, compat.string_types):
856857
prefix_sep = cycle([prefix_sep])
857858
elif isinstance(prefix_sep, dict):
858-
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
859+
prefix_sep = [prefix_sep[col] for col in columns_to_encode.columns]
859860

860-
if set(columns_to_encode) == set(data.columns):
861+
if columns_to_encode.shape == data.shape:
861862
with_dummies = []
863+
elif columns is not None:
864+
with_dummies = [data.drop(columns, axis=1)]
862865
else:
863-
with_dummies = [data.drop(columns_to_encode, axis=1)]
866+
with_dummies = [data.select_dtypes(exclude=['object', 'category'])]
864867

865-
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
868+
for (col, pre, sep) in zip(columns_to_encode.iteritems(), prefix,
869+
prefix_sep):
866870

867-
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
871+
dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
868872
dummy_na=dummy_na, sparse=sparse,
869873
drop_first=drop_first, dtype=dtype)
870874
with_dummies.append(dummy)

0 commit comments

Comments
 (0)