From 111184760f9d910b607d261285c27c2d4b162c52 Mon Sep 17 00:00:00 2001 From: Kunal Gosar Date: Thu, 26 Apr 2018 22:02:16 -0700 Subject: [PATCH 1/5] fix for duplicate cols in select_dtypes and get_dummies --- pandas/core/frame.py | 12 ++++++------ pandas/core/reshape/reshape.py | 28 ++++++++++++++++------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ffb124af4f5fc..ffb2ad046158f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3076,15 +3076,15 @@ def select_dtypes(self, include=None, exclude=None): include_these = Series(not bool(include), index=self.columns) exclude_these = Series(not bool(exclude), index=self.columns) - def is_dtype_instance_mapper(column, dtype): - return column, functools.partial(issubclass, dtype.type) + def is_dtype_instance_mapper(idx, dtype): + return idx, functools.partial(issubclass, dtype.type) - for column, f in itertools.starmap(is_dtype_instance_mapper, - self.dtypes.iteritems()): + for idx, f in itertools.starmap(is_dtype_instance_mapper, + enumerate(self.dtypes)): if include: # checks for the case of empty include or exclude - include_these[column] = any(map(f, include)) + include_these.iloc[idx] = any(map(f, include)) if exclude: - exclude_these[column] = not any(map(f, exclude)) + exclude_these.iloc[idx] = not any(map(f, exclude)) dtype_indexer = include_these & exclude_these return self.loc[com._get_info_slice(self, dtype_indexer)] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 389f1af48434a..279691eed63af 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -826,9 +826,9 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, if columns is None: columns_to_encode = data.select_dtypes( - include=['object', 'category']).columns + include=['object', 'category']) else: - columns_to_encode = columns + columns_to_encode = data[columns] # validate prefixes and 
separator to avoid silently dropping cols def check_len(item, name): @@ -836,9 +836,10 @@ def check_len(item, name): "length of the columns being encoded ({len_enc}).") if is_list_like(item): - if not len(item) == len(columns_to_encode): - len_msg = len_msg.format(name=name, len_item=len(item), - len_enc=len(columns_to_encode)) + if not len(item) == columns_to_encode.shape[1]: + len_msg = \ + len_msg.format(name=name, len_item=len(item), + len_enc=columns_to_encode.shape[1]) raise ValueError(len_msg) check_len(prefix, 'prefix') @@ -846,25 +847,28 @@ def check_len(item, name): if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): - prefix = [prefix[col] for col in columns_to_encode] + prefix = [prefix[col] for col in columns_to_encode.columns] if prefix is None: - prefix = columns_to_encode + prefix = columns_to_encode.columns # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): - prefix_sep = [prefix_sep[col] for col in columns_to_encode] + prefix_sep = [prefix_sep[col] for col in columns_to_encode.columns] - if set(columns_to_encode) == set(data.columns): + if columns_to_encode.shape == data.shape: with_dummies = [] + elif columns is not None: + with_dummies = [data.drop(columns, axis=1)] else: - with_dummies = [data.drop(columns_to_encode, axis=1)] + with_dummies = [data.select_dtypes(exclude=['object', 'category'])] - for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): + for (col, pre, sep) in zip(columns_to_encode.iteritems(), prefix, + prefix_sep): - dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, + dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) From e97398c42d41b5be73ae1cd8f1491cc3b8dc9bfb Mon Sep 17 00:00:00 2001 From: Kunal Gosar Date: Fri, 27 Apr 2018 00:05:12 -0700 Subject: [PATCH 2/5] 
implement tests to check duplicate cols --- pandas/tests/frame/test_dtypes.py | 15 +++++++++++++++ pandas/tests/reshape/test_reshape.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 152159965036d..13ca649dcec84 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -287,6 +287,21 @@ def test_select_dtypes_include_exclude_mixed_scalars_lists(self): ei = df[['b', 'c', 'f', 'k']] assert_frame_equal(ri, ei) + def test_select_dtypes_duplicate_columns(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + df.columns = ['a', 'a', 'b', 'b', 'b', 'c'] + + e = DataFrame({'a': list(range(1, 4)), + 'b': np.arange(3, 6).astype('u1')}) + + r = df.select_dtypes(include=[np.number], exclude=['floating']) + assert_frame_equal(r, e) + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index c4d925b83585b..fbade33454bf1 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -465,6 +465,20 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse): tm.assert_frame_equal(df[['GDP']], df2) + def test_get_dummies_duplicate_columns(self, df): + df.columns = ["A", "A", "A"] + result = get_dummies(df).sort_index(axis=1) + + expected = DataFrame([[1, 1, 0, 1, 0], + [2, 0, 1, 1, 0], + [3, 1, 0, 0, 1]], + columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'], + dtype=np.uint8).sort_index(axis=1) + + expected = expected.astype({"A": np.int64}) + + tm.assert_frame_equal(result, expected) + class TestCategoricalReshape(object): From 3b8086a26110edd0bff3be57918f21363abcca9f Mon Sep 17 00:00:00 2001 From: 
Kunal Gosar Date: Fri, 27 Apr 2018 10:09:10 -0700 Subject: [PATCH 3/5] addressing comments --- pandas/core/reshape/reshape.py | 18 +++++++++--------- pandas/tests/frame/test_dtypes.py | 22 ++++++++++++---------- pandas/tests/reshape/test_reshape.py | 1 + 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 279691eed63af..928a08e03c5b5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -825,10 +825,10 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, # determine columns being encoded if columns is None: - columns_to_encode = data.select_dtypes( + data_to_encode = data.select_dtypes( include=['object', 'category']) else: - columns_to_encode = data[columns] + data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): @@ -836,10 +836,10 @@ def check_len(item, name): "length of the columns being encoded ({len_enc}).") if is_list_like(item): - if not len(item) == columns_to_encode.shape[1]: + if not len(item) == data_to_encode.shape[1]: len_msg = \ len_msg.format(name=name, len_item=len(item), - len_enc=columns_to_encode.shape[1]) + len_enc=data_to_encode.shape[1]) raise ValueError(len_msg) check_len(prefix, 'prefix') @@ -847,25 +847,25 @@ def check_len(item, name): if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): - prefix = [prefix[col] for col in columns_to_encode.columns] + prefix = [prefix[col] for col in data_to_encode.columns] if prefix is None: - prefix = columns_to_encode.columns + prefix = data_to_encode.columns # validate separators if isinstance(prefix_sep, compat.string_types): prefix_sep = cycle([prefix_sep]) elif isinstance(prefix_sep, dict): - prefix_sep = [prefix_sep[col] for col in columns_to_encode.columns] + prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] - if columns_to_encode.shape == 
data.shape: + if data_to_encode.shape == data.shape: with_dummies = [] elif columns is not None: with_dummies = [data.drop(columns, axis=1)] else: with_dummies = [data.select_dtypes(exclude=['object', 'category'])] - for (col, pre, sep) in zip(columns_to_encode.iteritems(), prefix, + for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 13ca649dcec84..4c9f8c2ea0980 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -288,19 +288,21 @@ def test_select_dtypes_include_exclude_mixed_scalars_lists(self): assert_frame_equal(ri, ei) def test_select_dtypes_duplicate_columns(self): - df = DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + # GH20839 + odict = compat.OrderedDict + df = DataFrame(odict([('a', list('abc')), + ('b', list(range(1, 4))), + ('c', np.arange(3, 6).astype('u1')), + ('d', np.arange(4.0, 7.0, dtype='float64')), + ('e', [True, False, True]), + ('f', pd.date_range('now', periods=3).values)])) df.columns = ['a', 'a', 'b', 'b', 'b', 'c'] - e = DataFrame({'a': list(range(1, 4)), - 'b': np.arange(3, 6).astype('u1')}) + expected = DataFrame({'a': list(range(1, 4)), + 'b': np.arange(3, 6).astype('u1')}) - r = df.select_dtypes(include=[np.number], exclude=['floating']) - assert_frame_equal(r, e) + result = df.select_dtypes(include=[np.number], exclude=['floating']) + assert_frame_equal(result, expected) def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df = DataFrame({'a': list('abc'), diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index fbade33454bf1..295801f3e8def 100644 --- a/pandas/tests/reshape/test_reshape.py +++ 
b/pandas/tests/reshape/test_reshape.py @@ -466,6 +466,7 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse): tm.assert_frame_equal(df[['GDP']], df2) def test_get_dummies_duplicate_columns(self, df): + # GH20839 df.columns = ["A", "A", "A"] result = get_dummies(df).sort_index(axis=1) From c2d3cae732aa3c1ce5efceedaafb004008514315 Mon Sep 17 00:00:00 2001 From: Kunal Gosar Date: Fri, 4 May 2018 14:34:33 -0700 Subject: [PATCH 4/5] resolve comments and add whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/reshape.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 979fbb5ddfdd0..0fc32f025f604 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1356,6 +1356,7 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) +- Bug in :func: `get_dummies`, :func: `select_dtypes`, where duplicate column names caused incorrect behavior (:issue: `20848`) Other ^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 928a08e03c5b5..0829aa8f5a509 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -821,12 +821,13 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.core.reshape.concat import concat from itertools import cycle + dtypes_to_encode = ['object', 'category'] + if isinstance(data, DataFrame): # determine columns being encoded - if columns is None: data_to_encode = data.select_dtypes( - include=['object', 'category']) + include=dtypes_to_encode) else: data_to_encode = data[columns] @@ -844,6 +845,7 @@ def 
check_len(item, name): check_len(prefix, 'prefix') check_len(prefix_sep, 'prefix_sep') + if isinstance(prefix, compat.string_types): prefix = cycle([prefix]) if isinstance(prefix, dict): @@ -859,15 +861,20 @@ def check_len(item, name): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] if data_to_encode.shape == data.shape: + # Encoding the entire df, do not prepend any dropped columns with_dummies = [] elif columns is not None: + # Encoding only cols specified in columns. Get all cols not in + # columns to prepend to result. with_dummies = [data.drop(columns, axis=1)] else: - with_dummies = [data.select_dtypes(exclude=['object', 'category'])] + # Encoding only object and category dtype columns. Get remaining + # columns to prepend to result. + with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): - + # col is (column_name, column), use just column data here dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype) From 9b72a834c2e47b629c118262daeb8afc8fb60921 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 5 May 2018 08:57:47 -0400 Subject: [PATCH 5/5] doc typo --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a46eb6a694745..eb6c212731822 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1359,7 +1359,7 @@ Reshaping - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`) -- Bug in :func: `get_dummies`, :func: `select_dtypes`, where duplicate column names 
caused incorrect behavior (:issue: `20848`) +- Bug in :func:`get_dummies` and :meth:`DataFrame.select_dtypes` where duplicate column names caused incorrect behavior (:issue:`20848`) Other ^^^^^