From 111184760f9d910b607d261285c27c2d4b162c52 Mon Sep 17 00:00:00 2001
From: Kunal Gosar <gosarkunal9@gmail.com>
Date: Thu, 26 Apr 2018 22:02:16 -0700
Subject: [PATCH 1/5] fix for duplicate cols in select_dtypes and get_dummies

---
 pandas/core/frame.py           | 12 ++++++------
 pandas/core/reshape/reshape.py | 28 ++++++++++++++++------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ffb124af4f5fc..ffb2ad046158f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3076,15 +3076,15 @@ def select_dtypes(self, include=None, exclude=None):
         include_these = Series(not bool(include), index=self.columns)
         exclude_these = Series(not bool(exclude), index=self.columns)
 
-        def is_dtype_instance_mapper(column, dtype):
-            return column, functools.partial(issubclass, dtype.type)
+        def is_dtype_instance_mapper(idx, dtype):
+            return idx, functools.partial(issubclass, dtype.type)
 
-        for column, f in itertools.starmap(is_dtype_instance_mapper,
-                                           self.dtypes.iteritems()):
+        for idx, f in itertools.starmap(is_dtype_instance_mapper,
+                                        enumerate(self.dtypes)):
             if include:  # checks for the case of empty include or exclude
-                include_these[column] = any(map(f, include))
+                include_these.iloc[idx] = any(map(f, include))
             if exclude:
-                exclude_these[column] = not any(map(f, exclude))
+                exclude_these.iloc[idx] = not any(map(f, exclude))
 
         dtype_indexer = include_these & exclude_these
         return self.loc[com._get_info_slice(self, dtype_indexer)]
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 389f1af48434a..279691eed63af 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -826,9 +826,9 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
 
         if columns is None:
             columns_to_encode = data.select_dtypes(
-                include=['object', 'category']).columns
+                include=['object', 'category'])
         else:
-            columns_to_encode = columns
+            columns_to_encode = data[columns]
 
         # validate prefixes and separator to avoid silently dropping cols
         def check_len(item, name):
@@ -836,9 +836,10 @@ def check_len(item, name):
                        "length of the columns being encoded ({len_enc}).")
 
             if is_list_like(item):
-                if not len(item) == len(columns_to_encode):
-                    len_msg = len_msg.format(name=name, len_item=len(item),
-                                             len_enc=len(columns_to_encode))
+                if not len(item) == columns_to_encode.shape[1]:
+                    len_msg = \
+                        len_msg.format(name=name, len_item=len(item),
+                                       len_enc=columns_to_encode.shape[1])
                     raise ValueError(len_msg)
 
         check_len(prefix, 'prefix')
@@ -846,25 +847,28 @@ def check_len(item, name):
         if isinstance(prefix, compat.string_types):
             prefix = cycle([prefix])
         if isinstance(prefix, dict):
-            prefix = [prefix[col] for col in columns_to_encode]
+            prefix = [prefix[col] for col in columns_to_encode.columns]
 
         if prefix is None:
-            prefix = columns_to_encode
+            prefix = columns_to_encode.columns
 
         # validate separators
         if isinstance(prefix_sep, compat.string_types):
             prefix_sep = cycle([prefix_sep])
         elif isinstance(prefix_sep, dict):
-            prefix_sep = [prefix_sep[col] for col in columns_to_encode]
+            prefix_sep = [prefix_sep[col] for col in columns_to_encode.columns]
 
-        if set(columns_to_encode) == set(data.columns):
+        if columns_to_encode.shape == data.shape:
             with_dummies = []
+        elif columns is not None:
+            with_dummies = [data.drop(columns, axis=1)]
         else:
-            with_dummies = [data.drop(columns_to_encode, axis=1)]
+            with_dummies = [data.select_dtypes(exclude=['object', 'category'])]
 
-        for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
+        for (col, pre, sep) in zip(columns_to_encode.iteritems(), prefix,
+                                   prefix_sep):
 
-            dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
+            dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
                                     dummy_na=dummy_na, sparse=sparse,
                                     drop_first=drop_first, dtype=dtype)
             with_dummies.append(dummy)

From e97398c42d41b5be73ae1cd8f1491cc3b8dc9bfb Mon Sep 17 00:00:00 2001
From: Kunal Gosar <gosarkunal9@gmail.com>
Date: Fri, 27 Apr 2018 00:05:12 -0700
Subject: [PATCH 2/5] implement tests to check duplicate cols

---
 pandas/tests/frame/test_dtypes.py    | 15 +++++++++++++++
 pandas/tests/reshape/test_reshape.py | 14 ++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 152159965036d..13ca649dcec84 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -287,6 +287,21 @@ def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
         ei = df[['b', 'c', 'f', 'k']]
         assert_frame_equal(ri, ei)
 
+    def test_select_dtypes_duplicate_columns(self):
+        df = DataFrame({'a': list('abc'),
+                        'b': list(range(1, 4)),
+                        'c': np.arange(3, 6).astype('u1'),
+                        'd': np.arange(4.0, 7.0, dtype='float64'),
+                        'e': [True, False, True],
+                        'f': pd.date_range('now', periods=3).values})
+        df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
+
+        e = DataFrame({'a': list(range(1, 4)),
+                       'b': np.arange(3, 6).astype('u1')})
+
+        r = df.select_dtypes(include=[np.number], exclude=['floating'])
+        assert_frame_equal(r, e)
+
     def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
         df = DataFrame({'a': list('abc'),
                         'b': list(range(1, 4)),
diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
index c4d925b83585b..fbade33454bf1 100644
--- a/pandas/tests/reshape/test_reshape.py
+++ b/pandas/tests/reshape/test_reshape.py
@@ -465,6 +465,20 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse):
 
         tm.assert_frame_equal(df[['GDP']], df2)
 
+    def test_get_dummies_duplicate_columns(self, df):
+        df.columns = ["A", "A", "A"]
+        result = get_dummies(df).sort_index(axis=1)
+
+        expected = DataFrame([[1, 1, 0, 1, 0],
+                              [2, 0, 1, 1, 0],
+                              [3, 1, 0, 0, 1]],
+                             columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
+                             dtype=np.uint8).sort_index(axis=1)
+
+        expected = expected.astype({"A": np.int64})
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestCategoricalReshape(object):
 

From 3b8086a26110edd0bff3be57918f21363abcca9f Mon Sep 17 00:00:00 2001
From: Kunal Gosar <gosarkunal9@gmail.com>
Date: Fri, 27 Apr 2018 10:09:10 -0700
Subject: [PATCH 3/5] address review comments: rename columns_to_encode to data_to_encode, tidy tests

---
 pandas/core/reshape/reshape.py       | 18 +++++++++---------
 pandas/tests/frame/test_dtypes.py    | 22 ++++++++++++----------
 pandas/tests/reshape/test_reshape.py |  1 +
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 279691eed63af..928a08e03c5b5 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -825,10 +825,10 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
         # determine columns being encoded
 
         if columns is None:
-            columns_to_encode = data.select_dtypes(
+            data_to_encode = data.select_dtypes(
                 include=['object', 'category'])
         else:
-            columns_to_encode = data[columns]
+            data_to_encode = data[columns]
 
         # validate prefixes and separator to avoid silently dropping cols
         def check_len(item, name):
@@ -836,10 +836,10 @@ def check_len(item, name):
                        "length of the columns being encoded ({len_enc}).")
 
             if is_list_like(item):
-                if not len(item) == columns_to_encode.shape[1]:
+                if not len(item) == data_to_encode.shape[1]:
                     len_msg = \
                         len_msg.format(name=name, len_item=len(item),
-                                       len_enc=columns_to_encode.shape[1])
+                                       len_enc=data_to_encode.shape[1])
                     raise ValueError(len_msg)
 
         check_len(prefix, 'prefix')
@@ -847,25 +847,25 @@ def check_len(item, name):
         if isinstance(prefix, compat.string_types):
             prefix = cycle([prefix])
         if isinstance(prefix, dict):
-            prefix = [prefix[col] for col in columns_to_encode.columns]
+            prefix = [prefix[col] for col in data_to_encode.columns]
 
         if prefix is None:
-            prefix = columns_to_encode.columns
+            prefix = data_to_encode.columns
 
         # validate separators
         if isinstance(prefix_sep, compat.string_types):
             prefix_sep = cycle([prefix_sep])
         elif isinstance(prefix_sep, dict):
-            prefix_sep = [prefix_sep[col] for col in columns_to_encode.columns]
+            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
 
-        if columns_to_encode.shape == data.shape:
+        if data_to_encode.shape == data.shape:
             with_dummies = []
         elif columns is not None:
             with_dummies = [data.drop(columns, axis=1)]
         else:
             with_dummies = [data.select_dtypes(exclude=['object', 'category'])]
 
-        for (col, pre, sep) in zip(columns_to_encode.iteritems(), prefix,
+        for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
                                    prefix_sep):
 
             dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 13ca649dcec84..4c9f8c2ea0980 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -288,19 +288,21 @@ def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
         assert_frame_equal(ri, ei)
 
     def test_select_dtypes_duplicate_columns(self):
-        df = DataFrame({'a': list('abc'),
-                        'b': list(range(1, 4)),
-                        'c': np.arange(3, 6).astype('u1'),
-                        'd': np.arange(4.0, 7.0, dtype='float64'),
-                        'e': [True, False, True],
-                        'f': pd.date_range('now', periods=3).values})
+        # GH20839
+        odict = compat.OrderedDict
+        df = DataFrame(odict([('a', list('abc')),
+                              ('b', list(range(1, 4))),
+                              ('c', np.arange(3, 6).astype('u1')),
+                              ('d', np.arange(4.0, 7.0, dtype='float64')),
+                              ('e', [True, False, True]),
+                              ('f', pd.date_range('now', periods=3).values)]))
         df.columns = ['a', 'a', 'b', 'b', 'b', 'c']
 
-        e = DataFrame({'a': list(range(1, 4)),
-                       'b': np.arange(3, 6).astype('u1')})
+        expected = DataFrame({'a': list(range(1, 4)),
+                              'b': np.arange(3, 6).astype('u1')})
 
-        r = df.select_dtypes(include=[np.number], exclude=['floating'])
-        assert_frame_equal(r, e)
+        result = df.select_dtypes(include=[np.number], exclude=['floating'])
+        assert_frame_equal(result, expected)
 
     def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
         df = DataFrame({'a': list('abc'),
diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
index fbade33454bf1..295801f3e8def 100644
--- a/pandas/tests/reshape/test_reshape.py
+++ b/pandas/tests/reshape/test_reshape.py
@@ -466,6 +466,7 @@ def test_get_dummies_dont_sparsify_all_columns(self, sparse):
         tm.assert_frame_equal(df[['GDP']], df2)
 
     def test_get_dummies_duplicate_columns(self, df):
+        # GH20839
         df.columns = ["A", "A", "A"]
         result = get_dummies(df).sort_index(axis=1)
 

From c2d3cae732aa3c1ce5efceedaafb004008514315 Mon Sep 17 00:00:00 2001
From: Kunal Gosar <gosarkunal9@gmail.com>
Date: Fri, 4 May 2018 14:34:33 -0700
Subject: [PATCH 4/5] resolve comments and add whatsnew entry

---
 doc/source/whatsnew/v0.23.0.txt |  1 +
 pandas/core/reshape/reshape.py  | 15 +++++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 979fbb5ddfdd0..0fc32f025f604 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1356,6 +1356,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
+- Bug in :func: `get_dummies`, :func: `select_dtypes`, where duplicate column names caused incorrect behavior (:issue: `20848`)
 
 Other
 ^^^^^
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 928a08e03c5b5..0829aa8f5a509 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -821,12 +821,13 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
     from pandas.core.reshape.concat import concat
     from itertools import cycle
 
+    dtypes_to_encode = ['object', 'category']
+
     if isinstance(data, DataFrame):
         # determine columns being encoded
-
         if columns is None:
             data_to_encode = data.select_dtypes(
-                include=['object', 'category'])
+                include=dtypes_to_encode)
         else:
             data_to_encode = data[columns]
 
@@ -844,6 +845,7 @@ def check_len(item, name):
 
         check_len(prefix, 'prefix')
         check_len(prefix_sep, 'prefix_sep')
+
         if isinstance(prefix, compat.string_types):
             prefix = cycle([prefix])
         if isinstance(prefix, dict):
@@ -859,15 +861,20 @@ def check_len(item, name):
             prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
 
         if data_to_encode.shape == data.shape:
+            # Encoding the entire df, do not prepend any dropped columns
             with_dummies = []
         elif columns is not None:
+            # Encoding only cols specified in columns. Get all cols not in
+            # columns to prepend to result.
             with_dummies = [data.drop(columns, axis=1)]
         else:
-            with_dummies = [data.select_dtypes(exclude=['object', 'category'])]
+            # Encoding only object and category dtype columns. Get remaining
+            # columns to prepend to result.
+            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
 
         for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
                                    prefix_sep):
-
+            # col is (column_name, column), use just column data here
             dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
                                     dummy_na=dummy_na, sparse=sparse,
                                     drop_first=drop_first, dtype=dtype)

From 9b72a834c2e47b629c118262daeb8afc8fb60921 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Sat, 5 May 2018 08:57:47 -0400
Subject: [PATCH 5/5] doc typo

---
 doc/source/whatsnew/v0.23.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index a46eb6a694745..eb6c212731822 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1359,7 +1359,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
-- Bug in :func: `get_dummies`, :func: `select_dtypes`, where duplicate column names caused incorrect behavior (:issue: `20848`)
+- Bug in :func:`get_dummies` and :meth:`DataFrame.select_dtypes` where duplicate column names caused incorrect behavior (:issue:`20848`)
 
 Other
 ^^^^^