Commit ed70cef

Handle duplicates
1 parent 025fb91 commit ed70cef

2 files changed: +42 -6 lines changed


pandas/core/internals/construction.py (+24 -3)

@@ -194,11 +194,19 @@ def init_dict(data, index, columns, dtype=None):
     if not isinstance(columns, Index):
         columns = Index(columns, copy=False)
 
+    # Ugh, columns may not be unique (even though we're in init_dict and
+    # dict keys have to be unique...). We have two possible strategies:
+    # 1.) Gracefully handle duplicates when going through data to build
+    #     new_data.
+    # 2.) Focus only on unique values on a first pass, and insert duplicates
+    #     in the correct positions after the uniques have been handled.
+    # We take option 2.
+
     if not columns.is_unique:
-        # This is silly, but allowed and tested.
-        # Do the check, instead of always calling unique, to preserve
-        # the identity of unique user-provided indexes.
+        columns_with_duplicates = columns.copy()
         columns = columns.unique()
+    else:
+        columns_with_duplicates = None
 
     if data:
         normalized_keys = Index(data.keys(), copy=False)
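
For context, a minimal sketch (not part of the commit) of the Index behaviour this branch leans on: unique() keeps the first occurrence of each label in order of appearance, and duplicated() (used in the second hunk below) flags every later repeat.

import pandas as pd

# Toy labels with repeats, mirroring duplicate entries passed via `columns=`
# even though the data dict's keys are necessarily unique.
columns = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])

print(columns.is_unique)     # False -> take the columns_with_duplicates branch
print(columns.unique())      # Index(['c', 'b', 'a'], dtype='object')
print(columns.duplicated())  # [False False False  True  True  True]
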
@@ -284,6 +292,19 @@ def init_dict(data, index, columns, dtype=None):
         # https://github.com/pandas-dev/pandas/issues/24388 for more.
         dtype = np.dtype("object")
 
+    if columns_with_duplicates is not None:
+        duplicated = columns_with_duplicates.duplicated()
+        duplicate_positions = np.arange(len(duplicated))[duplicated]
+        offset = 0
+
+        for position in duplicate_positions:
+            key = columns_with_duplicates[position]
+            loc = columns.get_loc(key)
+            arrays.insert(position, arrays[loc])
+            offset += 1
+
+        columns = columns_with_duplicates
+
     return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
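
To make the control flow above concrete, here is a small standalone sketch of the "option 2" strategy (not the pandas internals; the variable names are invented for illustration): build one array per unique label on a first pass, then walk the duplicated positions and splice in a reference to the already-built array.

import numpy as np
import pandas as pd

data = {'a': [1], 'b': [2], 'c': [3]}
requested = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])  # may contain repeats

# First pass: one array per unique label, kept in a dict so duplicates can be
# resolved by label. This sketch assumes every requested label exists in data.
unique_cols = requested.unique()
unique_arrays = {key: np.asarray(data[key]) for key in unique_cols}
arrays = [unique_arrays[key] for key in unique_cols]

# Second pass: insert each duplicate at its requested position.
duplicated = requested.duplicated()
for position in np.arange(len(duplicated))[duplicated]:
    arrays.insert(position, unique_arrays[requested[position]])

# arrays is now aligned with requested, duplicates included.
assert [arr[0] for arr in arrays] == [3, 2, 1, 1, 2, 3]

Resolving each duplicate by label rather than by its index in the growing list is a deliberate simplification in this sketch: earlier insertions can shift list positions, and a label lookup sidesteps having to track that offset.
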
pandas/tests/frame/test_constructors.py (+18 -3)

@@ -1402,9 +1402,24 @@ def test_constructor_column_duplicates(self):
                       OrderedDict([('b', 8), ('a', 5), ('a', 6)]))
 
     def test_constructor_column_dict_duplicates(self):
-        result = DataFrame({"A": [1, 2], "B": [3, 4]}, columns=['A', 'B', 'A'])
-        expected = DataFrame({"A": [1, 2], "B": [3, 4]}, columns=['A', 'B'])
-        tm.assert_frame_equal(result, expected)
+        result = DataFrame({}, columns=['A', 'B', 'A']).columns
+        expected = pd.Index(['A', 'B', 'A'])
+        tm.assert_index_equal(result, expected)
+
+    def test_constructor_column_dict_duplicates_data(self):
+        df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]},
+                          columns=['c', 'b', 'a', 'a', 'b', 'c'])
+        # Build the expected values in pieces to avoid constructing an
+        # expected frame that may hit the same code path.
+        columns = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])
+        tm.assert_index_equal(df.columns, columns)
+
+        tm.assert_series_equal(df.iloc[:, 0], pd.Series([3], name='c'))
+        tm.assert_series_equal(df.iloc[:, 1], pd.Series([2], name='b'))
+        tm.assert_series_equal(df.iloc[:, 2], pd.Series([1], name='a'))
+        tm.assert_series_equal(df.iloc[:, 3], pd.Series([1], name='a'))
+        tm.assert_series_equal(df.iloc[:, 4], pd.Series([2], name='b'))
+        tm.assert_series_equal(df.iloc[:, 5], pd.Series([3], name='c'))
 
     def test_constructor_empty_with_string_dtype(self):
         # GH 9428
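
As a usage note, this is the behaviour the tests above pin down, assuming a build with this commit applied (without it, constructing a frame from a dict with duplicated `columns=` labels is not guaranteed to work): each repeated label reuses the data of its key.

import pandas as pd

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]},
                  columns=['c', 'b', 'a', 'a', 'b', 'c'])

print(list(df.columns))     # ['c', 'b', 'a', 'a', 'b', 'c']
print(df.iloc[0].tolist())  # [3, 2, 1, 1, 2, 3]
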
