BUG: Fix issue with inserting duplicate columns in a dataframe

paul-mannino · jreback · commit 2e77536bdf90 · 2016-10-24T18:48:33.000-04:00
closes #14291 closes #14431
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -50,6 +50,7 @@ Bug Fixes
 
 
 - Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`)
+- Bug in ``DataFrame.insert`` where repeated calls inserting an already-existing column name with ``allow_duplicates=True`` could fail (:issue:`14291`)
 
 - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`)
 
@@ -63,4 +64,11 @@ Bug Fixes
 - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
 - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
 
+
+
+
+
+
+
+
 - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2487,7 +2487,7 @@ def _set_item(self, key, value):
 
         # check if we are modifying a copy
         # try to set first as we want an invalid
-        # value exeption to occur first
+        # value exception to occur first
         if len(self):
             self._check_setitem_copy()
 
@@ -2503,10 +2503,10 @@ def insert(self, loc, column, value, allow_duplicates=False):
         loc : int
             Must have 0 <= loc <= len(columns)
         column : object
-        value : int, Series, or array-like
+        value : scalar, Series, or array-like
         """
         self._ensure_valid_index(value)
-        value = self._sanitize_column(column, value)
+        value = self._sanitize_column(column, value, broadcast=False)
         self._data.insert(loc, column, value,
                           allow_duplicates=allow_duplicates)
 
@@ -2590,9 +2590,25 @@ def assign(self, **kwargs):
 
         return data
 
-    def _sanitize_column(self, key, value):
-        # Need to make sure new columns (which go into the BlockManager as new
-        # blocks) are always copied
+    def _sanitize_column(self, key, value, broadcast=True):
+        """
+        Ensures new columns (which go into the BlockManager as new blocks) are
+        always copied and converted into an array.
+
+        Parameters
+        ----------
+        key : object
+        value : scalar, Series, or array-like
+        broadcast : bool, default True
+            If ``key`` matches multiple duplicate column names in the
+            DataFrame, this parameter indicates whether ``value`` should be
+            tiled so that the returned array contains a (duplicated) column for
+            each occurrence of the key. If False, ``value`` will not be tiled.
+
+        Returns
+        -------
+        sanitized_column : numpy.ndarray
+        """
 
         def reindexer(value):
             # reindex if necessary
@@ -2665,7 +2681,7 @@ def reindexer(value):
             return value
 
         # broadcast across multiple columns if necessary
-        if key in self.columns and value.ndim == 1:
+        if broadcast and key in self.columns and value.ndim == 1:
             if (not self.columns.is_unique or
                     isinstance(self.columns, MultiIndex)):
                 existing_piece = self[key]
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -302,7 +302,21 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
     # ----------------------------------------------------------------------
     # Support different internal representation of SparseDataFrame
 
-    def _sanitize_column(self, key, value):
+    def _sanitize_column(self, key, value, **kwargs):
+        """
+        Creates a new SparseArray from the input value.
+
+        Parameters
+        ----------
+        key : object
+        value : scalar, Series, or array-like
+        kwargs : dict
+
+        Returns
+        -------
+        sanitized_column : SparseArray
+
+        """
         sp_maker = lambda x, index=None: SparseArray(
             x, index=index, fill_value=self._default_fill_value,
             kind=self._default_kind)
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
@@ -468,3 +468,13 @@ def test_set_value_by_index(self):
 
         df.iloc[:, 0] = 3
         assert_series_equal(df.iloc[:, 1], expected)
+
+    def test_insert_with_columns_dups(self):
+        # GH 14291
+        df = pd.DataFrame()
+        df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
+        df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
+        df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
+        exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
+                            ['c', 'f', 'i']], columns=['A', 'A', 'A'])
+        assert_frame_equal(df, exp)