diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 147ff8795eb00..f7bbe6e92ad3f 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -59,3 +59,4 @@ Bug Fixes - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) +- Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`) \ No newline at end of file diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dfe7e90c134fc..05148c1f7e80a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2487,7 +2487,7 @@ def _set_item(self, key, value): # check if we are modifying a copy # try to set first as we want an invalid - # value exeption to occur first + # value exception to occur first if len(self): self._check_setitem_copy() @@ -2503,10 +2503,10 @@ def insert(self, loc, column, value, allow_duplicates=False): loc : int Must have 0 <= loc <= len(columns) column : object - value : int, Series, or array-like + value : scalar, Series, or array-like """ self._ensure_valid_index(value) - value = self._sanitize_column(column, value) + value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) @@ -2590,9 +2590,25 @@ def assign(self, **kwargs): return data - def _sanitize_column(self, key, value): - # Need to make sure new columns (which go into the BlockManager as new - # blocks) are always copied + def _sanitize_column(self, key, value, broadcast=True): + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied and converted into an array. + + Parameters + ---------- + key : object + value : scalar, Series, or array-like + broadcast : bool, default True + If ``key`` matches multiple duplicate column names in the + DataFrame, this parameter indicates whether ``value`` should be + tiled so that the returned array contains a (duplicated) column for + each occurrence of the key. If False, ``value`` will not be tiled. + + Returns + ------- + sanitized_column : numpy-array + """ def reindexer(value): # reindex if necessary @@ -2665,7 +2681,7 @@ def reindexer(value): return value # broadcast across multiple columns if necessary - if key in self.columns and value.ndim == 1: + if broadcast and key in self.columns and value.ndim == 1: if (not self.columns.is_unique or isinstance(self.columns, MultiIndex)): existing_piece = self[key] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 8eeff045d1fac..56020e32b9963 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -302,7 +302,21 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, # ---------------------------------------------------------------------- # Support different internal representation of SparseDataFrame - def _sanitize_column(self, key, value): + def _sanitize_column(self, key, value, **kwargs): + """ + Creates a new SparseArray from the input value. + + Parameters + ---------- + key : object + value : scalar, Series, or array-like + kwargs : dict + + Returns + ------- + sanitized_column : SparseArray + + """ sp_maker = lambda x, index=None: SparseArray( x, index=index, fill_value=self._default_fill_value, kind=self._default_kind) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 77974718714f8..220d29f624942 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -468,3 +468,13 @@ def test_set_value_by_index(self): df.iloc[:, 0] = 3 assert_series_equal(df.iloc[:, 1], expected) + + def test_insert_with_columns_dups(self): + # GH 14291 + df = pd.DataFrame() + df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True) + df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True) + df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) + exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'], + ['c', 'f', 'i']], columns=['A', 'A', 'A']) + assert_frame_equal(df, exp)