From a00f0fe772d29874d0d77a0bd4599e530fb251ef Mon Sep 17 00:00:00 2001 From: paul-mannino Date: Sun, 9 Oct 2016 17:45:34 -0500 Subject: [PATCH] BUG: Fix issue with inserting duplicate columns in a dataframe (#14291) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/frame.py | 18 ++++++++++++------ pandas/sparse/frame.py | 7 ++++++- pandas/tests/frame/test_mutate_columns.py | 9 +++++++++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3edb8c1fa9071..16a933b6ce8ee 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -45,3 +45,4 @@ Bug Fixes - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`) +- Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`) \ No newline at end of file diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1798a35168265..7d33f929352ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2487,7 +2487,7 @@ def _set_item(self, key, value): # check if we are modifying a copy # try to set first as we want an invalid - # value exeption to occur first + # value exception to occur first if len(self): self._check_setitem_copy() @@ -2506,7 +2506,7 @@ def insert(self, loc, column, value, allow_duplicates=False): value : int, Series, or array-like """ self._ensure_valid_index(value) - value = self._sanitize_column(column, value) + value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) @@ -2590,9 +2590,15 @@ def assign(self, **kwargs): return data - def _sanitize_column(self, key, value): - # Need to make sure new columns (which go into the BlockManager as new - # blocks) are always copied + def _sanitize_column(self, key, value, broadcast=True): + """ + Ensures new columns (which go into the BlockManager as new blocks) are + always copied. + + The "broadcast" parameter indicates whether all columns with the given + key should be returned. The default behavior is desirable when + calling this method prior to modifying existing values in a DataFrame. + """ def reindexer(value): # reindex if necessary @@ -2665,7 +2671,7 @@ def reindexer(value): return value # broadcast across multiple columns if necessary - if key in self.columns and value.ndim == 1: + if broadcast and key in self.columns and value.ndim == 1: if (not self.columns.is_unique or isinstance(self.columns, MultiIndex)): existing_piece = self[key] diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 8eeff045d1fac..0ccbf08daf41b 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -302,7 +302,12 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, # ---------------------------------------------------------------------- # Support different internal representation of SparseDataFrame - def _sanitize_column(self, key, value): + def _sanitize_column(self, key, value, broadcast=True): + """ + The "broadcast" parameter was added to match the method signature of + DataFrame._sanitize_column. However, this method does not make use of + broadcasting. + """ sp_maker = lambda x, index=None: SparseArray( x, index=index, fill_value=self._default_fill_value, kind=self._default_kind) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 5beab1565e538..7059ab969587d 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -163,6 +163,15 @@ def test_insert(self): exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) assert_frame_equal(df, exp) + # GH 14291 + df = DataFrame() + df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) + df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) + df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) + exp = DataFrame([['a', 'a', 'a'], ['b', 'b', 'b'], + ['c', 'c', 'c']], columns=['A', 'A', 'A']) + assert_frame_equal(df, exp) + def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame)