Skip to content

Commit 2e77536

Browse files
paul-mannino authored and jreback committed
BUG: Fix issue with inserting duplicate columns in a dataframe
closes #14291 closes #14431
1 parent 5cf6d94 commit 2e77536

File tree

4 files changed

+56
-8
lines changed

4 files changed

+56
-8
lines changed

doc/source/whatsnew/v0.19.1.txt

+8
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Bug Fixes
5050

5151

5252
- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`)
53+
- Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`)
5354

5455
- ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`)
5556

@@ -63,4 +64,11 @@ Bug Fixes
6364
- Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
6465
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
6566

67+
68+
69+
70+
71+
72+
73+
6674
- Bug in ``pd.pivot_table`` where it may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`)

pandas/core/frame.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -2487,7 +2487,7 @@ def _set_item(self, key, value):
24872487

24882488
# check if we are modifying a copy
24892489
# try to set first as we want an invalid
2490-
# value exeption to occur first
2490+
# value exception to occur first
24912491
if len(self):
24922492
self._check_setitem_copy()
24932493

@@ -2503,10 +2503,10 @@ def insert(self, loc, column, value, allow_duplicates=False):
25032503
loc : int
25042504
Must have 0 <= loc <= len(columns)
25052505
column : object
2506-
value : int, Series, or array-like
2506+
value : scalar, Series, or array-like
25072507
"""
25082508
self._ensure_valid_index(value)
2509-
value = self._sanitize_column(column, value)
2509+
value = self._sanitize_column(column, value, broadcast=False)
25102510
self._data.insert(loc, column, value,
25112511
allow_duplicates=allow_duplicates)
25122512

@@ -2590,9 +2590,25 @@ def assign(self, **kwargs):
25902590

25912591
return data
25922592

2593-
def _sanitize_column(self, key, value):
2594-
# Need to make sure new columns (which go into the BlockManager as new
2595-
# blocks) are always copied
2593+
def _sanitize_column(self, key, value, broadcast=True):
2594+
"""
2595+
Ensures new columns (which go into the BlockManager as new blocks) are
2596+
always copied and converted into an array.
2597+
2598+
Parameters
2599+
----------
2600+
key : object
2601+
value : scalar, Series, or array-like
2602+
broadcast : bool, default True
2603+
If ``key`` matches multiple duplicate column names in the
2604+
DataFrame, this parameter indicates whether ``value`` should be
2605+
tiled so that the returned array contains a (duplicated) column for
2606+
each occurrence of the key. If False, ``value`` will not be tiled.
2607+
2608+
Returns
2609+
-------
2610+
sanitized_column : numpy-array
2611+
"""
25962612

25972613
def reindexer(value):
25982614
# reindex if necessary
@@ -2665,7 +2681,7 @@ def reindexer(value):
26652681
return value
26662682

26672683
# broadcast across multiple columns if necessary
2668-
if key in self.columns and value.ndim == 1:
2684+
if broadcast and key in self.columns and value.ndim == 1:
26692685
if (not self.columns.is_unique or
26702686
isinstance(self.columns, MultiIndex)):
26712687
existing_piece = self[key]

pandas/sparse/frame.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,21 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
302302
# ----------------------------------------------------------------------
303303
# Support different internal representation of SparseDataFrame
304304

305-
def _sanitize_column(self, key, value):
305+
def _sanitize_column(self, key, value, **kwargs):
306+
"""
307+
Creates a new SparseArray from the input value.
308+
309+
Parameters
310+
----------
311+
key : object
312+
value : scalar, Series, or array-like
313+
kwargs : dict
314+
315+
Returns
316+
-------
317+
sanitized_column : SparseArray
318+
319+
"""
306320
sp_maker = lambda x, index=None: SparseArray(
307321
x, index=index, fill_value=self._default_fill_value,
308322
kind=self._default_kind)

pandas/tests/frame/test_nonunique_indexes.py

+10
Original file line numberDiff line numberDiff line change
@@ -468,3 +468,13 @@ def test_set_value_by_index(self):
468468

469469
df.iloc[:, 0] = 3
470470
assert_series_equal(df.iloc[:, 1], expected)
471+
472+
def test_insert_with_columns_dups(self):
473+
# GH 14291
474+
df = pd.DataFrame()
475+
df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
476+
df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
477+
df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
478+
exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
479+
['c', 'f', 'i']], columns=['A', 'A', 'A'])
480+
assert_frame_equal(df, exp)

0 commit comments

Comments
 (0)