Commit ed70cef

Handle duplicates
1 parent 025fb91 commit ed70cef

2 files changed: +42 -6 lines changed


pandas/core/internals/construction.py (+24 -3)

@@ -194,11 +194,19 @@ def init_dict(data, index, columns, dtype=None):
     if not isinstance(columns, Index):
         columns = Index(columns, copy=False)
 
+    # Ugh, columns may not be unique (even though we're in init_dict and
+    # dict keys have to be unique...). We have two possible strategies:
+    # 1.) Gracefully handle duplicates when going through data to build
+    #     new_data.
+    # 2.) Focus only on unique values on a first pass, and insert duplicates
+    #     in the correct positions after the uniques have been handled.
+    # We take option 2.
+
     if not columns.is_unique:
-        # This is silly, but allowed and tested.
-        # Do the check, instead of always calling unique, to preserve
-        # the identity of unique user-provided indexes.
+        columns_with_duplicates = columns.copy()
         columns = columns.unique()
+    else:
+        columns_with_duplicates = None
 
     if data:
         normalized_keys = Index(data.keys(), copy=False)
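
For context, a minimal sketch (not part of the commit) of the Index behaviour this branch leans on: unique() keeps the first occurrence of each label in order of appearance, and duplicated() (used in the second hunk below) flags every later repeat.

import pandas as pd

# Toy labels with repeats, mirroring duplicate entries passed via `columns=`
# even though the data dict's keys are necessarily unique.
columns = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])

print(columns.is_unique)     # False -> take the columns_with_duplicates branch
print(columns.unique())      # Index(['c', 'b', 'a'], dtype='object')
print(columns.duplicated())  # [False False False  True  True  True]
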
@@ -284,6 +292,19 @@ def init_dict(data, index, columns, dtype=None):
         # https://github.com/pandas-dev/pandas/issues/24388 for more.
         dtype = np.dtype("object")
 
+    if columns_with_duplicates is not None:
+        duplicated = columns_with_duplicates.duplicated()
+        duplicate_positions = np.arange(len(duplicated))[duplicated]
+        offset = 0
+
+        for position in duplicate_positions:
+            key = columns_with_duplicates[position]
+            loc = columns.get_loc(key)
+            arrays.insert(position, arrays[loc])
+            offset += 1
+
+        columns = columns_with_duplicates
+
     return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
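
To make the control flow above concrete, here is a small standalone sketch of the "option 2" strategy (not the pandas internals; the variable names are invented for illustration): build one array per unique label on a first pass, then walk the duplicated positions and splice in a reference to the already-built array.

import numpy as np
import pandas as pd

data = {'a': [1], 'b': [2], 'c': [3]}
requested = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])  # may contain repeats

# First pass: one array per unique label, kept in a dict so duplicates can be
# resolved by label. This sketch assumes every requested label exists in data.
unique_cols = requested.unique()
unique_arrays = {key: np.asarray(data[key]) for key in unique_cols}
arrays = [unique_arrays[key] for key in unique_cols]

# Second pass: insert each duplicate at its requested position.
duplicated = requested.duplicated()
for position in np.arange(len(duplicated))[duplicated]:
    arrays.insert(position, unique_arrays[requested[position]])

# arrays is now aligned with requested, duplicates included.
assert [arr[0] for arr in arrays] == [3, 2, 1, 1, 2, 3]

Resolving each duplicate by label rather than by its index in the growing list is a deliberate simplification in this sketch: earlier insertions can shift list positions, and a label lookup sidesteps having to track that offset.
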
pandas/tests/frame/test_constructors.py (+18 -3)

@@ -1402,9 +1402,24 @@ def test_constructor_column_duplicates(self):
                       OrderedDict([('b', 8), ('a', 5), ('a', 6)]))
 
     def test_constructor_column_dict_duplicates(self):
-        result = DataFrame({"A": [1, 2], "B": [3, 4]}, columns=['A', 'B', 'A'])
-        expected = DataFrame({"A": [1, 2], "B": [3, 4]}, columns=['A', 'B'])
-        tm.assert_frame_equal(result, expected)
+        result = DataFrame({}, columns=['A', 'B', 'A']).columns
+        expected = pd.Index(['A', 'B', 'A'])
+        tm.assert_index_equal(result, expected)
+
+    def test_constructor_column_dict_duplicates_data(self):
+        df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]},
+                          columns=['c', 'b', 'a', 'a', 'b', 'c'])
+        # Build the expected values in pieces to avoid constructing an
+        # expected frame that may hit the same code path.
+        columns = pd.Index(['c', 'b', 'a', 'a', 'b', 'c'])
+        tm.assert_index_equal(df.columns, columns)
+
+        tm.assert_series_equal(df.iloc[:, 0], pd.Series([3], name='c'))
+        tm.assert_series_equal(df.iloc[:, 1], pd.Series([2], name='b'))
+        tm.assert_series_equal(df.iloc[:, 2], pd.Series([1], name='a'))
+        tm.assert_series_equal(df.iloc[:, 3], pd.Series([1], name='a'))
+        tm.assert_series_equal(df.iloc[:, 4], pd.Series([2], name='b'))
+        tm.assert_series_equal(df.iloc[:, 5], pd.Series([3], name='c'))
 
     def test_constructor_empty_with_string_dtype(self):
         # GH 9428
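
As a usage note, this is the behaviour the tests above pin down, assuming a build with this commit applied (without it, constructing a frame from a dict with duplicated `columns=` labels is not guaranteed to work): each repeated label reuses the data of its key.

import pandas as pd

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]},
                  columns=['c', 'b', 'a', 'a', 'b', 'c'])

print(list(df.columns))     # ['c', 'b', 'a', 'a', 'b', 'c']
print(df.iloc[0].tolist())  # [3, 2, 1, 1, 2, 3]
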
