Skip to content

Commit b32febd

Browse files
authored
[BUG]: Fix ValueError in concat() when at least one Index has duplicates (pandas-dev#36290)
1 parent 437aa8b commit b32febd

File tree

6 files changed

+65
-0
lines changed

6 files changed

+65
-0
lines changed

asv_bench/benchmarks/algorithms.py

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas._libs import lib
66

77
import pandas as pd
8+
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
89

910
from .pandas_vb_common import tm
1011

@@ -174,4 +175,15 @@ def time_argsort(self, N):
174175
self.array.argsort()
175176

176177

178+
class RemoveDuplicates:
    """
    Benchmark for ``make_duplicates_of_left_unique_in_right``.

    Every value in ``left`` and in ``right`` appears exactly twice, and all
    values of ``left`` are also present in ``right``.
    """

    def setup(self):
        # Build the two integer arrays passed to the timed call.
        N = 10 ** 5
        na = np.arange(N // 2)
        # left: the first N / 4 values, repeated once
        self.left = np.concatenate([na[: N // 4], na[: N // 4]])
        # right: all N / 2 values, repeated once
        self.right = np.concatenate([na, na])

    def time_make_duplicates_of_left_unique_in_right(self):
        make_duplicates_of_left_unique_in_right(self.left, self.right)
187+
188+
177189
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,7 @@ Reshaping
711711
- Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
712712
- Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
713713
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
714+
- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the inputs had a non-unique index (:issue:`36263`)
714715

715716
Sparse
716717
^^^^^^

pandas/core/algorithms.py

+21
Original file line numberDiff line numberDiff line change
@@ -2150,3 +2150,24 @@ def _sort_tuples(values: np.ndarray[tuple]):
21502150
arrays, _ = to_arrays(values, None)
21512151
indexer = lexsort_indexer(arrays, orders=True)
21522152
return values[indexer]
2153+
2154+
2155+
def make_duplicates_of_left_unique_in_right(
    left: np.ndarray, right: np.ndarray
) -> np.ndarray:
    """
    Drop repeated occurrences in ``right`` of values that are duplicated in ``left``.

    Every value that appears more than once in ``left`` is reduced to a single
    occurrence in ``right``. All other entries of ``right`` (values that are
    unique in ``left`` or absent from it) are kept as they are, and the
    relative order of the remaining entries is preserved.

    Parameters
    ----------
    left : ndarray
        Values whose duplicated entries decide what gets de-duplicated.
    right : ndarray
        Values to filter.

    Returns
    -------
    ndarray
        ``right`` without the repeated occurrences of values duplicated in ``left``.

    Examples
    --------
    >>> make_duplicates_of_left_unique_in_right(
    ...     np.array([0, 1, 1, 4]), np.array([0, 0, 1, 1, 4])
    ... )
    array([0, 0, 1, 4])
    """
    # Distinct values that occur more than once in ``left``.
    left_duplicates = unique(left[duplicated(left)])
    # Only repeated entries of ``right`` whose value is duplicated in ``left``
    # are dropped; everything else passes through unchanged.
    drop = duplicated(right) & isin(right, left_duplicates)
    return right[~drop]

pandas/core/reshape/concat.py

+8
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
1414
from pandas.core.dtypes.missing import isna
1515

16+
import pandas.core.algorithms as algos
1617
from pandas.core.arrays.categorical import (
1718
factorize_from_iterable,
1819
factorize_from_iterables,
@@ -501,6 +502,13 @@ def get_result(self):
501502
# 1-ax to convert BlockManager axis to DataFrame axis
502503
obj_labels = obj.axes[1 - ax]
503504
if not new_labels.equals(obj_labels):
505+
# We have to remove the duplicates from obj_labels
506+
# in new labels to make them unique, otherwise we would
507+
# duplicate the duplicates again
508+
if not obj_labels.is_unique:
509+
new_labels = algos.make_duplicates_of_left_unique_in_right(
510+
np.asarray(obj_labels), np.asarray(new_labels)
511+
)
504512
indexers[ax] = obj_labels.reindex(new_labels)[1]
505513

506514
mgrs_indexers.append((obj._mgr, indexers))

pandas/tests/reshape/concat/test_dataframe.py

+11
Original file line numberDiff line numberDiff line change
@@ -167,3 +167,14 @@ def test_concat_dataframe_keys_bug(self, sort):
167167
# it works
168168
result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
169169
assert list(result.columns) == [("t1", "value"), ("t2", "value")]
170+
171+
def test_concat_duplicate_indexes(self):
    # GH#36263 ValueError with non unique indexes
    # Both frames carry a duplicated row label (1 in df1, 0 in df2), so the
    # column-wise concat has to align on non-unique indexes.
    df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
    df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
    result = concat([df1, df2], axis=1)
    expected = DataFrame(
        {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
        index=Index([0, 0, 1, 1, 3, 4]),
    )
    tm.assert_frame_equal(result, expected)

pandas/tests/test_algos.py

+12
Original file line numberDiff line numberDiff line change
@@ -2356,3 +2356,15 @@ def test_diff_ea_axis(self):
23562356
msg = "cannot diff DatetimeArray on axis=1"
23572357
with pytest.raises(ValueError, match=msg):
23582358
algos.diff(dta, 1, axis=1)
2359+
2360+
2361+
@pytest.mark.parametrize(
    "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]]
)
def test_make_duplicates_of_left_unique_in_right(left_values):
    # GH#36263
    # 1 is duplicated in every ``left`` variant, so its repeat is removed from
    # ``right``; 0 is unique in ``left``, so both of its copies must survive.
    left = np.array(left_values)
    right = np.array([0, 0, 1, 1, 4])
    result = algos.make_duplicates_of_left_unique_in_right(left, right)
    expected = np.array([0, 0, 1, 4])
    tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)