def make_duplicates_of_left_unique_in_right(
    left: np.ndarray, right: np.ndarray
) -> np.ndarray:
    """
    Drop repeated entries from ``right`` for values that repeat in ``left``.

    Every value that occurs more than once in ``left`` ends up exactly once in
    the result, however often it occurred in ``right``. Values that are not
    duplicated in ``left`` are passed through unchanged, even when they repeat
    in ``right``.

    Parameters
    ----------
    left : np.ndarray
        Values whose duplicates decide which entries of ``right`` are dropped.
    right : np.ndarray
        Values to filter.

    Returns
    -------
    np.ndarray
        The entries of ``right`` that survive the filter, selected with a
        boolean mask so their relative order is kept.
    """
    # Distinct values that appear at least twice in ``left``.
    repeated_in_left = unique(left[duplicated(left)])

    is_repeat_in_right = duplicated(right)
    matches_left_repeat = isin(right, repeated_in_left)

    # Keep an entry unless it is BOTH a repeat inside ``right`` and one of the
    # values that ``left`` repeats (De Morgan form of ``~(a & b)``).
    keep = ~is_repeat_in_right | ~matches_left_repeat
    return right[keep]
@pytest.mark.parametrize(
    "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]]
)
def test_make_duplicates_of_left_unique_in_right(left_values):
    """
    GH#36263: values duplicated in left must end up unique in right.

    In every parametrized case ``1`` is duplicated in left, so the second
    ``1`` is dropped from right, while the repeated ``0`` (never duplicated
    in left) is kept.
    """
    right_values = [0, 0, 1, 1, 4]
    deduplicated = algos.make_duplicates_of_left_unique_in_right(
        np.array(left_values), np.array(right_values)
    )
    tm.assert_numpy_array_equal(deduplicated, np.array([0, 0, 1, 4]))