[BUG]: Fix ValueError in concat() when at least one Index has duplicates (pandas-dev#36290)

phofl · web-flow · commit b32febd5954f · 2020-11-19T14:01:08.000-05:00
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -5,6 +5,7 @@
 from pandas._libs import lib
 
 import pandas as pd
+from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
 
 from .pandas_vb_common import tm
 
@@ -174,4 +175,15 @@ def time_argsort(self, N):
         self.array.argsort()
 
 
+class RemoveDuplicates:
+    def setup(self):
+        N = 10 ** 5
+        na = np.arange(int(N / 2))
+        self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
+        self.right = np.concatenate([na, na])
+
+    def time_make_duplicates_of_left_unique_in_right(self):
+        make_duplicates_of_left_unique_in_right(self.left, self.right)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -711,6 +711,7 @@ Reshaping
 - Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
 - Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
 - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
+- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the inputs had a non-unique index (:issue:`36263`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -2150,3 +2150,24 @@ def _sort_tuples(values: np.ndarray[tuple]):
     arrays, _ = to_arrays(values, None)
     indexer = lexsort_indexer(arrays, orders=True)
     return values[indexer]
+
+
+def make_duplicates_of_left_unique_in_right(
+    left: np.ndarray, right: np.ndarray
+) -> np.ndarray:
+    """
+    If left has duplicates that are also duplicated in right, the repeated
+    occurrences of those values are dropped from right, meaning that every value
+    duplicated in left exists only once in right.
+
+    Parameters
+    ----------
+    left: ndarray
+    right: ndarray
+
+    Returns
+    -------
+    Duplicates of left are unique in right
+    """
+    left_duplicates = unique(left[duplicated(left)])
+    return right[~(duplicated(right) & isin(right, left_duplicates))]
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -13,6 +13,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.missing import isna
 
+import pandas.core.algorithms as algos
 from pandas.core.arrays.categorical import (
     factorize_from_iterable,
     factorize_from_iterables,
@@ -501,6 +502,13 @@ def get_result(self):
                     # 1-ax to convert BlockManager axis to DataFrame axis
                     obj_labels = obj.axes[1 - ax]
                     if not new_labels.equals(obj_labels):
+                        # We have to remove the duplicates of obj_labels from
+                        # new_labels to make them unique there, otherwise we
+                        # would duplicate our duplicates again
+                        if not obj_labels.is_unique:
+                            new_labels = algos.make_duplicates_of_left_unique_in_right(
+                                np.asarray(obj_labels), np.asarray(new_labels)
+                            )
                         indexers[ax] = obj_labels.reindex(new_labels)[1]
 
                 mgrs_indexers.append((obj._mgr, indexers))
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
@@ -167,3 +167,14 @@ def test_concat_dataframe_keys_bug(self, sort):
         # it works
         result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
         assert list(result.columns) == [("t1", "value"), ("t2", "value")]
+
+    def test_concat_duplicate_indexes(self):
+        # GH#36263 ValueError with non unique indexes
+        df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
+        df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
+        result = concat([df1, df2], axis=1)
+        expected = DataFrame(
+            {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
+            index=Index([0, 0, 1, 1, 3, 4]),
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -2356,3 +2356,15 @@ def test_diff_ea_axis(self):
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):
             algos.diff(dta, 1, axis=1)
+
+
+@pytest.mark.parametrize(
+    "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]]
+)
+def test_make_duplicates_of_left_unique_in_right(left_values):
+    # GH#36263
+    left = np.array(left_values)
+    right = np.array([0, 0, 1, 1, 4])
+    result = algos.make_duplicates_of_left_unique_in_right(left, right)
+    expected = np.array([0, 0, 1, 4])
+    tm.assert_numpy_array_equal(result, expected)