From 6105102bdc701f7665f60059243898d47ff7d73f Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 11 Sep 2020 16:45:21 +0200
Subject: [PATCH 01/11] Fix crash in concat if non unique index

---
 doc/source/whatsnew/v1.2.0.rst      |  1 +
 pandas/core/reshape/concat.py       | 14 +++++++++++++-
 pandas/tests/reshape/test_concat.py | 12 ++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 2aac2596c18cb..0411cacda845f 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -322,6 +322,7 @@ Reshaping
 - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
 - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`)
 - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
+- Bug in :func:`concat` resulted in a ``ValueError`` when at least one of both inputs had a non unique index (:issue:`36263`)
 -
 
 Sparse
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 9b94dae8556f6..6406afd718579 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -496,7 +496,19 @@ def get_result(self):
                     # 1-ax to convert BlockManager axis to DataFrame axis
                     obj_labels = obj.axes[1 - ax]
                     if not new_labels.equals(obj_labels):
-                        indexers[ax] = obj_labels.reindex(new_labels)[1]
+                        # We have to remove the duplicates from obj_labels
+                        # in new labels to make them unique, otherwise we would
+                        # duplicate or duplicates again
+                        obj_labels_duplicates = obj_labels[
+                            obj_labels.duplicated()
+                        ].unique()
+                        new_labels_cleared = new_labels[
+                            ~(
+                                new_labels.duplicated()
+                                & new_labels.isin(obj_labels_duplicates)
+                            )
+                        ]
+                        indexers[ax] = obj_labels.reindex(new_labels_cleared)[1]
 
                 mgrs_indexers.append((obj._mgr, indexers))
 
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 90705f827af25..9d5441e766483 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -2918,3 +2918,15 @@ def test_concat_frame_axis0_extension_dtypes():
     result = pd.concat([df2, df1], ignore_index=True)
     expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_concat_duplicate_indexes():
+    # GH 36263 ValueError with non unique indexes
+    df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
+    df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
+    result = pd.concat([df1, df2], axis=1)
+    expected = pd.DataFrame(
+        {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
+        index=pd.Index([0, 0, 1, 1, 3, 4]),
+    )
+    tm.assert_frame_equal(result, expected)

From b4c3b771673ea14663666cb12ef45404c60861e9 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sun, 4 Oct 2020 21:01:02 +0200
Subject: [PATCH 02/11] Creta function in algos

---
 pandas/core/algorithms.py     | 24 ++++++++++++++++++++++++
 pandas/core/reshape/concat.py | 11 ++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 2ce3f2d9a7bfa..f8fecd6f45969 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2112,3 +2112,27 @@ def sort_mixed(values):
         np.putmask(new_codes, mask, na_sentinel)
 
     return ordered, ensure_platform_int(new_codes)
+
+
+def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray:
+    """
+    Drops all duplicates values from left in right, so that they are
+    unique in right.
+
+    Parameters
+    ----------
+    left: Index
+    right: Index
+
+    Returns
+    -------
+    Duplicates of left unique in right
+    """
+    left_duplicates = unique(left[duplicated(left)])
+    right_cleared = right[
+        ~(
+                duplicated(right)
+                & np.isin(right, left_duplicates)
+        )
+    ]
+    return right_cleared
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 3b19957ed2f60..2cd8babcf535f 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -13,6 +13,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.missing import isna
 
+import pandas.core.algorithms as algos
 from pandas.core.arrays.categorical import (
     factorize_from_iterable,
     factorize_from_iterables,
@@ -498,15 +499,7 @@ def get_result(self):
                         # We have to remove the duplicates from obj_labels
                         # in new labels to make them unique, otherwise we would
                         # duplicate or duplicates again
-                        obj_labels_duplicates = obj_labels[
-                            obj_labels.duplicated()
-                        ].unique()
-                        new_labels_cleared = new_labels[
-                            ~(
-                                new_labels.duplicated()
-                                & new_labels.isin(obj_labels_duplicates)
-                            )
-                        ]
+                        new_labels_cleared = algos.make_duplicates_of_left_unique_in_right(obj_labels.values, new_labels.values)
                         indexers[ax] = obj_labels.reindex(new_labels_cleared)[1]
 
                 mgrs_indexers.append((obj._mgr, indexers))

From 7129ced7808da2c8e12e39655d9418506fe8f127 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sun, 4 Oct 2020 22:21:53 +0200
Subject: [PATCH 03/11] Add benchmark

---
 asv_bench/benchmarks/algorithms.py | 12 ++++++++++++
 pandas/core/algorithms.py          | 14 ++++----------
 pandas/core/reshape/concat.py      |  7 +++++--
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 65e52e03c43c7..03480ae198345 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,6 +5,7 @@
 from pandas._libs import lib
 
 import pandas as pd
+from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
 
 from .pandas_vb_common import tm
 
@@ -174,4 +175,15 @@ def time_argsort(self, N):
         self.array.argsort()
 
 
+class RemoveDuplicates:
+    def setup(self):
+        N = 10 ** 5
+        na = np.arange(int(N / 2))
+        self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
+        self.right = np.concatenate([na, na])
+
+    def time_make_duplicates_of_left_unique_in_right(self):
+        make_duplicates_of_left_unique_in_right(self.left, self.right)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 94bebc1a81f5e..ccdf9852fb2ed 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2124,18 +2124,12 @@ def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray:
 
     Parameters
     ----------
-    left: Index
-    right: Index
+    left: ndarray
+    right: ndarray
 
     Returns
     -------
-    Duplicates of left unique in right
+    Duplicates of left are unique in right
     """
     left_duplicates = unique(left[duplicated(left)])
-    right_cleared = right[
-        ~(
-                duplicated(right)
-                & np.isin(right, left_duplicates)
-        )
-    ]
-    return right_cleared
+    return right[~(duplicated(right) & np.isin(right, left_duplicates))]
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 9e937fe02c813..5daad1c728e22 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -500,8 +500,11 @@ def get_result(self):
                         # We have to remove the duplicates from obj_labels
                         # in new labels to make them unique, otherwise we would
                         # duplicate or duplicates again
-                        new_labels_cleared = algos.make_duplicates_of_left_unique_in_right(obj_labels.values, new_labels.values)
-                        indexers[ax] = obj_labels.reindex(new_labels_cleared)[1]
+                        if not obj_labels.is_unique:
+                            new_labels = algos.make_duplicates_of_left_unique_in_right(
+                                obj_labels.values, new_labels.values
+                            )
+                        indexers[ax] = obj_labels.reindex(new_labels)[1]
 
                 mgrs_indexers.append((obj._mgr, indexers))
 

From 9cce3ff7f6ad4b30ba265e99dcada78476bf13f8 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 13 Nov 2020 00:58:46 +0100
Subject: [PATCH 04/11] Add test in different file

---
 pandas/tests/reshape/concat/test_concat.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index a1351ce782669..75d7d9b65aefb 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -556,3 +556,15 @@ def test_concat_preserves_extension_int64_dtype():
     result = pd.concat([df_a, df_b], ignore_index=True)
     expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_concat_duplicate_indexes():
+    # GH#36263 ValueError with non unique indexes
+    df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
+    df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
+    result = pd.concat([df1, df2], axis=1)
+    expected = pd.DataFrame(
+        {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
+        index=pd.Index([0, 0, 1, 1, 3, 4]),
+    )
+    tm.assert_frame_equal(result, expected)

From d0c4ea5e098e2b6d42bb56604b38f3be9ab9749b Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 13 Nov 2020 01:00:59 +0100
Subject: [PATCH 05/11] Fix pattern

---
 pandas/tests/reshape/concat/test_concat.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 75d7d9b65aefb..b0f495a404a80 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -560,11 +560,11 @@ def test_concat_preserves_extension_int64_dtype():
 
 def test_concat_duplicate_indexes():
     # GH#36263 ValueError with non unique indexes
-    df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
-    df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
-    result = pd.concat([df1, df2], axis=1)
-    expected = pd.DataFrame(
+    df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
+    df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
+    result = concat([df1, df2], axis=1)
+    expected = DataFrame(
         {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
-        index=pd.Index([0, 0, 1, 1, 3, 4]),
+        index=Index([0, 0, 1, 1, 3, 4]),
     )
     tm.assert_frame_equal(result, expected)

From a96b2626a23fe0b635ff7a5598e94303b832415e Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 13 Nov 2020 20:29:33 +0100
Subject: [PATCH 06/11] Use asarray

---
 pandas/core/reshape/concat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 5b14158748d04..ee54b06f5bceb 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -507,7 +507,7 @@ def get_result(self):
                         # duplicate or duplicates again
                         if not obj_labels.is_unique:
                             new_labels = algos.make_duplicates_of_left_unique_in_right(
-                                obj_labels.values, new_labels.values
+                                np.asarray(obj_labels), np.asarray(new_labels)
                             )
                         indexers[ax] = obj_labels.reindex(new_labels)[1]
 

From e21939f63e554ea85ff97fe370b266da1ecba384 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 13 Nov 2020 20:48:45 +0100
Subject: [PATCH 07/11] Adress review comments

---
 pandas/core/algorithms.py                     |  4 +++-
 pandas/tests/reshape/concat/test_concat.py    | 12 ------------
 pandas/tests/reshape/concat/test_dataframe.py | 11 +++++++++++
 pandas/tests/test_algos.py                    | 12 ++++++++++++
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 1e74eb28a0f8a..fc6d08777e3c9 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2151,7 +2151,9 @@ def _sort_tuples(values: np.ndarray[tuple]):
     return values[indexer]
 
 
-def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray:
+def make_duplicates_of_left_unique_in_right(
+    left: np.ndarray, right: np.ndarray
+) -> np.ndarray:
     """
     Drops all duplicates values from left in right, so that they are
     unique in right.
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index b0f495a404a80..a1351ce782669 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -556,15 +556,3 @@ def test_concat_preserves_extension_int64_dtype():
     result = pd.concat([df_a, df_b], ignore_index=True)
     expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
-
-
-def test_concat_duplicate_indexes():
-    # GH#36263 ValueError with non unique indexes
-    df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
-    df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
-    result = concat([df1, df2], axis=1)
-    expected = DataFrame(
-        {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
-        index=Index([0, 0, 1, 1, 3, 4]),
-    )
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py
index 295846ee1b264..babc8124877e9 100644
--- a/pandas/tests/reshape/concat/test_dataframe.py
+++ b/pandas/tests/reshape/concat/test_dataframe.py
@@ -167,3 +167,14 @@ def test_concat_dataframe_keys_bug(self, sort):
         # it works
         result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
         assert list(result.columns) == [("t1", "value"), ("t2", "value")]
+
+    def test_concat_duplicate_indexes(self):
+        # GH#36263 ValueError with non unique indexes
+        df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
+        df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
+        result = concat([df1, df2], axis=1)
+        expected = DataFrame(
+            {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
+            index=Index([0, 0, 1, 1, 3, 4]),
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 88286448de900..cc824bbb7cba3 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2358,3 +2358,15 @@ def test_diff_ea_axis(self):
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):
             algos.diff(dta, 1, axis=1)
+
+
+@pytest.mark.parametrize(
+    "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]]
+)
+def test_make_duplicates_of_left_unique_in_right(left_values):
+    # GH#36263
+    left = np.array(left_values)
+    right = np.array([0, 0, 1, 1, 4])
+    result = algos.make_duplicates_of_left_unique_in_right(left, right)
+    expected = np.array(([0, 0, 1, 4]))
+    tm.assert_numpy_array_equal(result, expected)

From 23344a262d3852e6de0c8a44b9271a1af4b5b8cb Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 13 Nov 2020 21:01:31 +0100
Subject: [PATCH 08/11] Delete brackets

---
 pandas/tests/test_algos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cc824bbb7cba3..ad878238b0fe2 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2368,5 +2368,5 @@ def test_make_duplicates_of_left_unique_in_right(left_values):
     left = np.array(left_values)
     right = np.array([0, 0, 1, 1, 4])
     result = algos.make_duplicates_of_left_unique_in_right(left, right)
-    expected = np.array(([0, 0, 1, 4]))
+    expected = np.array([0, 0, 1, 4])
     tm.assert_numpy_array_equal(result, expected)

From a4e185102975762cf64fbda269db58843427f3f7 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 18 Nov 2020 21:39:20 +0100
Subject: [PATCH 09/11] Change whatsnew

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index e543976bb78d6..4ed81e7804c30 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -567,7 +567,7 @@ Reshaping
 - Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
 - Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`)
 - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
-- Bug in :func:`concat` resulted in a ``ValueError`` when at least one of both inputs had a non unique index (:issue:`36263`)
+- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`)
 
 Sparse
 ^^^^^^

From 354048d7fa58a1f1ce33fe545604f6a4e87b8dfc Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 18 Nov 2020 21:42:57 +0100
Subject: [PATCH 10/11] Remove file

---
 pandas/core/algorithms.py           | 2 +-
 pandas/tests/reshape/test_concat.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 pandas/tests/reshape/test_concat.py

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index fc6d08777e3c9..4e87907dd84db 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2168,4 +2168,4 @@ def make_duplicates_of_left_unique_in_right(
     Duplicates of left are unique in right
     """
     left_duplicates = unique(left[duplicated(left)])
-    return right[~(duplicated(right) & np.isin(right, left_duplicates))]
+    return right[~(duplicated(right) & isin(right, left_duplicates))]
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From ab8b03bff1c02c751d5e0347da3ca4b23a863248 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 18 Nov 2020 21:47:37 +0100
Subject: [PATCH 11/11] Add comment

---
 pandas/core/algorithms.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 4e87907dd84db..cebfe9e5ca9d8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -2155,8 +2155,9 @@ def make_duplicates_of_left_unique_in_right(
     left: np.ndarray, right: np.ndarray
 ) -> np.ndarray:
     """
-    Drops all duplicates values from left in right, so that they are
-    unique in right.
+    If left has duplicates, which are also duplicated in right, this duplicated values
+    are dropped from right, meaning that every duplicate value from left exists only
+    once in right.
 
     Parameters
     ----------