From 6105102bdc701f7665f60059243898d47ff7d73f Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 11 Sep 2020 16:45:21 +0200 Subject: [PATCH 01/11] Fix crash in concat if non unique index --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/reshape/concat.py | 14 +++++++++++++- pandas/tests/reshape/test_concat.py | 12 ++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..0411cacda845f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -322,6 +322,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) +- Bug in :func:`concat` resulted in a ``ValueError`` when at least one of both inputs had a non unique index (:issue:`36263`) - Sparse diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9b94dae8556f6..6406afd718579 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -496,7 +496,19 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.reindex(new_labels)[1] + # We have to remove the duplicates from obj_labels + # in new labels to make them unique, otherwise we would + # duplicate or duplicates again + obj_labels_duplicates = obj_labels[ + obj_labels.duplicated() + ].unique() + new_labels_cleared = new_labels[ + ~( + new_labels.duplicated() + & new_labels.isin(obj_labels_duplicates) + ) + ] + indexers[ax] = obj_labels.reindex(new_labels_cleared)[1] mgrs_indexers.append((obj._mgr, indexers)) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 90705f827af25..9d5441e766483 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2918,3 +2918,15 @@ def test_concat_frame_axis0_extension_dtypes(): result = pd.concat([df2, df1], ignore_index=True) expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") tm.assert_frame_equal(result, expected) + + +def test_concat_duplicate_indexes(): + # GH 36263 ValueError with non unique indexes + df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) + df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) + result = pd.concat([df1, df2], axis=1) + expected = pd.DataFrame( + {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, + index=pd.Index([0, 0, 1, 1, 3, 4]), + ) + tm.assert_frame_equal(result, expected) From b4c3b771673ea14663666cb12ef45404c60861e9 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 4 Oct 2020 21:01:02 +0200 Subject: [PATCH 02/11] Creta function in algos --- pandas/core/algorithms.py | 24 ++++++++++++++++++++++++ pandas/core/reshape/concat.py | 11 ++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2ce3f2d9a7bfa..f8fecd6f45969 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2112,3 +2112,27 @@ def sort_mixed(values): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) + + +def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray: + """ + Drops all duplicates values from left in right, so that they are + unique in right. + + Parameters + ---------- + left: Index + right: Index + + Returns + ------- + Duplicates of left unique in right + """ + left_duplicates = unique(left[duplicated(left)]) + right_cleared = right[ + ~( + duplicated(right) + & np.isin(right, left_duplicates) + ) + ] + return right_cleared diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 3b19957ed2f60..2cd8babcf535f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna +import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -498,15 +499,7 @@ def get_result(self): # We have to remove the duplicates from obj_labels # in new labels to make them unique, otherwise we would # duplicate or duplicates again - obj_labels_duplicates = obj_labels[ - obj_labels.duplicated() - ].unique() - new_labels_cleared = new_labels[ - ~( - new_labels.duplicated() - & new_labels.isin(obj_labels_duplicates) - ) - ] + new_labels_cleared = algos.make_duplicates_of_left_unique_in_right(obj_labels.values, new_labels.values) indexers[ax] = obj_labels.reindex(new_labels_cleared)[1] mgrs_indexers.append((obj._mgr, indexers)) From 7129ced7808da2c8e12e39655d9418506fe8f127 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 4 Oct 2020 22:21:53 +0200 Subject: [PATCH 03/11] Add benchmark --- asv_bench/benchmarks/algorithms.py | 12 ++++++++++++ pandas/core/algorithms.py | 14 ++++---------- pandas/core/reshape/concat.py | 7 +++++-- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..03480ae198345 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,6 +5,7 @@ from pandas._libs import lib import pandas as pd +from pandas.core.algorithms import make_duplicates_of_left_unique_in_right from .pandas_vb_common import tm @@ -174,4 +175,15 @@ def time_argsort(self, N): self.array.argsort() +class RemoveDuplicates: + def setup(self): + N = 10 ** 5 + na = np.arange(int(N / 2)) + self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]]) + self.right = np.concatenate([na, na]) + + def time_make_duplicates_of_left_unique_in_right(self): + make_duplicates_of_left_unique_in_right(self.left, self.right) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 94bebc1a81f5e..ccdf9852fb2ed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2124,18 +2124,12 @@ def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray: Parameters ---------- - left: Index - right: Index + left: ndarray + right: ndarray Returns ------- - Duplicates of left unique in right + Duplicates of left are unique in right """ left_duplicates = unique(left[duplicated(left)]) - right_cleared = right[ - ~( - duplicated(right) - & np.isin(right, left_duplicates) - ) - ] - return right_cleared + return right[~(duplicated(right) & np.isin(right, left_duplicates))] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9e937fe02c813..5daad1c728e22 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -500,8 +500,11 @@ def get_result(self): # We have to remove the duplicates from obj_labels # in new labels to make them unique, otherwise we would # duplicate or duplicates again - new_labels_cleared = algos.make_duplicates_of_left_unique_in_right(obj_labels.values, new_labels.values) - indexers[ax] = obj_labels.reindex(new_labels_cleared)[1] + if not obj_labels.is_unique: + new_labels = algos.make_duplicates_of_left_unique_in_right( + obj_labels.values, new_labels.values + ) + indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._mgr, indexers)) From 9cce3ff7f6ad4b30ba265e99dcada78476bf13f8 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 13 Nov 2020 00:58:46 +0100 Subject: [PATCH 04/11] Add test in different file --- pandas/tests/reshape/concat/test_concat.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index a1351ce782669..75d7d9b65aefb 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -556,3 +556,15 @@ def test_concat_preserves_extension_int64_dtype(): result = pd.concat([df_a, df_b], ignore_index=True) expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") tm.assert_frame_equal(result, expected) + + +def test_concat_duplicate_indexes(): + # GH#36263 ValueError with non unique indexes + df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) + df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) + result = pd.concat([df1, df2], axis=1) + expected = pd.DataFrame( + {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, + index=pd.Index([0, 0, 1, 1, 3, 4]), + ) + tm.assert_frame_equal(result, expected) From d0c4ea5e098e2b6d42bb56604b38f3be9ab9749b Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 13 Nov 2020 01:00:59 +0100 Subject: [PATCH 05/11] Fix pattern --- pandas/tests/reshape/concat/test_concat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 75d7d9b65aefb..b0f495a404a80 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -560,11 +560,11 @@ def test_concat_preserves_extension_int64_dtype(): def test_concat_duplicate_indexes(): # GH#36263 ValueError with non unique indexes - df1 = pd.DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) - df2 = pd.DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) - result = pd.concat([df1, df2], axis=1) - expected = pd.DataFrame( + df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) + df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) + result = concat([df1, df2], axis=1) + expected = DataFrame( {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, - index=pd.Index([0, 0, 1, 1, 3, 4]), + index=Index([0, 0, 1, 1, 3, 4]), ) tm.assert_frame_equal(result, expected) From a96b2626a23fe0b635ff7a5598e94303b832415e Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 13 Nov 2020 20:29:33 +0100 Subject: [PATCH 06/11] Use asarray --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 5b14158748d04..ee54b06f5bceb 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -507,7 +507,7 @@ def get_result(self): # duplicate or duplicates again if not obj_labels.is_unique: new_labels = algos.make_duplicates_of_left_unique_in_right( - obj_labels.values, new_labels.values + np.asarray(obj_labels), np.asarray(new_labels) ) indexers[ax] = obj_labels.reindex(new_labels)[1] From e21939f63e554ea85ff97fe370b266da1ecba384 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 13 Nov 2020 20:48:45 +0100 Subject: [PATCH 07/11] Adress review comments --- pandas/core/algorithms.py | 4 +++- pandas/tests/reshape/concat/test_concat.py | 12 ------------ pandas/tests/reshape/concat/test_dataframe.py | 11 +++++++++++ pandas/tests/test_algos.py | 12 ++++++++++++ 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1e74eb28a0f8a..fc6d08777e3c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2151,7 +2151,9 @@ def _sort_tuples(values: np.ndarray[tuple]): return values[indexer] -def make_duplicates_of_left_unique_in_right(left, right) -> np.ndarray: +def make_duplicates_of_left_unique_in_right( + left: np.ndarray, right: np.ndarray +) -> np.ndarray: """ Drops all duplicates values from left in right, so that they are unique in right. diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index b0f495a404a80..a1351ce782669 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -556,15 +556,3 @@ def test_concat_preserves_extension_int64_dtype(): result = pd.concat([df_a, df_b], ignore_index=True) expected = DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") tm.assert_frame_equal(result, expected) - - -def test_concat_duplicate_indexes(): - # GH#36263 ValueError with non unique indexes - df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) - df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) - result = concat([df1, df2], axis=1) - expected = DataFrame( - {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, - index=Index([0, 0, 1, 1, 3, 4]), - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 295846ee1b264..babc8124877e9 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -167,3 +167,14 @@ def test_concat_dataframe_keys_bug(self, sort): # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] + + def test_concat_duplicate_indexes(self): + # GH#36263 ValueError with non unique indexes + df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) + df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) + result = concat([df1, df2], axis=1) + expected = DataFrame( + {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, + index=Index([0, 0, 1, 1, 3, 4]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 88286448de900..cc824bbb7cba3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2358,3 +2358,15 @@ def test_diff_ea_axis(self): msg = "cannot diff DatetimeArray on axis=1" with pytest.raises(ValueError, match=msg): algos.diff(dta, 1, axis=1) + + +@pytest.mark.parametrize( + "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]] +) +def test_make_duplicates_of_left_unique_in_right(left_values): + # GH#36263 + left = np.array(left_values) + right = np.array([0, 0, 1, 1, 4]) + result = algos.make_duplicates_of_left_unique_in_right(left, right) + expected = np.array(([0, 0, 1, 4])) + tm.assert_numpy_array_equal(result, expected) From 23344a262d3852e6de0c8a44b9271a1af4b5b8cb Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 13 Nov 2020 21:01:31 +0100 Subject: [PATCH 08/11] Delete brackets --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cc824bbb7cba3..ad878238b0fe2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2368,5 +2368,5 @@ def test_make_duplicates_of_left_unique_in_right(left_values): left = np.array(left_values) right = np.array([0, 0, 1, 1, 4]) result = algos.make_duplicates_of_left_unique_in_right(left, right) - expected = np.array(([0, 0, 1, 4])) + expected = np.array([0, 0, 1, 4]) tm.assert_numpy_array_equal(result, expected) From a4e185102975762cf64fbda269db58843427f3f7 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 21:39:20 +0100 Subject: [PATCH 09/11] Change whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e543976bb78d6..4ed81e7804c30 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -567,7 +567,7 @@ Reshaping - Bug in :meth:`DataFrame.combine_first()` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) - Fixed regression in :func:`merge` on merging DatetimeIndex with empty DataFrame (:issue:`36895`) - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) -- Bug in :func:`concat` resulted in a ``ValueError`` when at least one of both inputs had a non unique index (:issue:`36263`) +- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) Sparse ^^^^^^ From 354048d7fa58a1f1ce33fe545604f6a4e87b8dfc Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 21:42:57 +0100 Subject: [PATCH 10/11] Remove file --- pandas/core/algorithms.py | 2 +- pandas/tests/reshape/test_concat.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 pandas/tests/reshape/test_concat.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fc6d08777e3c9..4e87907dd84db 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2168,4 +2168,4 @@ def make_duplicates_of_left_unique_in_right( Duplicates of left are unique in right """ left_duplicates = unique(left[duplicated(left)]) - return right[~(duplicated(right) & np.isin(right, left_duplicates))] + return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 From ab8b03bff1c02c751d5e0347da3ca4b23a863248 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 18 Nov 2020 21:47:37 +0100 Subject: [PATCH 11/11] Add comment --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4e87907dd84db..cebfe9e5ca9d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2155,8 +2155,9 @@ def make_duplicates_of_left_unique_in_right( left: np.ndarray, right: np.ndarray ) -> np.ndarray: """ - Drops all duplicates values from left in right, so that they are - unique in right. + If left has duplicates, which are also duplicated in right, this duplicated values + are dropped from right, meaning that every duplicate value from left exists only + once in right. Parameters ----------