From a998162206cfa9f81455dda5a12eea6b1a4393d9 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 23 Dec 2020 16:05:45 +1100 Subject: [PATCH 1/2] Revert "[BUG]: Fix ValueError in concat() when at least one Index has duplicates (#36290)" This reverts commit b32febd5954fa342fe477c1776a2d736a4cd24b5. --- asv_bench/benchmarks/algorithms.py | 12 ----------- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/core/algorithms.py | 21 ------------------- pandas/core/reshape/concat.py | 8 ------- pandas/tests/reshape/concat/test_dataframe.py | 11 ---------- pandas/tests/test_algos.py | 12 ----------- 6 files changed, 65 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 03480ae198345..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,6 @@ from pandas._libs import lib import pandas as pd -from pandas.core.algorithms import make_duplicates_of_left_unique_in_right from .pandas_vb_common import tm @@ -175,15 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class RemoveDuplicates: - def setup(self): - N = 10 ** 5 - na = np.arange(int(N / 2)) - self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]]) - self.right = np.concatenate([na, na]) - - def time_make_duplicates_of_left_unique_in_right(self): - make_duplicates_of_left_unique_in_right(self.left, self.right) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8aae870d50716..5577c266ac438 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -821,7 +821,6 @@ Reshaping - Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) - Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`) - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) -- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) - Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) - Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5ccafff642bd4..1061eb087318b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2199,24 +2199,3 @@ def _sort_tuples(values: np.ndarray[tuple]): arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) return values[indexer] - - -def make_duplicates_of_left_unique_in_right( - left: np.ndarray, right: np.ndarray -) -> np.ndarray: - """ - If left has duplicates, which are also duplicated in right, this duplicated values - are dropped from right, meaning that every duplicate value from left exists only - once in right. - - Parameters - ---------- - left: ndarray - right: ndarray - - Returns - ------- - Duplicates of left are unique in right - """ - left_duplicates = unique(left[duplicated(left)]) - return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 42b541bd4cb02..7bda2f01f0465 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -24,7 +24,6 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -513,13 +512,6 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - # We have to remove the duplicates from obj_labels - # in new labels to make them unique, otherwise we would - # duplicate or duplicates again - if not obj_labels.is_unique: - new_labels = algos.make_duplicates_of_left_unique_in_right( - np.asarray(obj_labels), np.asarray(new_labels) - ) indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._mgr, indexers)) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index babc8124877e9..295846ee1b264 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -167,14 +167,3 @@ def test_concat_dataframe_keys_bug(self, sort): # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] - - def test_concat_duplicate_indexes(self): - # GH#36263 ValueError with non unique indexes - df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) - df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) - result = concat([df1, df2], axis=1) - expected = DataFrame( - {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, - index=Index([0, 0, 1, 1, 3, 4]), - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 35411d7e9cfb7..ae01093fbadbf 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2409,15 +2409,3 @@ def test_diff_ea_axis(self): msg = "cannot diff DatetimeArray on axis=1" with pytest.raises(ValueError, match=msg): algos.diff(dta, 1, axis=1) - - -@pytest.mark.parametrize( - "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]] -) -def test_make_duplicates_of_left_unique_in_right(left_values): - # GH#36263 - left = np.array(left_values) - right = np.array([0, 0, 1, 1, 4]) - result = algos.make_duplicates_of_left_unique_in_right(left, right) - expected = np.array([0, 0, 1, 4]) - tm.assert_numpy_array_equal(result, expected) From b7856e81a54ae6a684ed9a3ad18ab9ec42ad5be5 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 23 Dec 2020 16:38:34 +1100 Subject: [PATCH 2/2] Better error message for concat with duplicate indices --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7bda2f01f0465..5799b579fd0dc 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -512,7 +512,7 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.reindex(new_labels)[1] + indexers[ax] = obj_labels.get_indexer(new_labels) mgrs_indexers.append((obj._mgr, indexers))