Skip to content

Commit fa478d3

Browse files
authored
[BUG] Concat duplicates errors (or lack thereof) (#38654)
* Revert "[BUG]: Fix ValueError in concat() when at least one Index has duplicates (#36290)" This reverts commit b32febd.
1 parent 0805043 commit fa478d3

File tree

6 files changed

+1
-66
lines changed

6 files changed

+1
-66
lines changed

asv_bench/benchmarks/algorithms.py

-12
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from pandas._libs import lib
66

77
import pandas as pd
8-
from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
98

109
from .pandas_vb_common import tm
1110

@@ -175,15 +174,4 @@ def time_argsort(self, N):
175174
self.array.argsort()
176175

177176

178-
class RemoveDuplicates:
179-
def setup(self):
180-
N = 10 ** 5
181-
na = np.arange(int(N / 2))
182-
self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
183-
self.right = np.concatenate([na, na])
184-
185-
def time_make_duplicates_of_left_unique_in_right(self):
186-
make_duplicates_of_left_unique_in_right(self.left, self.right)
187-
188-
189177
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.2.0.rst

-1
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,6 @@ Reshaping
823823
- Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
824824
- Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`)
825825
- Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`)
826-
- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the inputs had a non-unique index (:issue:`36263`)
827826
- Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`)
828827
- Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`)
829828
- Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`)

pandas/core/algorithms.py

-21
Original file line numberDiff line numberDiff line change
@@ -2199,24 +2199,3 @@ def _sort_tuples(values: np.ndarray[tuple]):
21992199
arrays, _ = to_arrays(values, None)
22002200
indexer = lexsort_indexer(arrays, orders=True)
22012201
return values[indexer]
2202-
2203-
2204-
def make_duplicates_of_left_unique_in_right(
2205-
left: np.ndarray, right: np.ndarray
2206-
) -> np.ndarray:
2207-
"""
2208-
If left has duplicates, which are also duplicated in right, these duplicated values
2209-
are dropped from right, meaning that every duplicate value from left exists only
2210-
once in right.
2211-
2212-
Parameters
2213-
----------
2214-
left: ndarray
2215-
right: ndarray
2216-
2217-
Returns
2218-
-------
2219-
Duplicates of left are unique in right
2220-
"""
2221-
left_duplicates = unique(left[duplicated(left)])
2222-
return right[~(duplicated(right) & isin(right, left_duplicates))]

pandas/core/reshape/concat.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
2525
from pandas.core.dtypes.missing import isna
2626

27-
import pandas.core.algorithms as algos
2827
from pandas.core.arrays.categorical import (
2928
factorize_from_iterable,
3029
factorize_from_iterables,
@@ -513,14 +512,7 @@ def get_result(self):
513512
# 1-ax to convert BlockManager axis to DataFrame axis
514513
obj_labels = obj.axes[1 - ax]
515514
if not new_labels.equals(obj_labels):
516-
# We have to remove the duplicates from obj_labels
517-
# in new labels to make them unique, otherwise we would
518-
# duplicate our duplicates again
519-
if not obj_labels.is_unique:
520-
new_labels = algos.make_duplicates_of_left_unique_in_right(
521-
np.asarray(obj_labels), np.asarray(new_labels)
522-
)
523-
indexers[ax] = obj_labels.reindex(new_labels)[1]
515+
indexers[ax] = obj_labels.get_indexer(new_labels)
524516

525517
mgrs_indexers.append((obj._mgr, indexers))
526518

pandas/tests/reshape/concat/test_dataframe.py

-11
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,3 @@ def test_concat_dataframe_keys_bug(self, sort):
167167
# it works
168168
result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort)
169169
assert list(result.columns) == [("t1", "value"), ("t2", "value")]
170-
171-
def test_concat_duplicate_indexes(self):
172-
# GH#36263 ValueError with non unique indexes
173-
df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"])
174-
df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"])
175-
result = concat([df1, df2], axis=1)
176-
expected = DataFrame(
177-
{"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]},
178-
index=Index([0, 0, 1, 1, 3, 4]),
179-
)
180-
tm.assert_frame_equal(result, expected)

pandas/tests/test_algos.py

-12
Original file line numberDiff line numberDiff line change
@@ -2409,15 +2409,3 @@ def test_diff_ea_axis(self):
24092409
msg = "cannot diff DatetimeArray on axis=1"
24102410
with pytest.raises(ValueError, match=msg):
24112411
algos.diff(dta, 1, axis=1)
2412-
2413-
2414-
@pytest.mark.parametrize(
2415-
"left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]]
2416-
)
2417-
def test_make_duplicates_of_left_unique_in_right(left_values):
2418-
# GH#36263
2419-
left = np.array(left_values)
2420-
right = np.array([0, 0, 1, 1, 4])
2421-
result = algos.make_duplicates_of_left_unique_in_right(left, right)
2422-
expected = np.array([0, 0, 1, 4])
2423-
tm.assert_numpy_array_equal(result, expected)

0 commit comments

Comments
 (0)