Skip to content

Commit 86e187f

Browse files
batterseapower authored and jreback committed
BUG: CategoricalIndex allowed reindexing duplicate sources (#28257)
1 parent a0d01b8 commit 86e187f

File tree

10 files changed

+126
-86
lines changed

10 files changed

+126
-86
lines changed

doc/source/user_guide/advanced.rst

+25-11
Original file line numberDiff line numberDiff line change
@@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex **any** panda
783783

784784
.. ipython:: python
785785
786-
df2.reindex(['a', 'e'])
787-
df2.reindex(['a', 'e']).index
788-
df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde')))
789-
df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index
786+
df3 = pd.DataFrame({'A': np.arange(3),
787+
'B': pd.Series(list('abc')).astype('category')})
788+
df3 = df3.set_index('B')
789+
df3
790+
791+
.. ipython:: python
792+
793+
df3.reindex(['a', 'e'])
794+
df3.reindex(['a', 'e']).index
795+
df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe')))
796+
df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index
790797
791798
.. warning::
792799

793800
Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories
794801
or a ``TypeError`` will be raised.
795802

796-
.. code-block:: ipython
803+
.. ipython:: python
797804
798-
In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')})
805+
df4 = pd.DataFrame({'A': np.arange(2),
806+
'B': list('ba')})
807+
df4['B'] = df4['B'].astype(CategoricalDtype(list('ab')))
808+
df4 = df4.set_index('B')
809+
df4.index
799810
800-
In [11]: df3 = df3.set_index('B')
811+
df5 = pd.DataFrame({'A': np.arange(2),
812+
'B': list('bc')})
813+
df5['B'] = df5['B'].astype(CategoricalDtype(list('bc')))
814+
df5 = df5.set_index('B')
815+
df5.index
801816
802-
In [11]: df3.index
803-
Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category')
817+
.. code-block:: ipython
804818
805-
In [12]: pd.concat([df2, df3])
806-
TypeError: categories must match existing categories when appending
819+
In [1]: pd.concat([df4, df5])
820+
TypeError: categories must match existing categories when appending
807821
808822
.. _indexing.rangeindex:
809823

doc/source/whatsnew/v1.0.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ Categorical
223223

224224
- Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
225225
- Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`)
226+
- Bug in :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` where reindexing would fail when the targets contained duplicates, and would not fail when the source contained duplicates (:issue:`28107`)
226227
- Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`)
227228
- Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`)
228229
- :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`)
@@ -292,6 +293,9 @@ Indexing
292293
- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`)
293294
- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
294295
- Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
296+
- Bug in :meth:`Index.union` that could fail when the left-hand index contained duplicates (:issue:`28257`)
297+
- Bug in :meth:`Index.get_indexer_non_unique` that could fail with a ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`)
298+
-
295299

296300
Missing
297301
^^^^^^^

pandas/_libs/index.pyx

+14-6
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ cdef class IndexEngine:
286286
cdef:
287287
ndarray values, x
288288
ndarray[int64_t] result, missing
289-
set stargets
289+
set stargets, remaining_stargets
290290
dict d = {}
291291
object val
292292
int count = 0, count_missing = 0
@@ -309,12 +309,20 @@ cdef class IndexEngine:
309309
if stargets and len(stargets) < 5 and self.is_monotonic_increasing:
310310
# if there are few enough stargets and the index is monotonically
311311
# increasing, then use binary search for each starget
312+
remaining_stargets = set()
312313
for starget in stargets:
313-
start = values.searchsorted(starget, side='left')
314-
end = values.searchsorted(starget, side='right')
315-
if start != end:
316-
d[starget] = list(range(start, end))
317-
else:
314+
try:
315+
start = values.searchsorted(starget, side='left')
316+
end = values.searchsorted(starget, side='right')
317+
except TypeError: # e.g. if we tried to search for string in int array
318+
remaining_stargets.add(starget)
319+
else:
320+
if start != end:
321+
d[starget] = list(range(start, end))
322+
323+
stargets = remaining_stargets
324+
325+
if stargets:
318326
# otherwise, map by iterating through all items in the index
319327
for i in range(n):
320328
val = values[i]

pandas/core/indexes/base.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -2493,8 +2493,12 @@ def _union(self, other, sort):
24932493
value_set = set(lvals)
24942494
result.extend([x for x in rvals if x not in value_set])
24952495
else:
2496-
indexer = self.get_indexer(other)
2497-
indexer, = (indexer == -1).nonzero()
2496+
# find indexes of things in "other" that are not in "self"
2497+
if self.is_unique:
2498+
indexer = self.get_indexer(other)
2499+
indexer = (indexer == -1).nonzero()[0]
2500+
else:
2501+
indexer = algos.unique1d(self.get_indexer_non_unique(other)[1])
24982502

24992503
if len(indexer) > 0:
25002504
other_diff = algos.take_nd(rvals, indexer, allow_fill=False)

pandas/core/indexes/category.py

-8
Original file line numberDiff line numberDiff line change
@@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any):
552552
# we might be a positional indexer
553553
return super().get_value(series, key)
554554

555-
def _can_reindex(self, indexer):
556-
""" always allow reindexing """
557-
pass
558-
559555
@Substitution(klass="CategoricalIndex")
560556
@Appender(_shared_docs["searchsorted"])
561557
def searchsorted(self, value, side="left", sorter=None):
@@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
585581
Indices of output values in original index
586582
587583
"""
588-
589584
if method is not None:
590585
raise NotImplementedError(
591586
"argument method is not implemented for CategoricalIndex.reindex"
@@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
605600
indexer = None
606601
missing = []
607602
else:
608-
if not target.is_unique:
609-
raise ValueError("cannot reindex with a non-unique indexer")
610-
611603
indexer, missing = self.get_indexer_non_unique(np.array(target))
612604

613605
if len(self.codes) and indexer is not None:

pandas/tests/indexes/test_category.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -599,15 +599,19 @@ def test_reindex_dtype(self):
599599
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
600600

601601
def test_reindex_duplicate_target(self):
602-
# See GH23963
603-
c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
604-
with pytest.raises(ValueError, match="non-unique indexer"):
605-
c.reindex(["a", "a", "c"])
602+
# See GH25459
603+
cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
604+
res, indexer = cat.reindex(["a", "c", "c"])
605+
exp = Index(["a", "c", "c"], dtype="object")
606+
tm.assert_index_equal(res, exp, exact=True)
607+
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
606608

607-
with pytest.raises(ValueError, match="non-unique indexer"):
608-
c.reindex(
609-
CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"])
610-
)
609+
res, indexer = cat.reindex(
610+
CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
611+
)
612+
exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
613+
tm.assert_index_equal(res, exp, exact=True)
614+
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
611615

612616
def test_reindex_empty_index(self):
613617
# See GH16770

pandas/tests/indexing/test_categorical.py

+34-37
Original file line numberDiff line numberDiff line change
@@ -561,92 +561,89 @@ def test_read_only_source(self):
561561
assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3])
562562

563563
def test_reindexing(self):
564+
df = DataFrame(
565+
{
566+
"A": np.arange(3, dtype="int64"),
567+
"B": Series(list("abc")).astype(CDT(list("cabe"))),
568+
}
569+
).set_index("B")
564570

565571
# reindexing
566572
# convert to a regular index
567-
result = self.df2.reindex(["a", "b", "e"])
568-
expected = DataFrame(
569-
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
570-
).set_index("B")
573+
result = df.reindex(["a", "b", "e"])
574+
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
575+
"B"
576+
)
571577
assert_frame_equal(result, expected, check_index_type=True)
572578

573-
result = self.df2.reindex(["a", "b"])
574-
expected = DataFrame(
575-
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
576-
).set_index("B")
579+
result = df.reindex(["a", "b"])
580+
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
577581
assert_frame_equal(result, expected, check_index_type=True)
578582

579-
result = self.df2.reindex(["e"])
583+
result = df.reindex(["e"])
580584
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
581585
assert_frame_equal(result, expected, check_index_type=True)
582586

583-
result = self.df2.reindex(["d"])
587+
result = df.reindex(["d"])
584588
expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
585589
assert_frame_equal(result, expected, check_index_type=True)
586590

587591
# since we are actually reindexing with a Categorical
588592
# then return a Categorical
589593
cats = list("cabe")
590594

591-
result = self.df2.reindex(Categorical(["a", "d"], categories=cats))
595+
result = df.reindex(Categorical(["a", "e"], categories=cats))
592596
expected = DataFrame(
593-
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))}
597+
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
594598
).set_index("B")
595599
assert_frame_equal(result, expected, check_index_type=True)
596600

597-
result = self.df2.reindex(Categorical(["a"], categories=cats))
601+
result = df.reindex(Categorical(["a"], categories=cats))
598602
expected = DataFrame(
599-
{"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))}
603+
{"A": [0], "B": Series(list("a")).astype(CDT(cats))}
600604
).set_index("B")
601605
assert_frame_equal(result, expected, check_index_type=True)
602606

603-
result = self.df2.reindex(["a", "b", "e"])
604-
expected = DataFrame(
605-
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
606-
).set_index("B")
607+
result = df.reindex(["a", "b", "e"])
608+
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
609+
"B"
610+
)
607611
assert_frame_equal(result, expected, check_index_type=True)
608612

609-
result = self.df2.reindex(["a", "b"])
610-
expected = DataFrame(
611-
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
612-
).set_index("B")
613+
result = df.reindex(["a", "b"])
614+
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
613615
assert_frame_equal(result, expected, check_index_type=True)
614616

615-
result = self.df2.reindex(["e"])
617+
result = df.reindex(["e"])
616618
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
617619
assert_frame_equal(result, expected, check_index_type=True)
618620

619621
# give back the type of categorical that we received
620-
result = self.df2.reindex(
621-
Categorical(["a", "d"], categories=cats, ordered=True)
622-
)
622+
result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
623623
expected = DataFrame(
624-
{
625-
"A": [0, 1, 5, np.nan],
626-
"B": Series(list("aaad")).astype(CDT(cats, ordered=True)),
627-
}
624+
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
628625
).set_index("B")
629626
assert_frame_equal(result, expected, check_index_type=True)
630627

631-
result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"]))
628+
result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
632629
expected = DataFrame(
633-
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))}
630+
{"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
634631
).set_index("B")
635632
assert_frame_equal(result, expected, check_index_type=True)
636633

637634
# reindexing from a source index that contains duplicates is not allowed
638-
msg = "cannot reindex with a non-unique indexer"
635+
msg = "cannot reindex from a duplicate axis"
639636
with pytest.raises(ValueError, match=msg):
640-
self.df2.reindex(["a", "a"])
637+
self.df2.reindex(["a", "b"])
641638

642639
# args NotImplemented ATM
643640
msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
644641
with pytest.raises(NotImplementedError, match=msg.format("method")):
645-
self.df2.reindex(["a"], method="ffill")
642+
df.reindex(["a"], method="ffill")
646643
with pytest.raises(NotImplementedError, match=msg.format("level")):
647-
self.df2.reindex(["a"], level=1)
644+
df.reindex(["a"], level=1)
648645
with pytest.raises(NotImplementedError, match=msg.format("limit")):
649-
self.df2.reindex(["a"], limit=2)
646+
df.reindex(["a"], limit=2)
650647

651648
def test_loc_slice(self):
652649
# slicing

pandas/tests/series/test_operators.py

+22-13
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import pandas as pd
88
from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna
99
from pandas.core import ops
10-
from pandas.core.indexes.base import InvalidIndexError
1110
import pandas.core.nanops as nanops
1211
import pandas.util.testing as tm
1312
from pandas.util.testing import (
@@ -282,44 +281,54 @@ def test_logical_ops_with_index(self, op):
282281
result = op(ser, idx2)
283282
assert_series_equal(result, expected)
284283

284+
def test_reversed_xor_with_index_returns_index(self):
285+
# GH#22092, GH#19792
286+
ser = Series([True, True, False, False])
287+
idx1 = Index([True, False, True, False])
288+
idx2 = Index([1, 0, 1, 0])
289+
290+
expected = Index.symmetric_difference(idx1, ser)
291+
result = idx1 ^ ser
292+
assert_index_equal(result, expected)
293+
294+
expected = Index.symmetric_difference(idx2, ser)
295+
result = idx2 ^ ser
296+
assert_index_equal(result, expected)
297+
285298
@pytest.mark.parametrize(
286299
"op",
287300
[
288301
pytest.param(
289302
ops.rand_,
290303
marks=pytest.mark.xfail(
291-
reason="GH#22092 Index implementation returns Index",
304+
reason="GH#22092 Index __and__ returns Index intersection",
292305
raises=AssertionError,
293306
strict=True,
294307
),
295308
),
296309
pytest.param(
297310
ops.ror_,
298311
marks=pytest.mark.xfail(
299-
reason="Index.get_indexer with non unique index",
300-
raises=InvalidIndexError,
312+
reason="GH#22092 Index __or__ returns Index union",
313+
raises=AssertionError,
301314
strict=True,
302315
),
303316
),
304-
ops.rxor,
305317
],
306318
)
307-
def test_reversed_logical_ops_with_index(self, op):
319+
def test_reversed_logical_op_with_index_returns_series(self, op):
308320
# GH#22092, GH#19792
309321
ser = Series([True, True, False, False])
310322
idx1 = Index([True, False, True, False])
311323
idx2 = Index([1, 0, 1, 0])
312324

313-
# symmetric_difference is only for rxor, but other 2 should fail
314-
expected = idx1.symmetric_difference(ser)
315-
325+
expected = pd.Series(op(idx1.values, ser.values))
316326
result = op(ser, idx1)
317-
assert_index_equal(result, expected)
318-
319-
expected = idx2.symmetric_difference(ser)
327+
assert_series_equal(result, expected)
320328

329+
expected = pd.Series(op(idx2.values, ser.values))
321330
result = op(ser, idx2)
322-
assert_index_equal(result, expected)
331+
assert_series_equal(result, expected)
323332

324333
@pytest.mark.parametrize(
325334
"op, expected",

pandas/tests/test_base.py

+6
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer):
10091009
s = pd.Series(idx)
10101010
tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx])
10111011

1012+
def test_get_indexer_non_unique_dtype_mismatch(self):
1013+
# GH 25459
1014+
indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0]))
1015+
tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
1016+
tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing)
1017+
10121018

10131019
class TestTranspose(Ops):
10141020
errmsg = "the 'axes' parameter is not supported"

0 commit comments

Comments
 (0)