Skip to content

Commit b2940a0

Browse files
TomAugspurger authored and jreback committed
BUG: Fixed merge on dtype equal categories (#19553)
1 parent affb5d9 commit b2940a0

File tree

5 files changed

+80
-17
lines changed

5 files changed

+80
-17
lines changed

doc/source/whatsnew/v0.23.0.txt

+26-14
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,32 @@ Documentation Changes
598598
Bug Fixes
599599
~~~~~~~~~
600600

601+
Categorical
602+
^^^^^^^^^^^
603+
604+
.. warning::
605+
606+
A class of bugs was introduced in pandas 0.21 with ``CategoricalDtype`` that
607+
affects the correctness of operations like ``merge``, ``concat``, and
608+
indexing when comparing multiple unordered ``Categorical`` arrays that have
609+
the same categories, but in a different order. We highly recommend upgrading
610+
or manually aligning your categories before doing these operations.
611+
612+
- Bug in ``Categorical.equals`` returning the wrong result when comparing two
613+
unordered ``Categorical`` arrays with the same categories, but in a different
614+
order (:issue:`16603`)
615+
- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result
616+
for unordered categoricals with the categories in a different order.
617+
This affected :func:`pandas.concat` with Categorical data (:issue:`19096`).
618+
- Bug in :func:`pandas.merge` returning the wrong result when joining on an
619+
unordered ``Categorical`` that had the same categories but in a different
620+
order (:issue:`19551`)
621+
- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when
622+
``target`` was an unordered ``Categorical`` that had the same categories as
623+
``self`` but in a different order (:issue:`19551`)
624+
- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
625+
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
626+
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`)
601627

602628
Datetimelike
603629
^^^^^^^^^^^^
@@ -745,20 +771,6 @@ Reshaping
745771
- Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`)
746772
-
747773

748-
749-
Categorical
750-
^^^^^^^^^^^
751-
752-
-
753-
- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result
754-
when all the categoricals had the same categories, but in a different order.
755-
This affected :func:`pandas.concat` with Categorical data (:issue:`19096`).
756-
- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`)
757-
- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
758-
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
759-
- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
760-
-
761-
762774
Other
763775
^^^^^
764776

pandas/core/indexes/category.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,8 @@ def _reindex_non_unique(self, target):
553553

554554
@Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
555555
def get_indexer(self, target, method=None, limit=None, tolerance=None):
556+
from pandas.core.arrays.categorical import _recode_for_categories
557+
556558
method = missing.clean_reindex_fill_method(method)
557559
target = ibase._ensure_index(target)
558560

@@ -568,8 +570,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
568570

569571
if (isinstance(target, CategoricalIndex) and
570572
self.values.is_dtype_equal(target)):
571-
# we have the same codes
572-
codes = target.codes
573+
if self.values.equals(target.values):
574+
# we have the same codes
575+
codes = target.codes
576+
else:
577+
codes = _recode_for_categories(target.codes,
578+
target.categories,
579+
self.values.categories)
573580
else:
574581
if isinstance(target, CategoricalIndex):
575582
code_indexer = self.categories.get_indexer(target.categories)

pandas/core/reshape/merge.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from pandas import (Categorical, DataFrame,
1414
Index, MultiIndex, Timedelta)
15+
from pandas.core.arrays.categorical import _recode_for_categories
1516
from pandas.core.frame import _merge_doc
1617
from pandas.core.dtypes.common import (
1718
is_datetime64tz_dtype,
@@ -1540,8 +1541,15 @@ def _factorize_keys(lk, rk, sort=True):
15401541
is_categorical_dtype(rk) and
15411542
lk.is_dtype_equal(rk)):
15421543
klass = libhashtable.Int64Factorizer
1544+
1545+
if lk.categories.equals(rk.categories):
1546+
rk = rk.codes
1547+
else:
1548+
# Same categories in different orders -> recode
1549+
rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
1550+
15431551
lk = _ensure_int64(lk.codes)
1544-
rk = _ensure_int64(rk.codes)
1552+
rk = _ensure_int64(rk)
15451553
elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
15461554
klass = libhashtable.Int64Factorizer
15471555
lk = _ensure_int64(com._values_from_object(lk))

pandas/tests/indexing/test_categorical.py

+17
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,23 @@ def test_get_indexer_array(self):
432432
expected = np.array([0, 1], dtype='intp')
433433
tm.assert_numpy_array_equal(result, expected)
434434

435+
def test_get_indexer_same_categories_same_order(self):
436+
ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
437+
438+
result = ci.get_indexer(CategoricalIndex(['b', 'b'],
439+
categories=['a', 'b']))
440+
expected = np.array([1, 1], dtype='intp')
441+
tm.assert_numpy_array_equal(result, expected)
442+
443+
def test_get_indexer_same_categories_different_order(self):
444+
# https://github.com/pandas-dev/pandas/issues/19551
445+
ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'])
446+
447+
result = ci.get_indexer(CategoricalIndex(['b', 'b'],
448+
categories=['b', 'a']))
449+
expected = np.array([1, 1], dtype='intp')
450+
tm.assert_numpy_array_equal(result, expected)
451+
435452
def test_getitem_with_listlike(self):
436453
# GH 16115
437454
cats = Categorical([Timestamp('12-31-1999'),

pandas/tests/reshape/merge/test_merge.py

+19
Original file line numberDiff line numberDiff line change
@@ -1643,6 +1643,25 @@ def test_merge_categorical(self):
16431643
result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
16441644
tm.assert_frame_equal(result, expected)
16451645

1646+
def tests_merge_categorical_unordered_equal(self):
1647+
# GH-19551
1648+
df1 = DataFrame({
1649+
'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']),
1650+
'Left': ['A0', 'B0', 'C0'],
1651+
})
1652+
1653+
df2 = DataFrame({
1654+
'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']),
1655+
'Right': ['C1', 'B1', 'A1'],
1656+
})
1657+
result = pd.merge(df1, df2, on=['Foo'])
1658+
expected = DataFrame({
1659+
'Foo': pd.Categorical(['A', 'B', 'C']),
1660+
'Left': ['A0', 'B0', 'C0'],
1661+
'Right': ['A1', 'B1', 'C1'],
1662+
})
1663+
assert_frame_equal(result, expected)
1664+
16461665
def test_other_columns(self, left, right):
16471666
# non-merge columns should preserve if possible
16481667
right = right.assign(Z=right.Z.astype('category'))

0 commit comments

Comments
 (0)