From 85f0c7030ec2eecebd060fc385d5857ca9d08eeb Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 13 Jun 2018 10:22:59 +0200 Subject: [PATCH] BUG: fix get_indexer_non_unique with CategoricalIndex key closes #21448 --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/core/indexes/base.py | 3 +++ pandas/core/indexes/category.py | 7 ++++++- pandas/tests/categorical/test_indexing.py | 20 +++++++++++++++++++- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ec2eddcfd4d41..611e5c4836c6f 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -55,7 +55,7 @@ Conversion Indexing ^^^^^^^^ -- +- Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - I/O diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bf1051332ee19..d9e4ef7db1158 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,6 +31,7 @@ is_dtype_equal, is_dtype_union_equal, is_object_dtype, + is_categorical, is_categorical_dtype, is_interval_dtype, is_period_dtype, @@ -3300,6 +3301,8 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = _ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..587090fa72def 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -598,7 +598,12 @@ def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): - target = target.categories + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return _ensure_platform_int(indexer), missing + target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) diff --git a/pandas/tests/categorical/test_indexing.py b/pandas/tests/categorical/test_indexing.py index 9c27b1101e5ca..cf7b5cfa55882 100644 --- a/pandas/tests/categorical/test_indexing.py +++ b/pandas/tests/categorical/test_indexing.py @@ -5,7 +5,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex from pandas.tests.categorical.common import TestCategorical @@ -103,3 +103,21 @@ def f(): s.categories = [1, 2] pytest.raises(ValueError, f) + + # Combinations of sorted/unique: + @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], + [1, 3, 3, 4], [1, 2, 2, 4]]) + # Combinations of missing/unique + @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) + @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) + def test_get_indexer_non_unique(self, idx_values, key_values, key_class): + # GH 21448 + key = key_class(key_values, categories=range(1, 5)) + # Test for flat index and CategoricalIndex with same/different cats: + for dtype in None, 'category', key.dtype: + idx = Index(idx_values, dtype=dtype) + expected, exp_miss = idx.get_indexer_non_unique(key_values) + result, res_miss = idx.get_indexer_non_unique(key) + + tm.assert_numpy_array_equal(expected, result) + tm.assert_numpy_array_equal(exp_miss, res_miss)