From c0dc3da232957cbc335dd746433e90c771d9f57b Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Tue, 25 Apr 2017 14:02:53 -0500 Subject: [PATCH 1/2] BUG: segfault in concat of cat-idx --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/indexes/category.py | 7 +++++-- pandas/tests/reshape/test_concat.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c07760a94d3f1..c9c22de9141fe 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1629,6 +1629,7 @@ Indexing - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) + - Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) I/O ^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 257ca86947f2b..5f38b19742f71 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,6 +12,7 @@ is_scalar) from pandas.core.common import _asarray_tuplesafe from pandas.core.dtypes.missing import array_equivalent +from pandas.core.algorithms import take_1d from pandas.util.decorators import Appender, cache_readonly @@ -470,8 +471,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): codes = target.codes else: if isinstance(target, CategoricalIndex): - target = target.categories - codes = self.categories.get_indexer(target) + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, 
target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index b877a9d181848..bd12f5f2c54b6 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1928,6 +1928,24 @@ def test_concat_multiindex_dfs_with_deepcopy(self): result_no_copy = pd.concat(example_dict, names=['testname']) tm.assert_frame_equal(result_no_copy, expected) + def test_concat_categoricalindex(self): + # GH 16111, categories that aren't lexsorted + categories = [9, 0, 1, 2, 3] + + a = pd.Series(1, index=pd.CategoricalIndex([9,0], categories=categories)) + b = pd.Series(2, index=pd.CategoricalIndex([0,1], categories=categories)) + c = pd.Series(3, index=pd.CategoricalIndex([1,2], categories=categories)) + + result = pd.concat([a, b, c], axis=1) + + exp_idx = pd.CategoricalIndex([0,1,2,9]) + exp = pd.DataFrame({0: [1, np.nan, np.nan, 1], + 1: [2, 2, np.nan, np.nan], + 2: [np.nan, 3, 3, np.nan]}, + columns=[0, 1, 2], + index=exp_idx) + tm.assert_frame_equal(result, exp) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) @pytest.mark.parametrize('dt', np.sctypes['float']) From 5b6be78c6662ce8831f3edfb0d7524193e1ec25d Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Tue, 25 Apr 2017 15:10:36 -0500 Subject: [PATCH 2/2] lint --- pandas/tests/reshape/test_concat.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index bd12f5f2c54b6..cc71cf6b1a4dc 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1932,13 +1932,16 @@ def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] - a = pd.Series(1, index=pd.CategoricalIndex([9,0], categories=categories)) - b = pd.Series(2, 
index=pd.CategoricalIndex([0,1], categories=categories)) - c = pd.Series(3, index=pd.CategoricalIndex([1,2], categories=categories)) + a = pd.Series(1, index=pd.CategoricalIndex([9, 0], + categories=categories)) + b = pd.Series(2, index=pd.CategoricalIndex([0, 1], + categories=categories)) + c = pd.Series(3, index=pd.CategoricalIndex([1, 2], + categories=categories)) result = pd.concat([a, b, c], axis=1) - exp_idx = pd.CategoricalIndex([0,1,2,9]) + exp_idx = pd.CategoricalIndex([0, 1, 2, 9]) exp = pd.DataFrame({0: [1, np.nan, np.nan, 1], 1: [2, 2, np.nan, np.nan], 2: [np.nan, 3, 3, np.nan]},