diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 09a39a6d9b2f5..bfc76b37510b9 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -132,3 +132,4 @@ Bug Fixes
 
 - Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
 - Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
+- Bug in ``Categorical`` ``Series.shift`` (:issue:`10495`)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 42d7163e7f741..cac00890bb7d8 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -885,7 +885,13 @@ def shift(self, periods, axis=0):
         """ shift the block by periods, possibly upcast """
         # convert integer to float if necessary. need to do a lot more than
         # that, handle boolean etc also
-        new_values, fill_value = com._maybe_upcast(self.values)
+        if isinstance(self.values, Categorical):
+            # GH10495: shift the integer codes, not the Categorical itself
+            values = self.values._codes
+        else:
+            values = self.values
+        new_values, fill_value = com._maybe_upcast(values)
+
         # make sure array sent to np.roll is c_contiguous
         f_ordered = new_values.flags.f_contiguous
         if f_ordered:
@@ -906,6 +912,14 @@ def shift(self, periods, axis=0):
         if f_ordered:
             new_values = new_values.T
 
+        if isinstance(self.values, Categorical):
+            # GH10495: slots shifted in are NaN after the upcast, while a
+            # Categorical marks missing values with code -1
+            new_values[np.isnan(new_values)] = -1
+            new_values = Categorical.from_codes(
+                new_values, categories=self.values.categories,
+                ordered=self.values.ordered)
+
         return [make_block(new_values,
                            ndim=self.ndim, fastpath=True,
                            placement=self.mgr_locs)]
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 2c1a4fd43e57f..f5583b2cb587b 100755
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -1175,6 +1175,22 @@ def test_comparison_with_unknown_scalars(self):
         self.assert_numpy_array_equal(cat == 4 , [False, False, False])
         self.assert_numpy_array_equal(cat != 4 , [True, True, True])
 
+    def test_shift(self):
+        # GH10495
+        # Series.shift should not depend on the dtype being categorical or not
+        values = ['a', 'b', 'c']
+        shifts = [-1, 0, 1]
+        results = [['b', 'c', np.nan], ['a', 'b', 'c'], [np.nan, 'a', 'b']]
+
+        for shift, result in zip(shifts, results):
+            b = pd.Series(pd.Categorical(result, categories=values))
+            a = pd.Series(values, dtype='category').shift(shift)
+            self.assert_series_equal(a, b)
+
+        # shifting must not drop the ordered flag
+        cat = pd.Categorical(values, categories=values, ordered=True)
+        self.assertTrue(pd.Series(cat).shift(1).cat.ordered)
+
 
 class TestCategoricalAsBlock(tm.TestCase):
     _multiprocess_can_split_ = True