def _values_for_rank(self):
    """
    Return the array that ranking should be computed on. See GH#15420

    When the categorical is ordered, the integer codes are used, with
    the code -1 replaced by NaN (which requires a float64 copy).  When
    no code equals -1 the codes are returned as they are.  When the
    categorical is not ordered, the result is ``np.array(self)``.

    Returns
    -------
    numpy array

    """
    # Unordered: hand back the plain array conversion.
    if not self.ordered:
        return np.array(self)

    codes = self.codes
    missing = codes == -1

    # No -1 codes: the codes can be used directly.
    if not missing.any():
        return codes

    # Integer codes cannot hold NaN, so switch to float64 first.
    as_float = codes.astype('float64')
    as_float[missing] = np.nan
    return as_float
That function diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 222165e9d3633..b092e4f084767 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1057,6 +1057,84 @@ def test_rank(self): iranks = iseries.rank() assert_series_equal(iranks, exp) + def test_rank_categorical(self): + # GH issue #15420 rank incorrectly orders ordered categories + + # Test ascending/descending ranking for ordered categoricals + exp = pd.Series([1., 2., 3., 4., 5., 6.]) + exp_desc = pd.Series([6., 5., 4., 3., 2., 1.]) + ordered = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] + ).astype('category', ).cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=True + ) + assert_series_equal(ordered.rank(), exp) + assert_series_equal(ordered.rank(ascending=False), exp_desc) + + # Unordered categoricals should be ranked as objects + unordered = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ).astype('category').cat.set_categories( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'], + ordered=False + ) + exp_unordered = pd.Series([2., 4., 6., 3., 1., 5.]) + res = unordered.rank() + assert_series_equal(res, exp_unordered) + + # Test na_option for rank data + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] + ).astype('category', ).cat.set_categories( + [ + 'first', 'second', 'third', 'fourth', + 'fifth', 'sixth', 'seventh' + ], + ordered=True + ) + + exp_top = pd.Series([2., 3., 4., 5., 6., 7., 1.]) + exp_bot = pd.Series([1., 2., 3., 4., 5., 6., 7.]) + exp_keep = pd.Series([1., 2., 3., 4., 5., 6., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top'), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + + # Test na_option for rank data with ascending False + exp_top = pd.Series([7., 6., 
5., 4., 3., 2., 1.]) + exp_bot = pd.Series([6., 5., 4., 3., 2., 1., 7.]) + exp_keep = pd.Series([6., 5., 4., 3., 2., 1., np.NaN]) + + assert_series_equal( + na_ser.rank(na_option='top', ascending=False), + exp_top + ) + assert_series_equal( + na_ser.rank(na_option='bottom', ascending=False), + exp_bot + ) + assert_series_equal( + na_ser.rank(na_option='keep', ascending=False), + exp_keep + ) + + # Test with pct=True + na_ser = pd.Series( + ['first', 'second', 'third', 'fourth', np.NaN], + ).astype('category').cat.set_categories( + ['first', 'second', 'third', 'fourth'], + ordered=True + ) + exp_top = pd.Series([0.4, 0.6, 0.8, 1., 0.2]) + exp_bot = pd.Series([0.2, 0.4, 0.6, 0.8, 1.]) + exp_keep = pd.Series([0.25, 0.5, 0.75, 1., np.NaN]) + + assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + def test_rank_signature(self): s = Series([0, 1]) s.rank(method='average')