diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 6179857978b7b..f529af6543939 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -76,7 +76,7 @@
     'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
     'resample',
     'describe',
-    'rank', 'quantile',
+    'quantile',
     'fillna',
     'mad',
     'any', 'all',
@@ -1378,6 +1378,59 @@ def cumsum(self, axis=0, *args, **kwargs):
 
         return self._cython_transform('cumsum')
 
+    @Substitution(name='groupby')
+    @Appender(_doc_template)
+    def rank(self, axis=0, method='average', numeric_only=True,
+             na_option='keep', ascending=True, pct=False):
+        """Compute numerical data ranks (1 through n) along axis.
+
+        Each group is ranked separately by calling ``rank`` on it.
+
+        Parameters
+        ----------
+        axis : default 0
+            Forwarded to the ``rank`` call made on each group.
+        method : default 'average'
+            Forwarded to the ``rank`` call made on each group.
+        numeric_only : boolean, default True
+            Only consulted when ranking all columns raises ValueError.
+            If True, ranking is retried on the numeric columns alone;
+            if False, the error is propagated.
+        na_option : default 'keep'
+            Forwarded to the ``rank`` call made on each group.
+        ascending : default True
+            Forwarded to the ``rank`` call made on each group.
+        pct : default False
+            Forwarded to the ``rank`` call made on each group.
+
+        Raises
+        ------
+        DataError
+            If the numeric-only retry finds no numeric data to rank.
+        ValueError
+            If ranking fails and ``numeric_only`` is False, or if the
+            numeric-only retry fails as well.
+        """
+
+        def wrapper(values):
+            return values.rank(axis=axis, method=method, na_option=na_option,
+                               ascending=ascending, pct=pct)
+
+        try:
+            return self.transform(wrapper)
+        except ValueError:
+            if not numeric_only and method == 'first':
+                # such a ValueError is raised by pandas.algos.rank_2d_generic
+                # for regular (non-grouped) dataframes
+                raise ValueError('first not supported for non-numeric data')
+            if numeric_only:
+                data = self._obj_with_exclusions._get_numeric_data()
+                if data.size == 0:
+                    raise DataError('No numeric types to aggregate')
+                data = data.groupby(self.grouper)
+                return data.transform(wrapper)
+            raise
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def shift(self, periods=1, freq=None, axis=0):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 3f5b4152afe31..ba0b343aa49ce 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -3646,6 +3646,92 @@ def test_column_select_via_attr(self):
         expected = self.df.groupby('A').agg(np.mean)
         assert_frame_equal(result, expected)
 
+    def test_rank(self):
+        # normal behavior
+        df = DataFrame({'a': ['A1', 'A1', 'A1'],
+                        'b': [2, 1, 1],
+                        'c': 1.})
+        df = df.set_index('a')
+        dg = df.groupby('c')
+        expected = DataFrame({'a': ['A1', 'A1', 'A1'],
+                              'b': [3., 1., 2.]})
+        expected = expected.set_index('a')
+
+        result = dg.rank(method='first')
+        assert_frame_equal(result, expected)
+
+        result = dg.rank(method='first', numeric_only=True)
+        assert_frame_equal(result, expected)
+
+        result = dg.rank(method='first', numeric_only=False)
+        assert_frame_equal(result, expected)
+
+        # GH 11759: non numeric data
+        df = DataFrame({'a': ['A1', 'A1', 'A1'],
+                        'b': ['B2', 'B1', 'B1'],
+                        'c': 1.})
+        df = df.set_index('a')
+        dg = df.groupby('c')
+        self.assertRaises(DataError, dg.rank,
+                          method='first')
+        self.assertRaises(DataError, dg.rank,
+                          method='first', numeric_only=True)
+        self.assertRaises(ValueError, dg.rank,
+                          method='first', numeric_only=False)
+        # such a ValueError is raised by pandas.algos.rank_2d_generic
+        # for regular (non-grouped) dataframes
+
+        # with categorical data
+        df = DataFrame({'a': ['A1', 'A1', 'A1'],
+                        'b': Categorical(['big', 'small', 'small'],
+                                         categories=['small', 'big'],
+                                         ordered=True),
+                        'c': 1.})
+        df = df.set_index('a')
+        dg = df.groupby('c')
+        self.assertRaises(DataError, dg.rank,
+                          method='first')
+        self.assertRaises(DataError, dg.rank,
+                          method='first', numeric_only=True)
+        self.assertRaises(ValueError, dg.rank,
+                          method='first', numeric_only=False)
+
+        # with datetime data
+        df = DataFrame({'a': ['A1', 'A1', 'A1'],
+                        'b': [datetime(2002, 2, 2), datetime(2001, 1, 1),
+                              datetime(2001, 1, 1)],
+                        'c': 1.})
+        df = df.set_index('a')
+        dg = df.groupby('c')
+
+        result = dg.rank(method='first')
+        assert_frame_equal(result, expected)
+
+        result = dg.rank(method='first', numeric_only=True)
+        assert_frame_equal(result, expected)
+
+        result = dg.rank(method='first', numeric_only=False)
+        assert_frame_equal(result, expected)
+
+        # with another numeric column
+        df = DataFrame({'a': ['A1', 'A1', 'A1'],
+                        'b': ['B2', 'B1', 'B1'],
+                        'c': 1.,
+                        'd': 1.})
+        df = df.set_index('a')
+        dg = df.groupby('c')
+        expected = df.drop('b', axis=1).groupby('c').rank(method='first')
+
+        result = dg.rank(method='first')
+        assert_frame_equal(result, expected)
+
+        result = dg.rank(method='first', numeric_only=True)
+        assert_frame_equal(result, expected)
+
+        self.assertRaises(ValueError, dg.rank,
+                          method='first', numeric_only=False)
+        # same remark as above
+
     def test_rank_apply(self):
         lev1 = tm.rands_array(10, 100)
         lev2 = tm.rands_array(10, 130)
@@ -5753,7 +5839,6 @@ def test_groupby_whitelist(self):
             'cumcount',
             'resample',
             'describe',
-            'rank',
             'quantile',
             'fillna',
             'mad',
@@ -5794,7 +5879,6 @@ def test_groupby_whitelist(self):
             'cumcount',
             'resample',
             'describe',
-            'rank',
             'quantile',
             'fillna',
             'mad',