diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cb7314a26689f..21915da2c4402 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -191,11 +191,12 @@ def rank(values, axis=0, method='average', na_option='keep', """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) - ranks = f(values, ties_method=method, ascending=ascending) + ranks = f(values, ties_method=method, ascending=ascending, + na_option=na_option) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) ranks = f(values, axis=axis, ties_method=method, - ascending=ascending) + ascending=ascending, na_option=na_option) return ranks diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f270c6f6e546..902b3a19a6bfa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4704,8 +4704,10 @@ def rank(self, axis=0, numeric_only=None, method='average', min: lowest rank in group max: highest rank in group first: ranks assigned in order they appear in the array - na_option : {'keep'} + na_option : {'keep', 'top', 'bottom'} keep: leave NA values where they are + top: NA values get the smallest ranks + bottom: NA values get the largest ranks ascending : boolean, default True False for ranks by high (1) to low (N) @@ -4716,7 +4718,7 @@ def rank(self, axis=0, numeric_only=None, method='average', if numeric_only is None: try: ranks = algos.rank(self.values, axis=axis, method=method, - ascending=ascending) + ascending=ascending, na_option=na_option) return DataFrame(ranks, index=self.index, columns=self.columns) except TypeError: numeric_only = True @@ -4726,7 +4728,7 @@ def rank(self, axis=0, numeric_only=None, method='average', else: data = self ranks = algos.rank(data.values, axis=axis, method=method, - ascending=ascending) + ascending=ascending, na_option=na_option) return DataFrame(ranks, index=data.index, columns=data.columns) def to_timestamp(self, freq=None, how='start', axis=0, copy=True): diff --git
a/pandas/src/stats.pyx b/pandas/src/stats.pyx index f4d87f411a97e..0fc7d30713e79 100644 --- a/pandas/src/stats.pyx +++ b/pandas/src/stats.pyx @@ -70,7 +70,8 @@ cdef _take_2d_object(ndarray[object, ndim=2] values, return result -def rank_1d_float64(object in_arr, ties_method='average', ascending=True): +def rank_1d_float64(object in_arr, ties_method='average', ascending=True, + na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -86,7 +87,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True): values = np.asarray(in_arr).copy() - if ascending: + if ascending ^ (na_option == 'top'): nan_value = np.inf else: nan_value = -np.inf @@ -115,7 +116,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True): sum_ranks += i + 1 dups += 1 val = sorted_data[i] - if val == nan_value: + if (val == nan_value) and (na_option == 'keep'): ranks[argsorted[i]] = nan continue if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR: @@ -138,7 +139,8 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True): return ranks -def rank_1d_int64(object in_arr, ties_method='average', ascending=True): +def rank_1d_int64(object in_arr, ties_method='average', ascending=True, + na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -198,7 +200,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True): def rank_2d_float64(object in_arr, axis=0, ties_method='average', - ascending=True): + ascending=True, na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -219,7 +221,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', else: values = in_arr.copy() - if ascending: + if ascending ^ (na_option == 'top'): nan_value = np.inf else: nan_value = -np.inf @@ -249,7 +251,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', sum_ranks += j + 1 dups += 1 val = values[i, j] - if val == nan_value: + if val == nan_value 
and na_option == 'keep': ranks[i, argsorted[i, j]] = nan continue if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: @@ -277,7 +279,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', def rank_2d_int64(object in_arr, axis=0, ties_method='average', - ascending=True): + ascending=True, na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -345,7 +347,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', - ascending=True): + ascending=True, na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -365,7 +367,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', if values.dtype != np.object_: values = values.astype('O') - if ascending: + if ascending ^ (na_option == 'top'): # always greater than everything nan_value = Infinity() else: @@ -401,7 +403,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', sum_ranks += i + 1 dups += 1 val = util.get_value_at(sorted_data, i) - if val is nan_value: + if val is nan_value and na_option=='keep': ranks[argsorted[i]] = nan continue if (i == n - 1 or @@ -450,7 +452,7 @@ class NegInfinity(object): __cmp__ = _return_true def rank_2d_generic(object in_arr, axis=0, ties_method='average', - ascending=True): + ascending=True, na_option='keep'): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -475,7 +477,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', if values.dtype != np.object_: values = values.astype('O') - if ascending: + if ascending ^ (na_option == 'top'): # always greater than everything nan_value = Infinity() else: @@ -510,7 +512,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', dups = sum_ranks = infs = 0 for j in range(k): val = values[i, j] - if val is nan_value: + if val is nan_value and na_option == 'keep': ranks[i, argsorted[i, j]] = nan infs += 1 continue diff 
--git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index c989e8c981231..d0cdf07c4a2c2 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -6444,6 +6444,73 @@ def test_rank2(self):
         expected = self.mixed_frame.rank(1, numeric_only=True)
         assert_frame_equal(result, expected)
 
+    def test_rank_na_option(self):
+        from pandas.compat.scipy import rankdata
+
+        self.frame['A'][::2] = np.nan
+        self.frame['B'][::3] = np.nan
+        self.frame['C'][::4] = np.nan
+        self.frame['D'][::5] = np.nan
+        # each case: rank() on both axes vs. rankdata() on NaN-filled data
+        # bottom: NaNs rank last, like filling them with +inf
+        ranks0 = self.frame.rank(na_option='bottom')
+        ranks1 = self.frame.rank(1, na_option='bottom')
+
+        fvals = self.frame.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, fvals)
+        exp1 = np.apply_along_axis(rankdata, 1, fvals)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # top: NaNs rank first, like filling them below the minimum
+        ranks0 = self.frame.rank(na_option='top')
+        ranks1 = self.frame.rank(1, na_option='top')
+
+        fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+        fval1 = self.frame.T
+        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+        fval1 = fval1.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, fval0)
+        exp1 = np.apply_along_axis(rankdata, 1, fval1)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # descending
+
+        # top: NaNs rank first; +inf is the smallest once negated
+        ranks0 = self.frame.rank(na_option='top', ascending=False)
+        ranks1 = self.frame.rank(1, na_option='top', ascending=False)
+
+        fvals = self.frame.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, -fvals)
+        exp1 = np.apply_along_axis(rankdata, 1, -fvals)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+        # descending
+
+        # bottom: NaNs rank last; (min - 1) is the largest once negated
+        ranks0 = self.frame.rank(na_option='bottom', ascending=False)
+        ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
+
+        fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+        fval1 = self.frame.T
+        fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+        fval1 = fval1.fillna(np.inf).values
+
+        exp0 = np.apply_along_axis(rankdata, 0, -fval0)
+        exp1 = np.apply_along_axis(rankdata, 1, -fval1)
+
+        assert_almost_equal(ranks0.values, exp0)
+        assert_almost_equal(ranks1.values, exp1)
+
+
     def test_describe(self):
         desc = self.tsframe.describe()
         desc = self.mixed_frame.describe()