
Commit 9f90bd4

Authored Sep 14, 2021
ENH: Rolling rank (#43338)
1 parent 323595a commit 9f90bd4

File tree

13 files changed (+427, -11 lines)
 

asv_bench/benchmarks/rolling.py

Lines changed: 27 additions & 0 deletions
@@ -180,6 +180,33 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
         self.roll.quantile(percentile, interpolation=interpolation)


+class Rank:
+    params = (
+        ["DataFrame", "Series"],
+        [10, 1000],
+        ["int", "float"],
+        [True, False],
+        [True, False],
+        ["min", "max", "average"],
+    )
+    param_names = [
+        "constructor",
+        "window",
+        "dtype",
+        "percentile",
+        "ascending",
+        "method",
+    ]
+
+    def setup(self, constructor, window, dtype, percentile, ascending, method):
+        N = 10 ** 5
+        arr = np.random.random(N).astype(dtype)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)
+
+    def time_rank(self, constructor, window, dtype, percentile, ascending, method):
+        self.roll.rank(pct=percentile, ascending=ascending, method=method)
+
+
 class PeakMemFixedWindowMinMax:

     params = ["min", "max"]
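For orientation (not part of the commit), the new Rank benchmark boils down to timing rolling rank calls on a 10**5-element random array for each parameter combination; a rough standalone sketch of one combination, assuming pandas 1.4+:

    import timeit

    import numpy as np
    import pandas as pd

    # One parameter combination from the Rank benchmark class above.
    arr = np.random.random(10 ** 5).astype("float")
    roll = pd.Series(arr).rolling(1000)

    elapsed = timeit.timeit(
        lambda: roll.rank(pct=False, ascending=True, method="average"), number=5
    )
    print(f"{elapsed / 5:.3f} s per call")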

doc/source/reference/window.rst

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,7 @@ Rolling window functions
    Rolling.aggregate
    Rolling.quantile
    Rolling.sem
+   Rolling.rank

 .. _api.functions_window:

@@ -75,6 +76,7 @@ Expanding window functions
    Expanding.aggregate
    Expanding.quantile
    Expanding.sem
+   Expanding.rank

 .. _api.functions_ewm:

doc/source/whatsnew/v1.4.0.rst

Lines changed: 15 additions & 0 deletions
@@ -94,6 +94,21 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow
 :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines
 with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)

+.. _whatsnew_140.enhancements.window_rank:
+
+Rank function for rolling and expanding windows
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Added ``rank`` function to :class:`Rolling` and :class:`Expanding`. The new function supports the ``method``, ``ascending``, and ``pct`` flags of :meth:`DataFrame.rank`. The ``method`` argument supports ``min``, ``max``, and ``average`` ranking methods.
+Example:
+
+.. ipython:: python
+
+    s = pd.Series([1, 4, 2, 3, 5, 3])
+    s.rolling(3).rank()
+
+    s.rolling(3).rank(method="max")
+
 .. _whatsnew_140.enhancements.other:

 Other enhancements
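The whatsnew example above only exercises ``method``; the ``ascending`` and ``pct`` flags it mentions work the same way. A minimal sketch (not part of the diff), reusing the same series and assuming pandas 1.4+:

    import pandas as pd

    s = pd.Series([1, 4, 2, 3, 5, 3])

    # Rank of each window's last value divided by the window's non-NaN count,
    # so results fall in (0, 1].
    s.rolling(3).rank(pct=True)

    # Rank within each window in descending order; ties still default to "average".
    s.rolling(3).rank(ascending=False)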

pandas/_libs/algos.pxd

Lines changed: 8 additions & 0 deletions
@@ -2,3 +2,11 @@ from pandas._libs.util cimport numeric


 cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil
+
+cdef enum TiebreakEnumType:
+    TIEBREAK_AVERAGE
+    TIEBREAK_MIN,
+    TIEBREAK_MAX
+    TIEBREAK_FIRST
+    TIEBREAK_FIRST_DESCENDING
+    TIEBREAK_DENSE

pandas/_libs/algos.pyx

Lines changed: 0 additions & 7 deletions
@@ -66,13 +66,6 @@ cdef:
     float64_t NaN = <float64_t>np.NaN
     int64_t NPY_NAT = get_nat()

-cdef enum TiebreakEnumType:
-    TIEBREAK_AVERAGE
-    TIEBREAK_MIN,
-    TIEBREAK_MAX
-    TIEBREAK_FIRST
-    TIEBREAK_FIRST_DESCENDING
-    TIEBREAK_DENSE

 tiebreakers = {
     "average": TIEBREAK_AVERAGE,

pandas/_libs/src/skiplist.h

Lines changed: 23 additions & 2 deletions
@@ -180,10 +180,30 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
     return node->value;
 }

+// Returns the lowest rank of all elements with value `value`, as opposed to the
+// highest rank returned by `skiplist_insert`.
+PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
+    node_t *node;
+    int level, rank = 0;
+
+    node = skp->head;
+    for (level = skp->maxlevels - 1; level >= 0; --level) {
+        while (_node_cmp(node->next[level], value) > 0) {
+            rank += node->width[level];
+            node = node->next[level];
+        }
+    }
+
+    return rank + 1;
+}
+
+// Returns the rank of the inserted element. When there are duplicates,
+// `rank` is the highest of the group, i.e. the 'max' method of
+// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
 PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
     node_t *node, *prevnode, *newnode, *next_at_level;
     int *steps_at_level;
-    int size, steps, level;
+    int size, steps, level, rank = 0;
     node_t **chain;

     chain = skp->tmp_chain;
@@ -197,6 +217,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
         next_at_level = node->next[level];
         while (_node_cmp(next_at_level, value) >= 0) {
             steps_at_level[level] += node->width[level];
+            rank += node->width[level];
             node = next_at_level;
             next_at_level = node->next[level];
         }
@@ -230,7 +251,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {

     ++(skp->size);

-    return 1;
+    return rank + 1;
 }

 PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
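To picture the two rank queries the comments above describe (a sketch, not part of the commit): in a sorted snapshot of a window, the "min" rank of a value is one more than the count of strictly smaller elements, while the rank returned by skiplist_insert counts elements less than or equal to it, i.e. the "max" rank.

    import bisect

    window = sorted([3.0, 1.0, 3.0, 2.0])  # [1.0, 2.0, 3.0, 3.0]
    val = 3.0

    min_rank = bisect.bisect_left(window, val) + 1  # 3, analogous to skiplist_min_rank
    max_rank = bisect.bisect_right(window, val)     # 4, analogous to skiplist_insert's return
    print(min_rank, max_rank)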

pandas/_libs/window/aggregations.pyi

Lines changed: 11 additions & 0 deletions
@@ -6,6 +6,8 @@ from typing import (

 import numpy as np

+from pandas._typing import WindowingRankType
+
 def roll_sum(
     values: np.ndarray,  # const float64_t[:]
     start: np.ndarray,  # np.ndarray[np.int64]
@@ -63,6 +65,15 @@ def roll_quantile(
     quantile: float,  # float64_t
     interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
 ) -> np.ndarray: ...  # np.ndarray[float]
+def roll_rank(
+    values: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    minp: int,
+    percentile: bool,
+    method: WindowingRankType,
+    ascending: bool,
+) -> np.ndarray: ...  # np.ndarray[float]
 def roll_apply(
     obj: object,
     start: np.ndarray,  # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

Lines changed: 122 additions & 2 deletions
@@ -5,6 +5,8 @@ import cython
 from libc.math cimport round
 from libcpp.deque cimport deque

+from pandas._libs.algos cimport TiebreakEnumType
+
 import numpy as np

 cimport numpy as cnp
@@ -50,6 +52,8 @@ cdef extern from "../src/skiplist.h":
     double skiplist_get(skiplist_t*, int, int*) nogil
     int skiplist_insert(skiplist_t*, double) nogil
     int skiplist_remove(skiplist_t*, double) nogil
+    int skiplist_rank(skiplist_t*, double) nogil
+    int skiplist_min_rank(skiplist_t*, double) nogil

 cdef:
     float32_t MINfloat32 = np.NINF
@@ -795,7 +799,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
                 val = values[j]
                 if notnan(val):
                     nobs += 1
-                    err = skiplist_insert(sl, val) != 1
+                    err = skiplist_insert(sl, val) == -1
                     if err:
                         break

@@ -806,7 +810,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
                 val = values[j]
                 if notnan(val):
                     nobs += 1
-                    err = skiplist_insert(sl, val) != 1
+                    err = skiplist_insert(sl, val) == -1
                     if err:
                         break

@@ -1139,6 +1143,122 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
     return output


+rolling_rank_tiebreakers = {
+    "average": TiebreakEnumType.TIEBREAK_AVERAGE,
+    "min": TiebreakEnumType.TIEBREAK_MIN,
+    "max": TiebreakEnumType.TIEBREAK_MAX,
+}
+
+
+def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
+              ndarray[int64_t] end, int64_t minp, bint percentile,
+              str method, bint ascending) -> np.ndarray:
+    """
+    O(N log(window)) implementation using skip list
+
+    derived from roll_quantile
+    """
+    cdef:
+        Py_ssize_t i, j, s, e, N = len(values), idx
+        float64_t rank_min = 0, rank = 0
+        int64_t nobs = 0, win
+        float64_t val
+        skiplist_t *skiplist
+        float64_t[::1] output
+        TiebreakEnumType rank_type
+
+    try:
+        rank_type = rolling_rank_tiebreakers[method]
+    except KeyError:
+        raise ValueError(f"Method '{method}' is not supported")
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
+        start, end
+    )
+    # we use the Fixed/Variable Indexer here as the
+    # actual skiplist ops outweigh any window computation costs
+    output = np.empty(N, dtype=np.float64)
+
+    win = (end - start).max()
+    if win == 0:
+        output[:] = NaN
+        return np.asarray(output)
+    skiplist = skiplist_init(<int>win)
+    if skiplist == NULL:
+        raise MemoryError("skiplist_init failed")
+
+    with nogil:
+        for i in range(N):
+            s = start[i]
+            e = end[i]
+
+            if i == 0 or not is_monotonic_increasing_bounds:
+                if not is_monotonic_increasing_bounds:
+                    nobs = 0
+                    skiplist_destroy(skiplist)
+                    skiplist = skiplist_init(<int>win)
+
+                # setup
+                for j in range(s, e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)
+                        if rank == -1:
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
+                            # The average rank of `val` is the sum of the ranks of all
+                            # instances of `val` in the skip list divided by the number
+                            # of instances. The sum of consecutive integers from 1 to N
+                            # is N * (N + 1) / 2.
+                            # The sum of the ranks is the sum of integers from the
+                            # lowest rank to the highest rank, which is the sum of
+                            # integers from 1 to the highest rank minus the sum of
+                            # integers from 1 to one less than the lowest rank.
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                     - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                        else:
+                            rank = NaN
+
+            else:
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        skiplist_remove(skiplist, val)
+                        nobs -= 1
+
+                # calculate adds
+                for j in range(end[i - 1], e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)
+                        if rank == -1:
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                     - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                        else:
+                            rank = NaN
+            if nobs >= minp:
+                output[i] = rank / nobs if percentile else rank
+            else:
+                output[i] = NaN
+
+    skiplist_destroy(skiplist)
+
+    return np.asarray(output)
+
+
 def roll_apply(object obj,
                ndarray[int64_t] start, ndarray[int64_t] end,
                int64_t minp,
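The average-rank arithmetic in the comments above can be sanity-checked in plain Python (a sketch, not part of the diff): for a tied group occupying ranks rank_min through rank_max, the sum of those ranks is rank_max*(rank_max+1)/2 - (rank_min-1)*rank_min/2, and dividing by the group size gives the average rank.

    # Hypothetical tied group occupying ranks 3, 4 and 5 within a window.
    rank_min, rank_max = 3, 5

    total = rank_max * (rank_max + 1) / 2 - (rank_min - 1) * rank_min / 2  # 3 + 4 + 5 = 12
    average = total / (rank_max - rank_min + 1)                            # 12 / 3 = 4.0

    assert average == sum(range(rank_min, rank_max + 1)) / (rank_max - rank_min + 1)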

pandas/_typing.py

Lines changed: 3 additions & 0 deletions
@@ -219,3 +219,6 @@
 PositionalIndexer = Union[ScalarIndexer, SequenceIndexer]
 PositionalIndexerTuple = Tuple[PositionalIndexer, PositionalIndexer]
 PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple]
+
+# Windowing rank methods
+WindowingRankType = Literal["average", "min", "max"]
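For context (not part of the diff), a ``Literal`` alias like this has no runtime effect; it lets a static type checker such as mypy flag method strings outside the allowed set. A minimal sketch:

    from typing import Literal

    WindowingRankType = Literal["average", "min", "max"]

    def check(method: WindowingRankType) -> str:
        return method

    check("min")    # accepted by a type checker
    check("dense")  # runs fine, but a type checker reports an invalid literal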

pandas/core/window/expanding.py

Lines changed: 76 additions & 0 deletions
@@ -10,6 +10,7 @@
 from pandas._typing import (
     Axis,
     FrameOrSeries,
+    WindowingRankType,
 )

 if TYPE_CHECKING:
@@ -564,6 +565,81 @@ def quantile(
             **kwargs,
         )

+    @doc(
+        template_header,
+        ".. versionadded:: 1.4.0 \n\n",
+        create_section_header("Parameters"),
+        dedent(
+            """
+        method : {{'average', 'min', 'max'}}, default 'average'
+            How to rank the group of records that have the same value (i.e. ties):
+
+            * average: average rank of the group
+            * min: lowest rank in the group
+            * max: highest rank in the group
+
+        ascending : bool, default True
+            Whether or not the elements should be ranked in ascending order.
+        pct : bool, default False
+            Whether or not to display the returned rankings in percentile
+            form.
+        """
+        ).replace("\n", "", 1),
+        kwargs_compat,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("See Also"),
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """
+        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+        >>> s.expanding().rank()
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    3.5
+        dtype: float64
+
+        >>> s.expanding().rank(method="max")
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    4.0
+        dtype: float64
+
+        >>> s.expanding().rank(method="min")
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    3.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="expanding",
+        aggregation_description="rank",
+        agg_method="rank",
+    )
+    def rank(
+        self,
+        method: WindowingRankType = "average",
+        ascending: bool = True,
+        pct: bool = False,
+        **kwargs,
+    ):
+        return super().rank(
+            method=method,
+            ascending=ascending,
+            pct=pct,
+            **kwargs,
+        )
+
     @doc(
         template_header,
         create_section_header("Parameters"),

pandas/core/window/rolling.py

Lines changed: 92 additions & 0 deletions
@@ -28,6 +28,7 @@
     ArrayLike,
     Axis,
     FrameOrSeries,
+    WindowingRankType,
 )
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
@@ -1410,6 +1411,22 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):

         return self._apply(window_func, name="quantile", **kwargs)

+    def rank(
+        self,
+        method: WindowingRankType = "average",
+        ascending: bool = True,
+        pct: bool = False,
+        **kwargs,
+    ):
+        window_func = partial(
+            window_aggregations.roll_rank,
+            method=method,
+            ascending=ascending,
+            percentile=pct,
+        )
+
+        return self._apply(window_func, name="rank", **kwargs)
+
     def cov(
         self,
         other: DataFrame | Series | None = None,
@@ -2161,6 +2178,81 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
             **kwargs,
         )

+    @doc(
+        template_header,
+        ".. versionadded:: 1.4.0 \n\n",
+        create_section_header("Parameters"),
+        dedent(
+            """
+        method : {{'average', 'min', 'max'}}, default 'average'
+            How to rank the group of records that have the same value (i.e. ties):
+
+            * average: average rank of the group
+            * min: lowest rank in the group
+            * max: highest rank in the group
+
+        ascending : bool, default True
+            Whether or not the elements should be ranked in ascending order.
+        pct : bool, default False
+            Whether or not to display the returned rankings in percentile
+            form.
+        """
+        ).replace("\n", "", 1),
+        kwargs_compat,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("See Also"),
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """
+        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+        >>> s.rolling(3).rank()
+        0    NaN
+        1    NaN
+        2    2.0
+        3    2.0
+        4    3.0
+        5    1.5
+        dtype: float64
+
+        >>> s.rolling(3).rank(method="max")
+        0    NaN
+        1    NaN
+        2    2.0
+        3    2.0
+        4    3.0
+        5    2.0
+        dtype: float64
+
+        >>> s.rolling(3).rank(method="min")
+        0    NaN
+        1    NaN
+        2    2.0
+        3    2.0
+        4    3.0
+        5    1.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="rolling",
+        aggregation_description="rank",
+        agg_method="rank",
+    )
+    def rank(
+        self,
+        method: WindowingRankType = "average",
+        ascending: bool = True,
+        pct: bool = False,
+        **kwargs,
+    ):
+        return super().rank(
+            method=method,
+            ascending=ascending,
+            pct=pct,
+            **kwargs,
+        )
+
     @doc(
         template_header,
         create_section_header("Parameters"),

pandas/tests/window/test_expanding.py

Lines changed: 24 additions & 0 deletions
@@ -264,3 +264,27 @@ def test_expanding_skew_kurt_numerical_stability(method):
     s = s + 5000
     result = getattr(s.expanding(3), method)()
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("method", ["min", "max", "average"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
+def test_rank(window, method, pct, ascending, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.rand(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
+        )
+
+    expected = ser.expanding(window).apply(
+        lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
+    )
+    result = ser.expanding(window).rank(method=method, pct=pct, ascending=ascending)
+
+    tm.assert_series_equal(result, expected)

pandas/tests/window/test_rolling.py

Lines changed: 24 additions & 0 deletions
@@ -1500,3 +1500,27 @@ def test_rolling_numeric_dtypes():
         dtype="float64",
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("method", ["min", "max", "average"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
+def test_rank(window, method, pct, ascending, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.rand(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
+        )
+
+    expected = ser.rolling(window).apply(
+        lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
+    )
+    result = ser.rolling(window).rank(method=method, pct=pct, ascending=ascending)
+
+    tm.assert_series_equal(result, expected)
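Both test modules pin the new rank implementations to the brute-force definition: apply Series.rank inside each window and keep the rank of the window's last element. A quick interactive check of that equivalence (a sketch, assuming pandas 1.4+):

    import numpy as np
    import pandas as pd

    ser = pd.Series(np.random.default_rng(0).random(20))

    expected = ser.rolling(5).apply(lambda x: x.rank(method="average").iloc[-1])
    result = ser.rolling(5).rank(method="average")

    pd.testing.assert_series_equal(result, expected)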

0 commit comments