Skip to content

Commit 574a199

Browse files
committed
This commit is a first step toward supporting step sizes in rolling windows (pandas-dev#15354)
and hints at how iterating over windows could be handled (pandas-dev#11704)
1 parent 9a741d3 commit 574a199

File tree

5 files changed

+170
-76
lines changed

5 files changed

+170
-76
lines changed

pandas/_libs/window/aggregations.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
180180
cdef:
181181
float64_t sum_x = 0
182182
int64_t s, e
183-
int64_t nobs = 0, i, j, N = len(values)
183+
int64_t nobs = 0, i, j, N = len(start)
184184
ndarray[float64_t] output
185185
bint is_monotonic_bounds
186186

pandas/_libs/window/indexers.pyx

+77-48
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ from numpy cimport ndarray, int64_t
99
def calculate_variable_window_bounds(
1010
int64_t num_values,
1111
int64_t window_size,
12-
object min_periods, # unused but here to match get_window_bounds signature
12+
object step_size_obj,
13+
object min_periods_obj,
1314
object center, # unused but here to match get_window_bounds signature
1415
object closed,
1516
const int64_t[:] index
@@ -25,8 +26,11 @@ def calculate_variable_window_bounds(
2526
window_size : int64
2627
window size calculated from the offset
2728
29+
step_size : Optional[int], default None
30+
the window step size
31+
2832
min_periods : object
29-
ignored, exists for compatibility
33+
Minimum data points in each window.
3034
3135
center : object
3236
ignored, exists for compatibility
@@ -42,68 +46,93 @@ def calculate_variable_window_bounds(
4246
(ndarray[int64], ndarray[int64])
4347
"""
4448
cdef:
45-
bint left_closed = False
46-
bint right_closed = False
47-
int index_growth_sign = 1
49+
bint left_open = False
50+
bint right_open = False
51+
int idx_scalar = 1
4852
ndarray[int64_t, ndim=1] start, end
49-
int64_t start_bound, end_bound
53+
int64_t step_size, min_periods
54+
int64_t index_i, index_si, index_ei,
55+
int64_t index_window_i, index_step_i
56+
int64_t index_window_max, index_step_max
57+
int64_t window_i = 0
58+
int64_t next_index_si = 0
59+
int64_t next_index_ei = 0
5060
Py_ssize_t i, j
5161

52-
# if windows is variable, default is 'right', otherwise default is 'both'
5362
if closed is None:
54-
closed = 'right' if index is not None else 'both'
63+
closed = 'left'
5564

56-
if closed in ['right', 'both']:
57-
right_closed = True
65+
if closed not in ['right', 'both']:
66+
right_open = True
5867

59-
if closed in ['left', 'both']:
60-
left_closed = True
68+
if closed not in ['left', 'both']:
69+
left_open = True
6170

71+
# Assume index is monotonically increasing or decreasing. If decreasing (WHY??), negate values.
6272
if index[num_values - 1] < index[0]:
63-
index_growth_sign = -1
73+
idx_scalar = -1
74+
75+
# Minimum "observations".
76+
min_periods = min_periods_obj if min_periods_obj is not None else 0
77+
step_size = step_size_obj if step_size_obj is not None else 1
6478

6579
start = np.empty(num_values, dtype='int64')
6680
start.fill(-1)
6781
end = np.empty(num_values, dtype='int64')
68-
end.fill(-1)
6982

70-
start[0] = 0
83+
if num_values < 1:
84+
return start, end
85+
86+
# Indexing into indices: index_si index_ei (index start/end)
87+
# Indexing into start/end arrays: window_i
88+
# This will find closed intervals [start, end]
7189

72-
# right endpoint is closed
73-
if right_closed:
74-
end[0] = 1
75-
# right endpoint is open
76-
else:
77-
end[0] = 0
90+
window_i = 0
91+
next_index_si = 0
92+
next_index_ei = 0
7893

7994
with nogil:
95+
while next_index_ei < num_values:
96+
index_si = next_index_si
8097

81-
# start is start of slice interval (including)
82-
# end is end of slice interval (not including)
83-
for i in range(1, num_values):
84-
end_bound = index[i]
85-
start_bound = index[i] - index_growth_sign * window_size
86-
87-
# left endpoint is closed
88-
if left_closed:
89-
start_bound -= 1
90-
91-
# advance the start bound until we are
92-
# within the constraint
93-
start[i] = i
94-
for j in range(start[i - 1], i):
95-
if (index[j] - start_bound) * index_growth_sign > 0:
96-
start[i] = j
98+
start[window_i] = index_si
99+
100+
index_window_max = index[index_si] + idx_scalar*(window_size - 1)
101+
index_step_max = index[index_si] + idx_scalar*(step_size - 1)
102+
103+
# Find end of step.
104+
index_step_i = num_values - 1
105+
for index_i in range(index_si + 1, num_values):
106+
# Outside of step?
107+
if idx_scalar*index[index_i] > idx_scalar*index_step_max:
108+
index_step_i = index_i - 1
97109
break
98110

99-
# end bound is previous end
100-
# or current index
101-
if (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
102-
end[i] = i + 1
103-
else:
104-
end[i] = end[i - 1]
105-
106-
# right endpoint is open
107-
if not right_closed:
108-
end[i] -= 1
109-
return start, end
111+
# Find end of window.
112+
index_window_i = num_values - 1
113+
for index_i in range(next_index_ei + 1, num_values):
114+
# Outside of window?
115+
if idx_scalar*index[index_i] > idx_scalar*index_window_max:
116+
index_window_i = index_i - 1
117+
break
118+
119+
next_index_si = index_step_i + 1
120+
next_index_ei = next_index_si if next_index_si > index_window_i + 1 else index_window_i + 1
121+
122+
end[window_i] = index_window_i
123+
window_i += 1
124+
125+
# Remove excess slots.
126+
valid_idx = (start >= 0) & (start <= end)
127+
128+
# And windows without enough data.
129+
if min_periods is not None:
130+
valid_idx &= (end - start + 1) >= min_periods
131+
132+
# Update open boundaries.
133+
if left_open:
134+
start -= 1
135+
if right_open:
136+
end += 1
137+
138+
return start[valid_idx], end[valid_idx]

pandas/core/generic.py

+3
Original file line numberDiff line numberDiff line change
@@ -10396,6 +10396,7 @@ def rolling(
1039610396
on=None,
1039710397
axis=0,
1039810398
closed=None,
10399+
step=None,
1039910400
):
1040010401
axis = self._get_axis_number(axis)
1040110402

@@ -10409,6 +10410,7 @@ def rolling(
1040910410
on=on,
1041010411
axis=axis,
1041110412
closed=closed,
10413+
step=step,
1041210414
)
1041310415

1041410416
return Rolling(
@@ -10420,6 +10422,7 @@ def rolling(
1042010422
on=on,
1042110423
axis=axis,
1042210424
closed=closed,
10425+
step=step,
1042310426
)
1042410427

1042510428
cls.rolling = rolling

pandas/core/window/indexers.py

+56-12
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
number of values that will be aggregated over
1616
window_size : int, default 0
1717
the number of rows in a window
18+
step_size : int, default 1
19+
the window step size
1820
min_periods : int, default None
1921
min_periods passed from the top level rolling API
2022
center : bool, default None
@@ -35,7 +37,11 @@ class BaseIndexer:
3537
"""Base class for window bounds calculations."""
3638

3739
def __init__(
38-
self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs,
40+
self,
41+
index_array: Optional[np.ndarray] = None,
42+
window_size: int = 0,
43+
step_size: Optional[int] = None,
44+
**kwargs,
3945
):
4046
"""
4147
Parameters
@@ -45,6 +51,8 @@ def __init__(
4551
"""
4652
self.index_array = index_array
4753
self.window_size = window_size
54+
self.step_size = step_size
55+
4856
# Set user defined kwargs as attributes that can be used in get_window_bounds
4957
for key, value in kwargs.items():
5058
setattr(self, key, value)
@@ -73,17 +81,52 @@ def get_window_bounds(
7381
closed: Optional[str] = None,
7482
) -> Tuple[np.ndarray, np.ndarray]:
7583

76-
start_s = np.zeros(self.window_size, dtype="int64")
77-
start_e = (
78-
np.arange(self.window_size, num_values, dtype="int64")
79-
- self.window_size
80-
+ 1
81-
)
82-
start = np.concatenate([start_s, start_e])[:num_values]
84+
if self.step_size is not None:
85+
"""
86+
Proposed new behavior. Ignores partially filled windows, which don't really
87+
make sense with fixed (index) width windows. Alignment assumes either
88+
centered (`center` = True) or left-aligned (`center` = False). `align`
89+
parameter should probably replace `center` with left, right, and center
90+
options.
91+
"""
92+
93+
# Compute intervals in semi-closed form [start, end)
94+
loffset = self.step_size // 2 if center else 0
95+
start = np.arange(
96+
loffset,
97+
num_values - self.window_size + 1,
98+
self.step_size,
99+
dtype="int64")
100+
end = start + self.window_size
101+
102+
# Open/close interval appropriately.
103+
if closed is None:
104+
closed = 'right'
105+
106+
if closed in ['right', 'both']:
107+
# Close right side of interval.
108+
end -= 1
109+
110+
if closed not in ['left', 'both']:
111+
# Open left side of interval.
112+
start -= 1
113+
114+
else:
115+
"""
116+
Maintained to reproduce old behavior. Unclear if this should remain.
117+
"""
118+
start_s = np.zeros(self.window_size, dtype="int64")
119+
start_e = (
120+
np.arange(self.window_size, num_values, dtype="int64")
121+
- self.window_size
122+
+ 1
123+
)
124+
start = np.concatenate([start_s, start_e])[:num_values]
125+
126+
end_s = np.arange(self.window_size, dtype="int64") + 1
127+
end_e = start_e + self.window_size
128+
end = np.concatenate([end_s, end_e])[:num_values]
83129

84-
end_s = np.arange(self.window_size, dtype="int64") + 1
85-
end_e = start_e + self.window_size
86-
end = np.concatenate([end_s, end_e])[:num_values]
87130
return start, end
88131

89132

@@ -100,7 +143,8 @@ def get_window_bounds(
100143
) -> Tuple[np.ndarray, np.ndarray]:
101144

102145
return calculate_variable_window_bounds(
103-
num_values, self.window_size, min_periods, center, closed, self.index_array,
146+
num_values, self.window_size, self.step_size,
147+
min_periods, center, closed, self.index_array,
104148
)
105149

106150

0 commit comments

Comments
 (0)