Skip to content

Commit e25b8e4

Browse files
stevenschaerer authored and yeshsurya committed
BUG: various groupby ewm times issues (pandas-dev#40952)
* times in ewm groupby: sort times according to grouping; add missing support for times in numba implementation; fix bug in cython implementation
* add GH issue id to tests
* fix typing validation error
* PR comments
* trying to fix int64 to int32 casting TypeError
* PR comments
* PR comments
* PR comments
1 parent 4479cfe commit e25b8e4

File tree

4 files changed

+119
-83
lines changed

4 files changed

+119
-83
lines changed

doc/source/whatsnew/v1.3.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -825,6 +825,9 @@ Groupby/resample/rolling
825825
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
826826
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
827827
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
828+
- Bug in :meth:`core.window.ewm.ExponentialMovingWindowGroupby.mean` where the times argument was ignored when ``engine='numba'`` (:issue:`40951`)
829+
- Bug in :meth:`core.window.ewm.ExponentialMovingWindowGroupby.mean` where the wrong times were used in case of multiple groups (:issue:`40951`)
830+
- Bug in :class:`core.window.ewm.ExponentialMovingWindowGroupby` where the times vector and values became out of sync for non-trivial groups (:issue:`40951`)
828831
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
829832
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
830833
- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`)

pandas/_libs/window/aggregations.pyx

+32-46
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,7 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
7373
cdef:
7474
float64_t result
7575

76-
if nobs == 0 == minp:
77-
result = 0
78-
elif nobs >= minp:
76+
if nobs >= minp:
7977
result = sum_x
8078
else:
8179
result = NaN
@@ -116,7 +114,7 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
116114

117115

118116
def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
119-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
117+
ndarray[int64_t] end, int64_t minp):
120118
cdef:
121119
Py_ssize_t i, j
122120
float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
@@ -128,7 +126,7 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
128126
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
129127
start, end
130128
)
131-
output = np.empty(N, dtype=np.float64)
129+
output = np.empty(N, dtype=float)
132130

133131
with nogil:
134132

@@ -172,7 +170,7 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
172170
cdef:
173171
float64_t result
174172

175-
if nobs >= minp and nobs > 0:
173+
if nobs >= minp:
176174
result = sum_x / <float64_t>nobs
177175
if neg_ct == 0 and result < 0:
178176
# all positive
@@ -221,7 +219,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
221219

222220

223221
def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
224-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
222+
ndarray[int64_t] end, int64_t minp):
225223
cdef:
226224
float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
227225
int64_t s, e
@@ -232,7 +230,7 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
232230
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
233231
start, end
234232
)
235-
output = np.empty(N, dtype=np.float64)
233+
output = np.empty(N, dtype=float)
236234

237235
with nogil:
238236

@@ -338,7 +336,7 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
338336

339337

340338
def roll_var(const float64_t[:] values, ndarray[int64_t] start,
341-
ndarray[int64_t] end, int64_t minp, int ddof=1) -> np.ndarray:
339+
ndarray[int64_t] end, int64_t minp, int ddof=1):
342340
"""
343341
Numerically stable implementation using Welford's method.
344342
"""
@@ -355,7 +353,7 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
355353
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
356354
start, end
357355
)
358-
output = np.empty(N, dtype=np.float64)
356+
output = np.empty(N, dtype=float)
359357

360358
with nogil:
361359

@@ -490,7 +488,7 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs,
490488

491489

492490
def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
493-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
491+
ndarray[int64_t] end, int64_t minp):
494492
cdef:
495493
Py_ssize_t i, j
496494
float64_t val, prev, min_val, mean_val, sum_val = 0
@@ -507,7 +505,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
507505
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
508506
start, end
509507
)
510-
output = np.empty(N, dtype=np.float64)
508+
output = np.empty(N, dtype=float)
511509
min_val = np.nanmin(values)
512510
values_copy = np.copy(values)
513511

@@ -672,7 +670,7 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs,
672670

673671

674672
def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
675-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
673+
ndarray[int64_t] end, int64_t minp):
676674
cdef:
677675
Py_ssize_t i, j
678676
float64_t val, prev, mean_val, min_val, sum_val = 0
@@ -689,7 +687,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
689687
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
690688
start, end
691689
)
692-
output = np.empty(N, dtype=np.float64)
690+
output = np.empty(N, dtype=float)
693691
values_copy = np.copy(values)
694692
min_val = np.nanmin(values)
695693

@@ -753,7 +751,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
753751

754752

755753
def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
756-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
754+
ndarray[int64_t] end, int64_t minp):
757755
cdef:
758756
Py_ssize_t i, j
759757
bint err = False, is_monotonic_increasing_bounds
@@ -769,7 +767,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
769767

770768
# we use the Fixed/Variable Indexer here as the
771769
# actual skiplist ops outweigh any window computation costs
772-
output = np.empty(N, dtype=np.float64)
770+
output = np.empty(N, dtype=float)
773771

774772
if (end - start).max() == 0:
775773
output[:] = NaN
@@ -889,7 +887,7 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs,
889887

890888

891889
def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
892-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
890+
ndarray[int64_t] end, int64_t minp):
893891
"""
894892
Moving max of 1d array of any numeric type along axis=0 ignoring NaNs.
895893
@@ -904,16 +902,12 @@ def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
904902
closed : 'right', 'left', 'both', 'neither'
905903
make the interval closed on the right, left,
906904
both or neither endpoints
907-
908-
Returns
909-
-------
910-
np.ndarray[float]
911905
"""
912906
return _roll_min_max(values, start, end, minp, is_max=1)
913907

914908

915909
def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
916-
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
910+
ndarray[int64_t] end, int64_t minp):
917911
"""
918912
Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.
919913
@@ -925,10 +919,6 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
925919
is below this, output a NaN
926920
index : ndarray, optional
927921
index for window computation
928-
929-
Returns
930-
-------
931-
np.ndarray[float]
932922
"""
933923
return _roll_min_max(values, start, end, minp, is_max=0)
934924

@@ -946,7 +936,7 @@ cdef _roll_min_max(ndarray[numeric] values,
946936
deque W[int64_t] # track the whole window for nobs compute
947937
ndarray[float64_t, ndim=1] output
948938

949-
output = np.empty(N, dtype=np.float64)
939+
output = np.empty(N, dtype=float)
950940
Q = deque[int64_t]()
951941
W = deque[int64_t]()
952942

@@ -1019,7 +1009,7 @@ interpolation_types = {
10191009

10201010
def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
10211011
ndarray[int64_t] end, int64_t minp,
1022-
float64_t quantile, str interpolation) -> np.ndarray:
1012+
float64_t quantile, str interpolation):
10231013
"""
10241014
O(N log(window)) implementation using skip list
10251015
"""
@@ -1046,7 +1036,7 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
10461036
)
10471037
# we use the Fixed/Variable Indexer here as the
10481038
# actual skiplist ops outweigh any window computation costs
1049-
output = np.empty(N, dtype=np.float64)
1039+
output = np.empty(N, dtype=float)
10501040

10511041
win = (end - start).max()
10521042
if win == 0:
@@ -1140,7 +1130,7 @@ def roll_apply(object obj,
11401130
ndarray[int64_t] start, ndarray[int64_t] end,
11411131
int64_t minp,
11421132
object function, bint raw,
1143-
tuple args, dict kwargs) -> np.ndarray:
1133+
tuple args, dict kwargs):
11441134
cdef:
11451135
ndarray[float64_t] output, counts
11461136
ndarray[float64_t, cast=True] arr
@@ -1157,7 +1147,7 @@ def roll_apply(object obj,
11571147

11581148
counts = roll_sum(np.isfinite(arr).astype(float), start, end, minp)
11591149

1160-
output = np.empty(N, dtype=np.float64)
1150+
output = np.empty(N, dtype=float)
11611151

11621152
for i in range(N):
11631153

@@ -1179,15 +1169,11 @@ def roll_apply(object obj,
11791169
# Rolling sum and mean for weighted window
11801170

11811171

1182-
def roll_weighted_sum(
1183-
const float64_t[:] values, const float64_t[:] weights, int minp
1184-
) -> np.ndaray:
1172+
def roll_weighted_sum(const float64_t[:] values, const float64_t[:] weights, int minp):
11851173
return _roll_weighted_sum_mean(values, weights, minp, avg=0)
11861174

11871175

1188-
def roll_weighted_mean(
1189-
const float64_t[:] values, const float64_t[:] weights, int minp
1190-
) -> np.ndaray:
1176+
def roll_weighted_mean(const float64_t[:] values, const float64_t[:] weights, int minp):
11911177
return _roll_weighted_sum_mean(values, weights, minp, avg=1)
11921178

11931179

@@ -1446,7 +1432,7 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
14461432

14471433
n = len(values)
14481434
win_n = len(weights)
1449-
output = np.empty(n, dtype=np.float64)
1435+
output = np.empty(n, dtype=float)
14501436

14511437
with nogil:
14521438

@@ -1486,7 +1472,7 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
14861472

14871473
def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
14881474
int minp, float64_t com, bint adjust, bint ignore_na,
1489-
const float64_t[:] deltas) -> np.ndarray:
1475+
const float64_t[:] deltas):
14901476
"""
14911477
Compute exponentially-weighted moving average using center-of-mass.
14921478
@@ -1503,13 +1489,13 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
15031489
15041490
Returns
15051491
-------
1506-
np.ndarray[float64_t]
1492+
ndarray
15071493
"""
15081494

15091495
cdef:
15101496
Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start)
15111497
const float64_t[:] sub_deltas, sub_vals
1512-
ndarray[float64_t] sub_output, output = np.empty(N, dtype=np.float64)
1498+
ndarray[float64_t] sub_output, output = np.empty(N, dtype=float)
15131499
float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur
15141500
bint is_observation
15151501

@@ -1528,7 +1514,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
15281514
# conjunction with vals[i+1]
15291515
sub_deltas = deltas[s:e - 1]
15301516
win_size = len(sub_vals)
1531-
sub_output = np.empty(win_size, dtype=np.float64)
1517+
sub_output = np.empty(win_size, dtype=float)
15321518

15331519
weighted_avg = sub_vals[0]
15341520
is_observation = weighted_avg == weighted_avg
@@ -1571,7 +1557,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
15711557

15721558
def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:] end,
15731559
int minp, const float64_t[:] input_y, float64_t com, bint adjust,
1574-
bint ignore_na, bint bias) -> np.ndarray:
1560+
bint ignore_na, bint bias):
15751561
"""
15761562
Compute exponentially-weighted moving variance using center-of-mass.
15771563
@@ -1589,7 +1575,7 @@ def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:]
15891575
15901576
Returns
15911577
-------
1592-
np.ndarray[float64_t]
1578+
ndarray
15931579
"""
15941580

15951581
cdef:
@@ -1599,7 +1585,7 @@ def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:]
15991585
float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
16001586
float64_t numerator, denominator
16011587
const float64_t[:] sub_x_vals, sub_y_vals
1602-
ndarray[float64_t] sub_out, output = np.empty(N, dtype=np.float64)
1588+
ndarray[float64_t] sub_out, output = np.empty(N, dtype=float)
16031589
bint is_observation
16041590

16051591
if M != N:
@@ -1618,7 +1604,7 @@ def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:]
16181604
sub_x_vals = input_x[s:e]
16191605
sub_y_vals = input_y[s:e]
16201606
win_size = len(sub_x_vals)
1621-
sub_out = np.empty(win_size, dtype=np.float64)
1607+
sub_out = np.empty(win_size, dtype=float)
16221608

16231609
mean_x = sub_x_vals[0]
16241610
mean_y = sub_y_vals[0]

0 commit comments

Comments
 (0)