Skip to content

Commit b34c260

Browse files
committed
use quotes on float_precision options
2 parents 462b1a9 + 078f88e commit b34c260

File tree

16 files changed

+342
-190
lines changed

16 files changed

+342
-190
lines changed

.github/workflows/stale-pr.yml

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: "Stale PRs"
2+
on:
3+
schedule:
4+
# * is a special character in YAML so you have to quote this string
5+
- cron: "0 */6 * * *"
6+
7+
jobs:
8+
stale:
9+
runs-on: ubuntu-latest
10+
steps:
11+
- uses: actions/stale@v3
12+
with:
13+
repo-token: ${{ secrets.GITHUB_TOKEN }}
14+
stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity."
15+
skip-stale-pr-message: false
16+
stale-pr-label: "Stale"
17+
exempt-pr-labels: "Needs Review,Blocked"
18+
days-before-stale: 30
19+
days-before-close: -1
20+
remove-stale-when-updated: true
21+
debug-only: true

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ Other enhancements
118118
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
119119
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
120120
- `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
121+
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` now use Kahan summation to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
121122

122123
.. _whatsnew_120.api_breaking.python:
123124

pandas/_libs/window/aggregations.pyx

+55-30
Original file line numberDiff line numberDiff line change
@@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
161161
return result
162162

163163

164-
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
165-
""" add a value from the sum calc """
164+
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
165+
float64_t *compensation) nogil:
166+
""" add a value to the sum calc using Kahan summation """
167+
168+
cdef:
169+
float64_t y, t
166170

167171
# Not NaN
168172
if notnan(val):
169173
nobs[0] = nobs[0] + 1
170-
sum_x[0] = sum_x[0] + val
174+
y = val - compensation[0]
175+
t = sum_x[0] + y
176+
compensation[0] = t - sum_x[0] - y
177+
sum_x[0] = t
171178

172179

173-
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
174-
""" remove a value from the sum calc """
180+
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
181+
float64_t *compensation) nogil:
182+
""" remove a value from the sum calc using Kahan summation """
183+
184+
cdef:
185+
float64_t y, t
175186

187+
# Not NaN
176188
if notnan(val):
177189
nobs[0] = nobs[0] - 1
178-
sum_x[0] = sum_x[0] - val
190+
y = - val - compensation[0]
191+
t = sum_x[0] + y
192+
compensation[0] = t - sum_x[0] - y
193+
sum_x[0] = t
179194

180195

181196
def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
182197
ndarray[int64_t] end, int64_t minp):
183198
cdef:
184-
float64_t sum_x = 0
199+
float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
185200
int64_t s, e
186201
int64_t nobs = 0, i, j, N = len(values)
187202
ndarray[float64_t] output
@@ -201,31 +216,31 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
201216
# setup
202217

203218
for j in range(s, e):
204-
add_sum(values[j], &nobs, &sum_x)
219+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
205220

206221
else:
207222

208223
# calculate deletes
209224
for j in range(start[i - 1], s):
210-
remove_sum(values[j], &nobs, &sum_x)
225+
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
211226

212227
# calculate adds
213228
for j in range(end[i - 1], e):
214-
add_sum(values[j], &nobs, &sum_x)
229+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
215230

216231
output[i] = calc_sum(minp, nobs, sum_x)
217232

218233
if not is_monotonic_bounds:
219234
for j in range(s, e):
220-
remove_sum(values[j], &nobs, &sum_x)
235+
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
221236

222237
return output
223238

224239

225240
def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
226241
ndarray[int64_t] end, int64_t minp, int64_t win):
227242
cdef:
228-
float64_t val, prev_x, sum_x = 0
243+
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
229244
int64_t range_endpoint
230245
int64_t nobs = 0, i, N = len(values)
231246
ndarray[float64_t] output
@@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
237252
with nogil:
238253

239254
for i in range(0, range_endpoint):
240-
add_sum(values[i], &nobs, &sum_x)
255+
add_sum(values[i], &nobs, &sum_x, &compensation_add)
241256
output[i] = NaN
242257

243258
for i in range(range_endpoint, N):
244259
val = values[i]
245-
add_sum(val, &nobs, &sum_x)
260+
add_sum(val, &nobs, &sum_x, &compensation_add)
246261

247262
if i > win - 1:
248263
prev_x = values[i - win]
249-
remove_sum(prev_x, &nobs, &sum_x)
264+
remove_sum(prev_x, &nobs, &sum_x, &compensation_remove)
250265

251266
output[i] = calc_sum(minp, nobs, sum_x)
252267

@@ -277,32 +292,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
277292

278293

279294
cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
280-
Py_ssize_t *neg_ct) nogil:
281-
""" add a value from the mean calc """
295+
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
296+
""" add a value to the mean calc using Kahan summation """
297+
cdef:
298+
float64_t y, t
282299

283300
# Not NaN
284301
if notnan(val):
285302
nobs[0] = nobs[0] + 1
286-
sum_x[0] = sum_x[0] + val
303+
y = val - compensation[0]
304+
t = sum_x[0] + y
305+
compensation[0] = t - sum_x[0] - y
306+
sum_x[0] = t
287307
if signbit(val):
288308
neg_ct[0] = neg_ct[0] + 1
289309

290310

291311
cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
292-
Py_ssize_t *neg_ct) nogil:
293-
""" remove a value from the mean calc """
312+
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
313+
""" remove a value from the mean calc using Kahan summation """
314+
cdef:
315+
float64_t y, t
294316

295317
if notnan(val):
296318
nobs[0] = nobs[0] - 1
297-
sum_x[0] = sum_x[0] - val
319+
y = - val - compensation[0]
320+
t = sum_x[0] + y
321+
compensation[0] = t - sum_x[0] - y
322+
sum_x[0] = t
298323
if signbit(val):
299324
neg_ct[0] = neg_ct[0] - 1
300325

301326

302327
def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
303328
ndarray[int64_t] end, int64_t minp, int64_t win):
304329
cdef:
305-
float64_t val, prev_x, sum_x = 0
330+
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
306331
Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values)
307332
ndarray[float64_t] output
308333

@@ -311,16 +336,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
311336
with nogil:
312337
for i in range(minp - 1):
313338
val = values[i]
314-
add_mean(val, &nobs, &sum_x, &neg_ct)
339+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
315340
output[i] = NaN
316341

317342
for i in range(minp - 1, N):
318343
val = values[i]
319-
add_mean(val, &nobs, &sum_x, &neg_ct)
344+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
320345

321346
if i > win - 1:
322347
prev_x = values[i - win]
323-
remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
348+
remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove)
324349

325350
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
326351

@@ -330,7 +355,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
330355
def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
331356
ndarray[int64_t] end, int64_t minp):
332357
cdef:
333-
float64_t val, sum_x = 0
358+
float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
334359
int64_t s, e
335360
Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
336361
ndarray[float64_t] output
@@ -350,26 +375,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
350375
# setup
351376
for j in range(s, e):
352377
val = values[j]
353-
add_mean(val, &nobs, &sum_x, &neg_ct)
378+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
354379

355380
else:
356381

357382
# calculate deletes
358383
for j in range(start[i - 1], s):
359384
val = values[j]
360-
remove_mean(val, &nobs, &sum_x, &neg_ct)
385+
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
361386

362387
# calculate adds
363388
for j in range(end[i - 1], e):
364389
val = values[j]
365-
add_mean(val, &nobs, &sum_x, &neg_ct)
390+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
366391

367392
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
368393

369394
if not is_monotonic_bounds:
370395
for j in range(s, e):
371396
val = values[j]
372-
remove_mean(val, &nobs, &sum_x, &neg_ct)
397+
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
373398
return output
374399

375400
# ----------------------------------------------------------------------

pandas/core/arrays/_mixins.py

+12
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pandas.core.algorithms import take, unique
1010
from pandas.core.array_algos.transforms import shift
1111
from pandas.core.arrays.base import ExtensionArray
12+
from pandas.core.indexers import check_array_indexer
1213

1314
_T = TypeVar("_T", bound="NDArrayBackedExtensionArray")
1415

@@ -156,3 +157,14 @@ def _validate_shift_value(self, fill_value):
156157
# TODO: after deprecation in datetimelikearraymixin is enforced,
157158
# we can remove this and use validate_fill_value directly
158159
return self._validate_fill_value(fill_value)
160+
161+
def __setitem__(self, key, value):
162+
key = self._validate_setitem_key(key)
163+
value = self._validate_setitem_value(value)
164+
self._ndarray[key] = value
165+
166+
def _validate_setitem_key(self, key):
167+
return check_array_indexer(self, key)
168+
169+
def _validate_setitem_value(self, value):
170+
return value

pandas/core/arrays/categorical.py

+17-26
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def func(self, other):
9393

9494
if is_scalar(other):
9595
if other in self.categories:
96-
i = self.categories.get_loc(other)
96+
i = self._unbox_scalar(other)
9797
ret = op(self._codes, i)
9898

9999
if opname not in {"__eq__", "__ge__", "__gt__"}:
@@ -1184,8 +1184,7 @@ def _validate_searchsorted_value(self, value):
11841184
# searchsorted is very performance sensitive. By converting codes
11851185
# to same dtype as self.codes, we get much faster performance.
11861186
if is_scalar(value):
1187-
codes = self.categories.get_loc(value)
1188-
codes = self.codes.dtype.type(codes)
1187+
codes = self._unbox_scalar(value)
11891188
else:
11901189
locs = [self.categories.get_loc(x) for x in value]
11911190
codes = np.array(locs, dtype=self.codes.dtype)
@@ -1212,7 +1211,7 @@ def _validate_fill_value(self, fill_value):
12121211
if isna(fill_value):
12131212
fill_value = -1
12141213
elif fill_value in self.categories:
1215-
fill_value = self.categories.get_loc(fill_value)
1214+
fill_value = self._unbox_scalar(fill_value)
12161215
else:
12171216
raise ValueError(
12181217
f"'fill_value={fill_value}' is not present "
@@ -1680,7 +1679,7 @@ def fillna(self, value=None, method=None, limit=None):
16801679
if isna(value):
16811680
codes[mask] = -1
16821681
else:
1683-
codes[mask] = self.categories.get_loc(value)
1682+
codes[mask] = self._unbox_scalar(value)
16841683

16851684
else:
16861685
raise TypeError(
@@ -1734,6 +1733,17 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
17341733

17351734
return codes
17361735

1736+
def _unbox_scalar(self, key) -> int:
1737+
# searchsorted is very performance sensitive. By converting codes
1738+
# to same dtype as self.codes, we get much faster performance.
1739+
code = self.categories.get_loc(key)
1740+
code = self._codes.dtype.type(code)
1741+
return code
1742+
1743+
def _unbox_listlike(self, value):
1744+
unboxed = self.categories.get_indexer(value)
1745+
return unboxed.astype(self._ndarray.dtype, copy=False)
1746+
17371747
# ------------------------------------------------------------------
17381748

17391749
def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
@@ -1884,20 +1894,6 @@ def __getitem__(self, key):
18841894
return result
18851895
return self._from_backing_data(result)
18861896

1887-
def __setitem__(self, key, value):
1888-
"""
1889-
Item assignment.
1890-
1891-
Raises
1892-
------
1893-
ValueError
1894-
If (one or more) value is not in categories or if an assigned
1895-
`Categorical` does not have the same categories
1896-
"""
1897-
key = self._validate_setitem_key(key)
1898-
value = self._validate_setitem_value(value)
1899-
self._ndarray[key] = value
1900-
19011897
def _validate_setitem_value(self, value):
19021898
value = extract_array(value, extract_numpy=True)
19031899

@@ -1925,11 +1921,7 @@ def _validate_setitem_value(self, value):
19251921
"category, set the categories first"
19261922
)
19271923

1928-
lindexer = self.categories.get_indexer(rvalue)
1929-
if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i":
1930-
lindexer = lindexer.astype(self._ndarray.dtype)
1931-
1932-
return lindexer
1924+
return self._unbox_listlike(rvalue)
19331925

19341926
def _validate_setitem_key(self, key):
19351927
if lib.is_integer(key):
@@ -2155,8 +2147,7 @@ def unique(self):
21552147
return cat.set_categories(cat.categories.take(take_codes))
21562148

21572149
def _values_for_factorize(self):
2158-
codes = self.codes.astype("int64")
2159-
return codes, -1
2150+
return self._ndarray, -1
21602151

21612152
@classmethod
21622153
def _from_factorized(cls, uniques, original):

pandas/core/arrays/datetimelike.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -609,9 +609,7 @@ def __setitem__(
609609
if no_op:
610610
return
611611

612-
value = self._validate_setitem_value(value)
613-
key = check_array_indexer(self, key)
614-
self._ndarray[key] = value
612+
super().__setitem__(key, value)
615613
self._maybe_clear_freq()
616614

617615
def _maybe_clear_freq(self):
@@ -697,7 +695,7 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
697695
return new_obj
698696

699697
def _values_for_factorize(self):
700-
return self.asi8, iNaT
698+
return self._ndarray, iNaT
701699

702700
@classmethod
703701
def _from_factorized(cls, values, original):

0 commit comments

Comments
 (0)