Skip to content

Commit bbec57d

Browse files
committed
Merge pull request #10472 from jvkersch/fix/var-welford-algorithm
ENH: Make group_var_ use Welford's algorithm.
2 parents a14f513 + a35d9b7 commit bbec57d

File tree

4 files changed

+181
-102
lines changed

4 files changed

+181
-102
lines changed

doc/source/whatsnew/v0.17.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,11 @@ Bug Fixes
135135
- Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
136136

137137
- Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
138+
138139
- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
139140

140141
- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
141142

142143
- Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
144+
145+
- Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)

pandas/src/generate_code.py

+19-34
Original file line numberDiff line numberDiff line change
@@ -1147,67 +1147,52 @@ def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
11471147

11481148
group_var_template = """@cython.wraparound(False)
11491149
@cython.boundscheck(False)
1150+
@cython.cdivision(True)
11501151
def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
11511152
ndarray[int64_t] counts,
11521153
ndarray[%(dest_type2)s, ndim=2] values,
11531154
ndarray[int64_t] labels):
11541155
cdef:
11551156
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
1156-
%(dest_type2)s val, ct
1157-
ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx
1157+
%(dest_type2)s val, ct, oldmean
1158+
ndarray[%(dest_type2)s, ndim=2] nobs, mean
11581159
11591160
if not len(values) == len(labels):
11601161
raise AssertionError("len(index) != len(labels)")
11611162
11621163
nobs = np.zeros_like(out)
1163-
sumx = np.zeros_like(out)
1164-
sumxx = np.zeros_like(out)
1164+
mean = np.zeros_like(out)
11651165
11661166
N, K = (<object> values).shape
11671167
1168-
with nogil:
1169-
if K > 1:
1170-
for i in range(N):
1171-
1172-
lab = labels[i]
1173-
if lab < 0:
1174-
continue
1168+
out[:, :] = 0.0
11751169
1176-
counts[lab] += 1
1177-
1178-
for j in range(K):
1179-
val = values[i, j]
1180-
1181-
# not nan
1182-
if val == val:
1183-
nobs[lab, j] += 1
1184-
sumx[lab, j] += val
1185-
sumxx[lab, j] += val * val
1186-
else:
1187-
for i in range(N):
1170+
with nogil:
1171+
for i in range(N):
1172+
lab = labels[i]
1173+
if lab < 0:
1174+
continue
11881175
1189-
lab = labels[i]
1190-
if lab < 0:
1191-
continue
1176+
counts[lab] += 1
11921177
1193-
counts[lab] += 1
1194-
val = values[i, 0]
1178+
for j in range(K):
1179+
val = values[i, j]
11951180
11961181
# not nan
11971182
if val == val:
1198-
nobs[lab, 0] += 1
1199-
sumx[lab, 0] += val
1200-
sumxx[lab, 0] += val * val
1201-
1183+
nobs[lab, j] += 1
1184+
oldmean = mean[lab, j]
1185+
mean[lab, j] += (val - oldmean) / nobs[lab, j]
1186+
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
12021187
12031188
for i in range(ncounts):
12041189
for j in range(K):
12051190
ct = nobs[i, j]
12061191
if ct < 2:
12071192
out[i, j] = NAN
12081193
else:
1209-
out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
1210-
(ct * ct - ct))
1194+
out[i, j] /= (ct - 1)
1195+
12111196
"""
12121197

12131198
group_var_bin_template = """@cython.wraparound(False)

pandas/src/generated.pyx

+38-68
Original file line numberDiff line numberDiff line change
@@ -7232,131 +7232,101 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out,
72327232

72337233
@cython.wraparound(False)
72347234
@cython.boundscheck(False)
7235+
@cython.cdivision(True)
72357236
def group_var_float64(ndarray[float64_t, ndim=2] out,
72367237
ndarray[int64_t] counts,
72377238
ndarray[float64_t, ndim=2] values,
72387239
ndarray[int64_t] labels):
72397240
cdef:
72407241
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
7241-
float64_t val, ct
7242-
ndarray[float64_t, ndim=2] nobs, sumx, sumxx
7242+
float64_t val, ct, oldmean
7243+
ndarray[float64_t, ndim=2] nobs, mean
72437244

72447245
if not len(values) == len(labels):
72457246
raise AssertionError("len(index) != len(labels)")
72467247

72477248
nobs = np.zeros_like(out)
7248-
sumx = np.zeros_like(out)
7249-
sumxx = np.zeros_like(out)
7249+
mean = np.zeros_like(out)
72507250

72517251
N, K = (<object> values).shape
72527252

7253-
with nogil:
7254-
if K > 1:
7255-
for i in range(N):
7256-
7257-
lab = labels[i]
7258-
if lab < 0:
7259-
continue
7253+
out[:, :] = 0.0
72607254

7261-
counts[lab] += 1
7262-
7263-
for j in range(K):
7264-
val = values[i, j]
7265-
7266-
# not nan
7267-
if val == val:
7268-
nobs[lab, j] += 1
7269-
sumx[lab, j] += val
7270-
sumxx[lab, j] += val * val
7271-
else:
7272-
for i in range(N):
7255+
with nogil:
7256+
for i in range(N):
7257+
lab = labels[i]
7258+
if lab < 0:
7259+
continue
72737260

7274-
lab = labels[i]
7275-
if lab < 0:
7276-
continue
7261+
counts[lab] += 1
72777262

7278-
counts[lab] += 1
7279-
val = values[i, 0]
7263+
for j in range(K):
7264+
val = values[i, j]
72807265

72817266
# not nan
72827267
if val == val:
7283-
nobs[lab, 0] += 1
7284-
sumx[lab, 0] += val
7285-
sumxx[lab, 0] += val * val
7286-
7268+
nobs[lab, j] += 1
7269+
oldmean = mean[lab, j]
7270+
mean[lab, j] += (val - oldmean) / nobs[lab, j]
7271+
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
72877272

72887273
for i in range(ncounts):
72897274
for j in range(K):
72907275
ct = nobs[i, j]
72917276
if ct < 2:
72927277
out[i, j] = NAN
72937278
else:
7294-
out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
7295-
(ct * ct - ct))
7279+
out[i, j] /= (ct - 1)
7280+
72967281

72977282
@cython.wraparound(False)
72987283
@cython.boundscheck(False)
7284+
@cython.cdivision(True)
72997285
def group_var_float32(ndarray[float32_t, ndim=2] out,
73007286
ndarray[int64_t] counts,
73017287
ndarray[float32_t, ndim=2] values,
73027288
ndarray[int64_t] labels):
73037289
cdef:
73047290
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
7305-
float32_t val, ct
7306-
ndarray[float32_t, ndim=2] nobs, sumx, sumxx
7291+
float32_t val, ct, oldmean
7292+
ndarray[float32_t, ndim=2] nobs, mean
73077293

73087294
if not len(values) == len(labels):
73097295
raise AssertionError("len(index) != len(labels)")
73107296

73117297
nobs = np.zeros_like(out)
7312-
sumx = np.zeros_like(out)
7313-
sumxx = np.zeros_like(out)
7298+
mean = np.zeros_like(out)
73147299

73157300
N, K = (<object> values).shape
73167301

7317-
with nogil:
7318-
if K > 1:
7319-
for i in range(N):
7320-
7321-
lab = labels[i]
7322-
if lab < 0:
7323-
continue
7302+
out[:, :] = 0.0
73247303

7325-
counts[lab] += 1
7326-
7327-
for j in range(K):
7328-
val = values[i, j]
7329-
7330-
# not nan
7331-
if val == val:
7332-
nobs[lab, j] += 1
7333-
sumx[lab, j] += val
7334-
sumxx[lab, j] += val * val
7335-
else:
7336-
for i in range(N):
7304+
with nogil:
7305+
for i in range(N):
7306+
lab = labels[i]
7307+
if lab < 0:
7308+
continue
73377309

7338-
lab = labels[i]
7339-
if lab < 0:
7340-
continue
7310+
counts[lab] += 1
73417311

7342-
counts[lab] += 1
7343-
val = values[i, 0]
7312+
for j in range(K):
7313+
val = values[i, j]
73447314

73457315
# not nan
73467316
if val == val:
7347-
nobs[lab, 0] += 1
7348-
sumx[lab, 0] += val
7349-
sumxx[lab, 0] += val * val
7350-
7317+
nobs[lab, j] += 1
7318+
oldmean = mean[lab, j]
7319+
mean[lab, j] += (val - oldmean) / nobs[lab, j]
7320+
out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
73517321

73527322
for i in range(ncounts):
73537323
for j in range(K):
73547324
ct = nobs[i, j]
73557325
if ct < 2:
73567326
out[i, j] = NAN
73577327
else:
7358-
out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
7359-
(ct * ct - ct))
7328+
out[i, j] /= (ct - 1)
7329+
73607330

73617331

73627332
@cython.wraparound(False)

0 commit comments

Comments
 (0)