diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 43bf6d9dd1fee..03848a7c4af43 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -700,6 +700,225 @@ group_mean_float32 = _group_mean['float']
 group_mean_float64 = _group_mean['double']
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _group_mean_Corder(floating[:, ::1] out,
+                       int64_t[::1] counts,
+                       ndarray[floating, ndim=2] values,
+                       const int64_t[::1] labels,
+                       Py_ssize_t min_count=-1):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        floating val, count, y, t
+        floating[:, ::1] sumx, compensation
+        int64_t[:, ::1] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    if len_values != len_labels:
+        raise ValueError("len(index) != len(labels)")
+
+    nobs = np.zeros((out).shape, dtype=np.int64)
+    sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
+
+    N, K = (values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+                # not nan: `val == val` is False only when val is NaN
+                if val == val:
+                    nobs[lab, j] += 1
+                    y = val - compensation[lab, j]
+                    t = sumx[lab, j] + y
+                    compensation[lab, j] = t - sumx[lab, j] - y
+                    sumx[lab, j] = t
+
+        for i in range(ncounts):
+            for j in range(K):
+                count = nobs[i, j]
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j] / count
+
+
+group_mean_Corder_float32 = _group_mean_Corder['float']
+group_mean_Corder_float64 = _group_mean_Corder['double']
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _group_mean_transposed(floating[:, :] out,
+                           int64_t[:] counts,
+                           ndarray[floating, ndim=2] values,
+                           const int64_t[:] labels,
+                           Py_ssize_t min_count=-1):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        floating val, count, y, t
+        floating[:, :] sumx, compensation
+        int64_t[:, :] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    # NOTE(review): the usual up-front length check is disabled here because
+    # len(values) is K (rows) in this layout; the N check below replaces it.
+
+    nobs = np.zeros((out).shape, dtype=np.int64)
+    sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
+
+    K, N = (values).shape
+
+    if N != len_labels:
+        raise ValueError("len(index) != len(labels)")
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[j, i]
+                # not nan: `val == val` is False only when val is NaN
+                if val == val:
+                    nobs[j, lab] += 1
+                    y = val - compensation[j, lab]
+                    t = sumx[j, lab] + y
+                    compensation[j, lab] = t - sumx[j, lab] - y
+                    sumx[j, lab] = t
+
+        for i in range(ncounts):
+            for j in range(K):
+                count = nobs[j, i]
+                if nobs[j, i] == 0:
+                    out[j, i] = NAN
+                else:
+                    out[j, i] = sumx[j, i] / count
+
+
+group_mean_transposed_float32 = _group_mean_transposed['float']
+group_mean_transposed_float64 = _group_mean_transposed['double']
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _group_mean_1d(floating[:] out,
+                   int64_t[:] counts,
+                   ndarray[floating, ndim=1] values,
+                   const int64_t[:] labels,
+                   Py_ssize_t min_count=-1):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        floating val, count, y, t
+        floating[:] sumx, compensation
+        int64_t[:] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    if len_values != len_labels:
+        raise ValueError("len(index) != len(labels)")
+
+    nobs = np.zeros((out).shape, dtype=np.int64)
+    sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
+
+    N, = (values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i]
+            # not nan: `val == val` is False only when val is NaN
+            if val == val:
+                nobs[lab] += 1
+                y = val - compensation[lab]
+                t = sumx[lab] + y
+                compensation[lab] = t - sumx[lab] - y
+                sumx[lab] = t
+
+        for i in range(ncounts):
+            count = nobs[i]
+            if nobs[i] == 0:
+                out[i] = NAN
+            else:
+                out[i] = sumx[i] / count
+
+
+group_mean_1d_float32 = _group_mean_1d['float']
+group_mean_1d_float64 = _group_mean_1d['double']
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def _group_mean_1d_Corder(floating[::1] out,
+                          int64_t[::1] counts,
+                          floating[::1] values,
+                          const int64_t[:] labels,
+                          Py_ssize_t min_count=-1):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        floating val, count, y, t
+        floating[::1] sumx, compensation
+        int64_t[::1] nobs
+        Py_ssize_t len_values = len(values), len_labels = len(labels)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    if len_values != len_labels:
+        raise ValueError("len(index) != len(labels)")
+
+    nobs = np.zeros((out).shape, dtype=np.int64)
+    sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
+
+    N, = (values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i]
+            # not nan: `val == val` is False only when val is NaN
+            if val == val:
+                nobs[lab] += 1
+                y = val - compensation[lab]
+                t = sumx[lab] + y
+                compensation[lab] = t - sumx[lab] - y
+                sumx[lab] = t
+
+        for i in range(ncounts):
+            count = nobs[i]
+            if nobs[i] == 0:
+                out[i] = NAN
+            else:
+                out[i] = sumx[i] / count
+
+
+group_mean_1d_Corder_float32 = _group_mean_1d_Corder['float']
+group_mean_1d_Corder_float64 = _group_mean_1d_Corder['double']
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def _group_ohlc(floating[:, :] out,