Skip to content

Commit a0c3bd0

Browse files
committed
Refactor groupby group_prod, group_var, group_mean, group_ohlc from tempita to fused types
1 parent 2448e52 commit a0c3bd0

File tree

3 files changed

+212
-215
lines changed

3 files changed

+212
-215
lines changed

pandas/_libs/groupby.pyx

+210
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,9 @@ def group_any_all(uint8_t[:] out,
382382
if values[i] == flag_val:
383383
out[lab] = flag_val
384384

385+
# ----------------------------------------------------------------------
386+
# group_add, group_prod, group_var, group_mean, group_ohlc
387+
# ----------------------------------------------------------------------
385388

386389
@cython.wraparound(False)
387390
@cython.boundscheck(False)
@@ -433,5 +436,212 @@ def _group_add(floating[:, :] out,
433436
group_add_float32 = _group_add['float']
434437
group_add_float64 = _group_add['double']
435438

439+
440+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_prod(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=0):
    """
    Compute the per-group product of the non-NaN entries of each column.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : 2-D floating memoryview
        Result buffer, written in place for every group index in
        ``range(len(counts))`` and every column of `values`.
    counts : int64 memoryview
        Incremented in place once per row whose label is non-negative,
        whether or not that row holds NaN. It is not reset here.
    values : 2-D floating memoryview
        Data to aggregate; its first dimension must match `labels`.
    labels : int64 memoryview
        Group index of each row of `values`. Rows with a negative label
        are skipped.
    min_count : Py_ssize_t, default 0
        An output cell is set to NaN when the group has fewer than
        `min_count` non-NaN observations in that column.

    Raises
    ------
    AssertionError
        If ``len(values) != len(labels)``.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp, ngroups = len(counts)
        floating val
        ndarray[floating, ndim=2] running_prod, n_valid

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # Scratch buffers shaped like ``out``: the multiplicative identity for
    # the running product, zero for the per-cell observation count.
    running_prod = np.ones_like(out)
    n_valid = np.zeros_like(out)

    nrows, ncols = (<object>values).shape

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp < 0:
                continue

            counts[grp] += 1
            for col in range(ncols):
                val = values[row, col]

                # NaN is the only value that compares unequal to itself
                if val != val:
                    continue

                n_valid[grp, col] += 1
                running_prod[grp, col] *= val

        for grp in range(ngroups):
            for col in range(ncols):
                if n_valid[grp, col] < min_count:
                    out[grp, col] = NAN
                else:
                    out[grp, col] = running_prod[grp, col]


group_prod_float32 = _group_prod['float']
group_prod_float64 = _group_prod['double']
487+
488+
489+
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def _group_var(floating[:, :] out,
               int64_t[:] counts,
               floating[:, :] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
    """
    Compute the per-group variance of the non-NaN entries of each column.

    The sum of squared deviations is accumulated in a single pass with a
    running mean (Welford-style update) and finally divided by
    ``nobs - 1``.

    Parameters
    ----------
    out : 2-D floating memoryview
        Result buffer. It is zeroed first, used as the accumulator, and
        finally holds the variance. Cells with fewer than two non-NaN
        observations are set to NaN.
    counts : int64 memoryview
        Incremented in place once per row whose label is non-negative,
        whether or not that row holds NaN. It is not reset here.
    values : 2-D floating memoryview
        Data to aggregate; its first dimension must match `labels`.
    labels : int64 memoryview
        Group index of each row of `values`. Rows with a negative label
        are skipped.
    min_count : Py_ssize_t, default -1
        Must be left at -1; the parameter exists only for signature
        parity with the add/prod kernels.

    Raises
    ------
    AssertionError
        If `min_count` is not -1 or ``len(values) != len(labels)``.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp, ngroups = len(counts)
        floating val, n_obs, prev_mean
        ndarray[floating, ndim=2] n_valid, running_mean

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    n_valid = np.zeros_like(out)
    running_mean = np.zeros_like(out)

    nrows, ncols = (<object>values).shape

    # ``out`` doubles as the sum-of-squared-deviations accumulator
    out[:, :] = 0.0

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp < 0:
                continue

            counts[grp] += 1

            for col in range(ncols):
                val = values[row, col]

                # NaN is the only value that compares unequal to itself
                if val != val:
                    continue

                n_valid[grp, col] += 1
                prev_mean = running_mean[grp, col]
                running_mean[grp, col] += (val - prev_mean) / n_valid[grp, col]
                # uses the mean both after and before this observation
                out[grp, col] += (val - running_mean[grp, col]) * (val - prev_mean)

        for grp in range(ngroups):
            for col in range(ncols):
                n_obs = n_valid[grp, col]
                if n_obs < 2:
                    # a single observation has no sample variance
                    out[grp, col] = NAN
                else:
                    out[grp, col] /= (n_obs - 1)


group_var_float32 = _group_var['float']
group_var_float64 = _group_var['double']
543+
544+
545+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_mean(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute the per-group mean of the non-NaN entries of each column.

    Parameters
    ----------
    out : 2-D floating memoryview
        Result buffer, written in place for every group index in
        ``range(len(counts))`` and every column of `values`. Cells with
        no non-NaN observation are set to NaN.
    counts : int64 memoryview
        Incremented in place once per row whose label is non-negative,
        whether or not that row holds NaN. It is not reset here.
    values : 2-D floating memoryview
        Data to aggregate; its first dimension must match `labels`.
    labels : int64 memoryview
        Group index of each row of `values`. Rows with a negative label
        are skipped.
    min_count : Py_ssize_t, default -1
        Must be left at -1; the parameter exists only for signature
        parity with the add/prod kernels.

    Raises
    ------
    AssertionError
        If `min_count` is not -1 or ``len(values) != len(labels)``.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp, ngroups = len(counts)
        floating val, n_obs
        ndarray[floating, ndim=2] totals, n_valid

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    n_valid = np.zeros_like(out)
    totals = np.zeros_like(out)

    nrows, ncols = (<object>values).shape

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp < 0:
                continue

            counts[grp] += 1
            for col in range(ncols):
                val = values[row, col]

                # NaN is the only value that compares unequal to itself
                if val != val:
                    continue

                n_valid[grp, col] += 1
                totals[grp, col] += val

        for grp in range(ngroups):
            for col in range(ncols):
                n_obs = n_valid[grp, col]
                if n_obs == 0:
                    out[grp, col] = NAN
                else:
                    out[grp, col] = totals[grp, col] / n_obs


group_mean_float32 = _group_mean['float']
group_mean_float64 = _group_mean['double']
592+
593+
594+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_ohlc(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute per-group open/high/low/close of a single column.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : 2-D floating memoryview with exactly 4 columns
        Filled with NaN and then written in place. For each group the
        columns hold, in order: the first non-NaN value, the maximum, the
        minimum and the last non-NaN value. Groups without any non-NaN
        value stay NaN.
    counts : int64 memoryview
        Incremented in place once per row whose label is non-negative,
        whether or not that row holds NaN. It is not reset here.
    values : 2-D floating memoryview
        Data to aggregate; at most one column is accepted and only
        column 0 is read.
    labels : int64 memoryview
        Group index of each row of `values`. Rows with a negative label
        are skipped. If `labels` is empty the function returns
        immediately without touching `out`.
    min_count : Py_ssize_t, default -1
        Must be left at -1; the parameter exists only for signature
        parity with the add/prod kernels.

    Raises
    ------
    AssertionError
        If `min_count` is not -1 or ``len(values) != len(labels)``.
    ValueError
        If `out` does not have 4 columns.
    NotImplementedError
        If `values` has more than one column.
    """
    cdef:
        Py_ssize_t i, N, K, lab
        floating val

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(labels) == 0:
        return

    # Same guard as the sibling kernels: bounds checking is disabled, so a
    # length mismatch would otherwise read ``labels`` past its end.
    if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")

    N, K = (<object>values).shape

    if out.shape[1] != 4:
        raise ValueError('Output array must have 4 columns')

    if K > 1:
        raise NotImplementedError("Argument 'values' must have only "
                                  "one dimension")
    out[:] = np.nan

    with nogil:
        for i in range(N):
            lab = labels[i]
            # Skip every negative label, not just -1: with wraparound and
            # boundscheck off, a negative index would touch memory outside
            # ``counts`` / ``out``.
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]
            # NaN is the only value that compares unequal to itself
            if val != val:
                continue

            if out[lab, 0] != out[lab, 0]:
                # "open" is still NaN: first valid observation of the group
                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
            else:
                out[lab, 1] = max(out[lab, 1], val)
                out[lab, 2] = min(out[lab, 2], val)
                out[lab, 3] = val


group_ohlc_float32 = _group_ohlc['float']
group_ohlc_float64 = _group_ohlc['double']
645+
436646
# generated from template
437647
include "groupby_helper.pxi"

0 commit comments

Comments
 (0)