Skip to content

Commit b2d93b1

Browse files
committed
Refactor groupby group_prod, group_var, group_mean, group_ohlc from tempita to fused types
1 parent 2448e52 commit b2d93b1

File tree

3 files changed

+214
-215
lines changed

3 files changed

+214
-215
lines changed

pandas/_libs/groupby.pyx

+212
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,10 @@ def group_any_all(uint8_t[:] out,
382382
if values[i] == flag_val:
383383
out[lab] = flag_val
384384

385+
# ----------------------------------------------------------------------
386+
# group_add, group_prod, group_var, group_mean, group_ohlc
387+
# ----------------------------------------------------------------------
388+
385389

386390
@cython.wraparound(False)
387391
@cython.boundscheck(False)
@@ -433,5 +437,213 @@ def _group_add(floating[:, :] out,
433437
group_add_float32 = _group_add['float']
434438
group_add_float64 = _group_add['double']
435439

440+
441+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_prod(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=0):
    """
    Multiply together the non-NaN entries of every column, group by group.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : 2-D float32/float64 memoryview
        Filled in place. ``out[g, c]`` receives the product of the non-NaN
        entries of column ``c`` that belong to group ``g``, or NaN when
        fewer than ``min_count`` such entries were seen.
    counts : 1-D int64 memoryview
        Incremented in place, once for every row whose label is
        non-negative. Its length determines how many rows of ``out`` are
        written.
    values : 2-D float32/float64 memoryview
        Input data; one row per observation.
    labels : 1-D int64 memoryview
        Group index of each row of ``values``. Rows carrying a negative
        label are ignored entirely.
    min_count : Py_ssize_t, default 0
        Minimum number of non-NaN observations required for a result.

    Raises
    ------
    AssertionError
        If ``values`` and ``labels`` differ in length.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp
        Py_ssize_t ngroups = len(counts)
        floating x
        ndarray[floating, ndim=2] acc, seen

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # Per (group, column): number of non-NaN observations and running
    # product (seeded with the multiplicative identity).
    seen = np.zeros_like(out)
    acc = np.ones_like(out)

    nrows, ncols = (<object>values).shape

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp >= 0:
                counts[grp] += 1
                for col in range(ncols):
                    x = values[row, col]

                    # NaN is the only value that differs from itself
                    if x == x:
                        seen[grp, col] += 1
                        acc[grp, col] *= x

        for row in range(ngroups):
            for col in range(ncols):
                if seen[row, col] >= min_count:
                    out[row, col] = acc[row, col]
                else:
                    out[row, col] = NAN


group_prod_float32 = _group_prod['float']
group_prod_float64 = _group_prod['double']
489+
490+
491+
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def _group_var(floating[:, :] out,
               int64_t[:] counts,
               floating[:, :] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
    """
    Compute the per-group variance of every column in a single pass.

    The mean and the sum of squared deviations are updated incrementally
    for each non-NaN observation; the final result divides that sum by
    ``n - 1``, where ``n`` is the number of non-NaN observations.

    Parameters
    ----------
    out : 2-D float32/float64 memoryview
        Filled in place. ``out[g, c]`` receives the variance of column
        ``c`` within group ``g``, or NaN when fewer than two non-NaN
        observations were seen.
    counts : 1-D int64 memoryview
        Incremented in place, once for every row whose label is
        non-negative. Its length determines how many rows of ``out`` are
        finalized.
    values : 2-D float32/float64 memoryview
        Input data; one row per observation.
    labels : 1-D int64 memoryview
        Group index of each row of ``values``. Rows carrying a negative
        label are ignored entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1, or if ``values`` and ``labels``
        differ in length.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp
        Py_ssize_t ngroups = len(counts)
        floating x, n_obs, prev
        ndarray[floating, ndim=2] seen, avg

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # Per (group, column): number of non-NaN observations and running mean.
    seen = np.zeros_like(out)
    avg = np.zeros_like(out)

    nrows, ncols = (<object>values).shape

    # ``out`` doubles as the accumulator of squared deviations.
    out[:, :] = 0.0

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp >= 0:
                counts[grp] += 1

                for col in range(ncols):
                    x = values[row, col]

                    # NaN is the only value that differs from itself
                    if x == x:
                        seen[grp, col] += 1
                        prev = avg[grp, col]
                        avg[grp, col] += (x - prev) / seen[grp, col]
                        out[grp, col] += (x - avg[grp, col]) * (x - prev)

        for row in range(ngroups):
            for col in range(ncols):
                n_obs = seen[row, col]
                if n_obs >= 2:
                    out[row, col] /= (n_obs - 1)
                else:
                    # variance with n - 1 in the denominator needs n >= 2
                    out[row, col] = NAN


group_var_float32 = _group_var['float']
group_var_float64 = _group_var['double']
545+
546+
547+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_mean(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute the per-group arithmetic mean of every column, skipping NaN.

    Parameters
    ----------
    out : 2-D float32/float64 memoryview
        Filled in place. ``out[g, c]`` receives the mean of the non-NaN
        entries of column ``c`` that belong to group ``g``, or NaN when
        no such entry was seen.
    counts : 1-D int64 memoryview
        Incremented in place, once for every row whose label is
        non-negative. Its length determines how many rows of ``out`` are
        written.
    values : 2-D float32/float64 memoryview
        Input data; one row per observation.
    labels : 1-D int64 memoryview
        Group index of each row of ``values``. Rows carrying a negative
        label are ignored entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1, or if ``values`` and ``labels``
        differ in length.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols, grp
        Py_ssize_t ngroups = len(counts)
        floating x, n_obs
        ndarray[floating, ndim=2] total, seen

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # Per (group, column): number of non-NaN observations and their sum.
    seen = np.zeros_like(out)
    total = np.zeros_like(out)

    nrows, ncols = (<object>values).shape

    with nogil:
        for row in range(nrows):
            grp = labels[row]
            if grp >= 0:
                counts[grp] += 1
                for col in range(ncols):
                    x = values[row, col]
                    # NaN is the only value that differs from itself
                    if x == x:
                        seen[grp, col] += 1
                        total[grp, col] += x

        for row in range(ngroups):
            for col in range(ncols):
                n_obs = seen[row, col]
                if n_obs != 0:
                    out[row, col] = total[row, col] / n_obs
                else:
                    out[row, col] = NAN


group_mean_float32 = _group_mean['float']
group_mean_float64 = _group_mean['double']
594+
595+
596+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_ohlc(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute open/high/low/close of a single column, group by group.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : 2-D float32/float64 memoryview with exactly 4 columns
        Filled in place. For group ``g`` the columns hold, in order, the
        first non-NaN value, the maximum, the minimum and the last non-NaN
        value. Groups without any non-NaN value are left as NaN.
    counts : 1-D int64 memoryview
        Incremented in place, once for every row whose label is
        non-negative (NaN rows included).
    values : 2-D float32/float64 memoryview
        Input data; only column 0 is read and more than one column is
        rejected.
    labels : 1-D int64 memoryview
        Group index of each row of ``values``. Rows carrying a negative
        label are ignored entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1, or if ``values`` and ``labels``
        differ in length.
    ValueError
        If ``out`` does not have 4 columns.
    NotImplementedError
        If ``values`` has more than one column.

    Notes
    -----
    Returns without touching ``out`` or ``counts`` when ``labels`` is
    empty.
    """
    cdef:
        Py_ssize_t i, N, K, lab
        floating val

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(labels) == 0:
        return

    N, K = (<object>values).shape

    if out.shape[1] != 4:
        raise ValueError('Output array must have 4 columns')

    if K > 1:
        raise NotImplementedError("Argument 'values' must have only "
                                  "one dimension")

    # Bounds checking is disabled, so a length mismatch would make the
    # loop below read past the end of ``labels``; reject it up front the
    # same way the other group_* kernels do.
    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    out[:] = np.nan

    with nogil:
        for i in range(N):
            lab = labels[i]
            # Skip every negative label, not just -1: with boundscheck and
            # wraparound disabled a negative index would touch memory
            # outside ``counts`` / ``out``.
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]
            # NaN observations are counted but do not affect the result
            if val != val:
                continue

            if out[lab, 0] != out[lab, 0]:
                # "open" is still NaN: first valid value for this group
                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
            else:
                out[lab, 1] = max(out[lab, 1], val)
                out[lab, 2] = min(out[lab, 2], val)
                out[lab, 3] = val


group_ohlc_float32 = _group_ohlc['float']
group_ohlc_float64 = _group_ohlc['double']
647+
436648
# generated from template
437649
include "groupby_helper.pxi"

0 commit comments

Comments
 (0)