Skip to content

Commit 9d11734

Browse files
committed
PERF: Cythonize groupby transforms #4095
1 parent 35b159d commit 9d11734

File tree

7 files changed

+614
-95
lines changed

7 files changed

+614
-95
lines changed

asv_bench/benchmarks/groupby.py

+16
Original file line numberDiff line numberDiff line change
@@ -772,3 +772,19 @@ def setup(self):
772772

773773
def time_groupby_transform_series2(self):
774774
self.df.groupby('id')['val'].transform(np.mean)
775+
776+
class groupby_transform_cythonized(object):
777+
goal_time = 0.2
778+
779+
def setup(self):
780+
np.random.seed(0)
781+
self.df = DataFrame({'id': (np.arange(100000) / 3), 'val': np.random.randn(100000), })
782+
783+
def time_groupby_transform_cumprod(self):
784+
self.df.groupby('id').cumprod()
785+
786+
def time_groupby_transform_cumsum(self):
787+
self.df.groupby('id').cumsum()
788+
789+
def time_groupby_transform_shift(self):
790+
self.df.groupby('id').shift()

doc/source/whatsnew/v0.17.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ Performance Improvements
9393
- Performance bug in repr of ``Categorical`` categories, which was rendering the strings before chopping them for display (:issue:`11305`)
9494
- Improved performance of ``Series`` constructor with no data and ``DatetimeIndex`` (:issue:`11433`)
9595

96+
- Improved performance of ``shift``, ``cumprod``, and ``cumsum`` with groupby (:issue:`4095`)
97+
9698
.. _whatsnew_0171.bug_fixes:
9799

98100

pandas/algos.pyx

-38
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ cdef np.float64_t MAXfloat64 = np.inf
5050
cdef double NaN = <double> np.NaN
5151
cdef double nan = NaN
5252

53-
5453
cdef inline int int_max(int a, int b): return a if a >= b else b
5554
cdef inline int int_min(int a, int b): return a if a <= b else b
5655

@@ -2266,43 +2265,6 @@ def group_last_bin_object(ndarray[object, ndim=2] out,
22662265
else:
22672266
out[i, j] = resx[i, j]
22682267

2269-
2270-
2271-
#----------------------------------------------------------------------
2272-
# median
2273-
2274-
def group_median(ndarray[float64_t, ndim=2] out,
2275-
ndarray[int64_t] counts,
2276-
ndarray[float64_t, ndim=2] values,
2277-
ndarray[int64_t] labels):
2278-
'''
2279-
Only aggregates on axis=0
2280-
'''
2281-
cdef:
2282-
Py_ssize_t i, j, N, K, ngroups, size
2283-
ndarray[int64_t] _counts
2284-
ndarray data
2285-
float64_t* ptr
2286-
ngroups = len(counts)
2287-
N, K = (<object> values).shape
2288-
2289-
indexer, _counts = groupsort_indexer(labels, ngroups)
2290-
counts[:] = _counts[1:]
2291-
2292-
data = np.empty((K, N), dtype=np.float64)
2293-
ptr = <float64_t*> data.data
2294-
2295-
take_2d_axis1_float64_float64(values.T, indexer, out=data)
2296-
2297-
for i in range(K):
2298-
# exclude NA group
2299-
ptr += _counts[0]
2300-
for j in range(ngroups):
2301-
size = _counts[j + 1]
2302-
out[j, i] = _median_linear(ptr, size)
2303-
ptr += size
2304-
2305-
23062268
cdef inline float64_t _median_linear(float64_t* a, int n):
23072269
cdef int i, j, na_count = 0
23082270
cdef float64_t result

0 commit comments

Comments
 (0)