Skip to content

PERF: Cythonize groupby transforms #4095 #10901

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,3 +772,19 @@ def setup(self):

# Benchmark: group ``self.df`` by 'id' and apply ``np.mean`` as a transform to the 'val' column.
def time_groupby_transform_series2(self):
self.df.groupby('id')['val'].transform(np.mean)

class groupby_transform_cythonized(object):
    """Benchmarks for groupby ``cumprod``, ``cumsum`` and ``shift``.

    Each timing method runs one transform on a 100000-row frame grouped
    by the 'id' column.
    """
    goal_time = 0.2

    def setup(self):
        """Build the frame: integer group key 'id' and random float 'val'."""
        np.random.seed(0)
        n = 100000
        # Floor division keeps 'id' an integer key with three rows per group
        # on both Python 2 and Python 3.  True division ("/") would produce
        # 100000 distinct float keys on Python 3, so every group would hold a
        # single row and the benchmark would not exercise grouped transforms.
        self.df = DataFrame({'id': np.arange(n) // 3,
                             'val': np.random.randn(n)})

    def time_groupby_transform_cumprod(self):
        """Time the grouped cumulative product."""
        self.df.groupby('id').cumprod()

    def time_groupby_transform_cumsum(self):
        """Time the grouped cumulative sum."""
        self.df.groupby('id').cumsum()

    def time_groupby_transform_shift(self):
        """Time the grouped shift."""
        self.df.groupby('id').shift()
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ Performance Improvements
- Performance bug in repr of ``Categorical`` categories, which was rendering the strings before chopping them for display (:issue:`11305`)
- Improved performance of ``Series`` constructor with no data and ``DatetimeIndex`` (:issue:`11433`)

- Improved performance of ``shift``, ``cumprod``, and ``cumsum`` with groupby (:issue:`4095`)

.. _whatsnew_0171.bug_fixes:


Expand Down
38 changes: 0 additions & 38 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ cdef np.float64_t MAXfloat64 = np.inf
# Module-level C doubles holding NaN; ``nan`` is a second name initialised from ``NaN``.
cdef double NaN = <double> np.NaN
cdef double nan = NaN


cdef inline int int_max(int a, int b):
    # Return the larger of two C ints (``a`` when they are equal).
    if a >= b:
        return a
    return b
cdef inline int int_min(int a, int b):
    # Return the smaller of two C ints (``a`` when they are equal).
    if a <= b:
        return a
    return b

Expand Down Expand Up @@ -2266,43 +2265,6 @@ def group_last_bin_object(ndarray[object, ndim=2] out,
else:
out[i, j] = resx[i, j]



#----------------------------------------------------------------------
# median

def group_median(ndarray[float64_t, ndim=2] out,
ndarray[int64_t] counts,
ndarray[float64_t, ndim=2] values,
ndarray[int64_t] labels):
'''
Write the per-group median of every column of `values` into `out`.

out    : 2-D float64 array; out[j, i] receives the result for group j,
         column i
counts : 1-D int64 array; its length defines the number of groups and it
         is overwritten with the per-group counts (leading slot dropped)
values : 2-D float64 array of shape (N, K)
labels : 1-D int64 array handed to `groupsort_indexer`
         (presumably one group id per row -- confirm against that helper)

Only aggregates on axis=0
'''
cdef:
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray data
float64_t* ptr
# The number of groups is taken from the length of the caller's counts array.
ngroups = len(counts)
N, K = (<object> values).shape

# `groupsort_indexer` is defined elsewhere; _counts has one more entry than
# `counts`, and its first entry is treated below as the NA group.
indexer, _counts = groupsort_indexer(labels, ngroups)
counts[:] = _counts[1:]

# Scratch buffer with one row per column of `values`; `ptr` walks it as raw
# float64 memory.
data = np.empty((K, N), dtype=np.float64)
ptr = <float64_t*> data.data

# Fill `data` from the transposed values using `indexer` (helper defined
# elsewhere; presumably this leaves each group's entries contiguous within
# a row -- confirm).
take_2d_axis1_float64_float64(values.T, indexer, out=data)

# `ptr` is never reset: it advances monotonically through `data`, one
# column (row of `data`) after another.
for i in range(K):
# exclude NA group
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
# Median of the next `size` contiguous values, then step past them.
out[j, i] = _median_linear(ptr, size)
ptr += size


cdef inline float64_t _median_linear(float64_t* a, int n):
cdef int i, j, na_count = 0
cdef float64_t result
Expand Down
Loading