
REF: remove libreduction.SeriesGrouper #43505


Merged: 1 commit, Sep 11, 2021
197 changes: 0 additions & 197 deletions pandas/_libs/reduction.pyx
@@ -34,166 +34,6 @@ cpdef check_result_array(object obj, object dtype):
raise ValueError("Must produce aggregated value")


cdef class _BaseGrouper:
cdef _check_dummy(self, object dummy):
# both values and index must be an ndarray!

values = dummy.values
# GH 23683: datetimetz types are equivalent to datetime types here
if (dummy.dtype != self.arr.dtype
and values.dtype != self.arr.dtype):
raise ValueError('Dummy array must be same dtype')
if is_array(values) and not values.flags.contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy()
index = dummy.index.values
if not index.flags.contiguous:
index = index.copy()

return values, index

cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider):
"""
Create Series and Index objects that we will alter in-place while iterating.
"""
cached_index = self.ityp(islider.buf, dtype=self.idtype)
cached_series = self.typ(
vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name
)
return cached_index, cached_series

cdef inline _update_cached_objs(self, object cached_series, object cached_index,
Slider islider, Slider vslider):
cached_index._engine.clear_mapping()
cached_index._cache.clear() # e.g. inferred_freq must go
cached_series._mgr.set_values(vslider.buf)

cdef inline object _apply_to_group(self,
object cached_series, object cached_index,
bint initialized):
"""
Call self.f on our new group, then update to the next group.
"""
cdef:
object res

# NB: we assume that _update_cached_objs has already cleared
# the cache and engine mapping
res = self.f(cached_series)
res = extract_result(res)
if not initialized:
# On the first pass, we check the output shape to see
# if this looks like a reduction.
initialized = True
check_result_array(res, cached_series.dtype)

return res, initialized


cdef class SeriesGrouper(_BaseGrouper):
"""
Performs generic grouping operation while avoiding ndarray construction
overhead
"""
cdef:
Py_ssize_t nresults, ngroups

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object f, labels, values, typ, ityp, name, idtype

def __init__(self, object series, object f, ndarray[intp_t] labels,
Py_ssize_t ngroups):

if len(series) == 0:
# get_result would never assign `result`
raise ValueError("SeriesGrouper requires non-empty `series`")

self.labels = labels
self.f = f

values = series.values
if is_array(values) and not values.flags.c_contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy('C')
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

dummy = series.iloc[:0]
self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
self.ngroups = ngroups

def get_result(self):
cdef:
# Define result to avoid UnboundLocalError
ndarray arr, result = None
ndarray[intp_t] labels
ndarray[int64_t] counts
Py_ssize_t i, n, group_size, lab, start, end
object res
bint initialized = 0
Slider vslider, islider
object cached_series = None, cached_index = None

labels = self.labels
counts = np.zeros(self.ngroups, dtype=np.int64)
group_size = 0
n = len(self.arr)

vslider = Slider(self.arr, self.dummy_arr)
islider = Slider(self.index, self.dummy_index)

result = np.empty(self.ngroups, dtype='O')

cached_index, cached_series = self._init_dummy_series_and_index(
islider, vslider
)

start = 0
try:
for i in range(n):
group_size += 1

lab = labels[i]

if i == n - 1 or lab != labels[i + 1]:
if lab == -1:
start += group_size
group_size = 0
continue

end = start + group_size
islider.move(start, end)
vslider.move(start, end)

self._update_cached_objs(
cached_series, cached_index, islider, vslider)

res, initialized = self._apply_to_group(cached_series, cached_index,
initialized)

start += group_size

result[lab] = res
counts[lab] = group_size
group_size = 0

finally:
# so we don't free the wrong memory
islider.reset()
vslider.reset()

# We check for empty series in the constructor, so should always
# have result initialized by this point.
assert initialized, "`result` has not been initialized."

return result, counts
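
The loop in get_result above is a single pass over sorted labels: each run of equal labels forms one group, and runs labeled -1 (dropped keys) are skipped. For readers who want the algorithm outside Cython, a minimal pure-Python sketch (illustrative only; group_apply is a hypothetical name, not part of this PR):

    import numpy as np

    def group_apply(values, labels, ngroups, func):
        # labels must be sorted; each run of equal labels is one group,
        # and -1 marks rows whose group key was dropped
        result = np.empty(ngroups, dtype=object)
        counts = np.zeros(ngroups, dtype=np.int64)
        start = 0
        n = len(values)
        for i in range(n):
            lab = labels[i]
            if i == n - 1 or lab != labels[i + 1]:  # end of a run
                if lab != -1:
                    result[lab] = func(values[start : i + 1])
                    counts[lab] = i + 1 - start
                start = i + 1
        return result, counts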


cpdef inline extract_result(object res):
""" extract the result object, it might be a 0-dim ndarray
or a len-1 ndarray, or a scalar """
@@ -208,40 +48,3 @@ cpdef inline extract_result(object res):
# see test_resampler_grouper.py::test_apply
res = res[0]
return res


cdef class Slider:
"""
Only handles contiguous data for now
"""
cdef:
ndarray values, buf
Py_ssize_t stride
char *orig_data

def __init__(self, ndarray values, ndarray buf):
assert values.ndim == 1
assert values.dtype == buf.dtype

if not values.flags.contiguous:
values = values.copy()

self.values = values
self.buf = buf

self.stride = values.strides[0]
self.orig_data = self.buf.data

self.buf.data = self.values.data
self.buf.strides[0] = self.stride

cdef move(self, int start, int end):
"""
For slicing
"""
self.buf.data = self.values.data + self.stride * start
self.buf.shape[0] = end - start

cdef reset(self):
self.buf.data = self.orig_data
self.buf.shape[0] = 0
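
The Slider above achieved zero-copy windows by repointing a dummy ndarray's data pointer into a contiguous buffer. In pure NumPy the equivalent is ordinary basic slicing, which already returns views and is what the surviving pure-Python path relies on. A small illustration (not from this PR):

    import numpy as np

    values = np.arange(10.0)      # contiguous, as Slider required
    window = values[3:6]          # basic slicing returns a view, no copy
    assert window.base is values  # the window shares values' memory
    window[:] = 0.0               # writes through to `values`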
36 changes: 1 addition & 35 deletions pandas/core/groupby/ops.py
@@ -82,7 +82,6 @@
BaseMaskedArray,
BaseMaskedDtype,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import grouper
@@ -933,10 +932,6 @@ def agg_series(
result = self._aggregate_series_pure_python(obj, func)

elif not isinstance(obj._values, np.ndarray):
# _aggregate_series_fast would raise TypeError when
# calling libreduction.Slider
# In the datetime64tz case it would incorrectly cast to tz-naive
# TODO: can we get a performant workaround for EAs backed by ndarray?
result = self._aggregate_series_pure_python(obj, func)

# we can preserve a little bit more aggressively with EA dtype
@@ -945,17 +940,8 @@
# is sufficiently strict that it casts appropriately.
preserve_dtype = True

elif obj.index._has_complex_internals:
# Preempt TypeError in _aggregate_series_fast
result = self._aggregate_series_pure_python(obj, func)

elif isinstance(self, BinGrouper):
# Not yet able to remove the BaseGrouper aggregate_series_fast,
# as test_crosstab.test_categorical breaks without it
result = self._aggregate_series_pure_python(obj, func)

else:
result = self._aggregate_series_fast(obj, func)
result = self._aggregate_series_pure_python(obj, func)

npvalues = lib.maybe_convert_objects(result, try_float=False)
if preserve_dtype:
@@ -964,23 +950,6 @@
out = npvalues
return out

def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]:
# At this point we have already checked that
# - obj.index is not a MultiIndex
# - obj is backed by an ndarray, not ExtensionArray
# - len(obj) > 0
func = com.is_builtin_func(func)

ids, _, ngroups = self.group_info

# avoids object / Series creation overhead
indexer = get_group_index_sorter(ids, ngroups)
obj = obj.take(indexer)
ids = ids.take(indexer)
sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
result, _ = sgrouper.get_result()
return result

@final
def _aggregate_series_pure_python(
self, obj: Series, func: F
@@ -995,9 +964,6 @@ def _aggregate_series_pure_python(
splitter = get_splitter(obj, ids, ngroups, axis=0)

for i, group in enumerate(splitter):

# Each step of this loop corresponds to
# libreduction._BaseGrouper._apply_to_group
res = func(group)
res = libreduction.extract_result(res)

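With the fast path removed, every Python-level UDF aggregation now funnels through _aggregate_series_pure_python and then lib.maybe_convert_objects. A rough illustration of the code path this exercises (the data and the lambda are invented for the example):

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])
    # a Python UDF like this lambda reaches agg_series, which now always
    # dispatches to the pure-Python splitter loop shown above
    out = s.groupby(level=0).agg(lambda x: x.max() - x.min())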
47 changes: 1 addition & 46 deletions pandas/tests/groupby/test_bin_groupby.py
@@ -1,58 +1,13 @@
import numpy as np
import pytest

from pandas._libs import (
lib,
reduction as libreduction,
)
from pandas._libs import lib
import pandas.util._test_decorators as td

import pandas as pd
from pandas import Series
import pandas._testing as tm


def test_series_grouper():
obj = Series(np.random.randn(10))

labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2)
result, counts = grouper.get_result()

expected = np.array([obj[3:6].mean(), obj[6:].mean()], dtype=object)
tm.assert_almost_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_result_length_difference():
# GH 40014
obj = Series(np.random.randn(10), dtype="float64")
obj.index = obj.index.astype("O")
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
result, counts = grouper.get_result()

expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object)
tm.assert_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty_raises():
# GH#29500
obj = Series(np.random.randn(10))
dummy = obj.iloc[:0]
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"):
libreduction.SeriesGrouper(dummy, np.mean, labels, 2)


def assert_block_lengths(x):
assert len(x) == len(x._mgr.blocks[0].mgr_locs)
return 0
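The deleted tests drove SeriesGrouper directly; the behavior they covered remains reachable through the public groupby API. A sketch of what test_series_grouper verified, rewritten against groupby (this exact test is not added by the PR):

    import numpy as np
    import pandas as pd
    import pandas._testing as tm

    obj = pd.Series(np.random.randn(10))
    # NaN keys play the role of the -1 labels above: groupby drops them
    keys = pd.Series([np.nan] * 3 + [0] * 3 + [1] * 4)

    result = obj.groupby(keys).mean()
    expected = pd.Series([obj[3:6].mean(), obj[6:].mean()], index=[0.0, 1.0])
    tm.assert_series_equal(result, expected)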
2 changes: 1 addition & 1 deletion pandas/tests/reshape/test_crosstab.py
@@ -811,7 +811,7 @@ def test_categoricals(a_dtype, b_dtype):
a_is_cat = is_categorical_dtype(a.dtype)
assert not a_is_cat or a.value_counts().loc[1] == 0
result = crosstab(a, b, margins=True, dropna=False)
values = [[18, 16, 34], [0, 0, np.nan], [34, 32, 66], [52, 48, 100]]
values = [[18, 16, 34], [0, 0, 0], [34, 32, 66], [52, 48, 100]]
Contributor: may want to add a note for this (I agree that this looks more correct)

expected = DataFrame(values, index, columns)
if not a_is_cat:
expected = expected.loc[[0, 2, "All"]]
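
For context on the changed expectation: with dropna=False an unobserved categorical level keeps its row, and its counts, including the All margin, now come out as 0 rather than NaN, since an empty group counts zero. A hedged illustration in the spirit of the test above (the exact frame layout may differ):

    import pandas as pd

    a = pd.Categorical([0] * 3 + [2] * 3, categories=[0, 1, 2])
    b = [10, 10, 20, 10, 20, 20]
    # category 1 never occurs; dropna=False keeps its row, whose counts
    # (margin included) are 0 rather than NaN after this change
    print(pd.crosstab(a, b, margins=True, dropna=False))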
Expand Down