REF: Pre-empt ValueError in _aggregate_series_fast #29500

Merged · 8 commits · Nov 12, 2019
8 changes: 7 additions & 1 deletion pandas/_libs/reduction.pyx
@@ -325,6 +325,10 @@ cdef class SeriesGrouper:
# safer obj._get_values(slice(None, 0))
assert dummy is not None

if len(series) == 0:
# get_result would never assign `result`
raise ValueError("SeriesGrouper requires non-empty `series`")

self.labels = labels
self.f = f

@@ -428,7 +432,9 @@ cdef class SeriesGrouper:
vslider.reset()

if result is None:
raise ValueError("No result.")
# This should no longer be reachable since we check for empty
# series in the constructor.
raise AssertionError("`result` has not been assigned.")

if result.dtype == np.object_:
result = maybe_convert_objects(result)
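For context, a minimal sketch of the new guard (mirroring the test added to pandas/tests/groupby/test_bin_groupby.py below; `libreduction` is the private Cython module this diff touches): constructing a SeriesGrouper with an empty series now fails up front with a descriptive ValueError instead of surfacing "No result." from get_result.

```python
import numpy as np
import pandas as pd
from pandas._libs import reduction as libreduction

obj = pd.Series(np.random.randn(10))
dummy = obj[:0]  # empty slice, as in the test below
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

try:
    # passing the empty `dummy` as the series triggers the new constructor check
    libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy)
except ValueError as err:
    assert "requires non-empty" in str(err)
```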
19 changes: 12 additions & 7 deletions pandas/core/groupby/ops.py
@@ -583,7 +583,12 @@ def _transform(
return result

def agg_series(self, obj: Series, func):
if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
if len(obj) == 0:
# SeriesGrouper would raise if we were to call _aggregate_series_fast
Member:

Hmm should we not just fix _aggregate_series_fast to not raise in that case then? Hoping to avoid special casing like this in any groupby functions

Member (Author):

> Hoping to avoid special casing like this in any groupby functions

Agreed on the goal. ATM this is the best way to make progress towards that goal. In particular, making this explicit here is much clearer than catching the ValueError with the particular message on L600.

> should we not just fix _aggregate_series_fast to not raise in that case then?

#29499 does something along those lines.

# 2019-11-09: 3 tests get here with len(obj) == 0
return self._aggregate_series_pure_python(obj, func)

elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
# _aggregate_series_fast would raise TypeError when
# calling libreduction.Slider
# TODO: can we get a performant workaround for EAs backed by ndarray?
@@ -597,19 +602,18 @@ def agg_series(self, obj: Series, func):
try:
return self._aggregate_series_fast(obj, func)
except ValueError as err:
if "No result." in str(err):
# raised in libreduction
pass
elif "Function does not reduce" in str(err):
if "Function does not reduce" in str(err):
# raised in libreduction
pass
else:
raise
return self._aggregate_series_pure_python(obj, func)

def _aggregate_series_fast(self, obj, func):
# At this point we have already checked that obj.index is not a MultiIndex
# and that obj is backed by an ndarray, not ExtensionArray
# At this point we have already checked that
# - obj.index is not a MultiIndex
# - obj is backed by an ndarray, not ExtensionArray
# - len(obj) > 0
func = self._is_builtin_func(func)

group_index, _, ngroups = self.group_info
@@ -797,6 +801,7 @@ def groupings(self):
]

def agg_series(self, obj: Series, func):
# 2019-11-09: 3 tests get here with len(obj) == 0
if is_extension_array_dtype(obj.dtype):
# pre-empt SeriesBinGrouper from raising TypeError
# TODO: watch out, this can return None
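Pulling the hunks above together, here is a simplified, self-contained sketch of the post-change control flow in agg_series. It is a paraphrase of the diff, not the actual pandas implementation: `agg_series_dispatch`, `fast`, and `pure_python` are illustrative stand-ins for the method and its `_aggregate_series_fast` / `_aggregate_series_pure_python` helpers.

```python
import pandas as pd
from pandas.api.types import is_extension_array_dtype


def agg_series_dispatch(obj: pd.Series, func, fast, pure_python):
    """Paraphrase of BaseGrouper.agg_series after this PR."""
    if len(obj) == 0:
        # SeriesGrouper now raises on empty input, so skip the fast path
        return pure_python(obj, func)
    elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
        # the Cython fast path needs an ndarray-backed Series
        return pure_python(obj, func)
    try:
        return fast(obj, func)
    except ValueError as err:
        if "Function does not reduce" not in str(err):
            # only the known libreduction message falls back; anything else re-raises
            raise
    return pure_python(obj, func)
```

The net effect is that the except clause no longer needs to recognise the "No result." message at all.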
9 changes: 9 additions & 0 deletions pandas/tests/groupby/test_bin_groupby.py
@@ -25,6 +25,15 @@ def test_series_grouper():
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty():
obj = Series(np.random.randn(10))
dummy = obj[:0]
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64)

with pytest.raises(ValueError, match="requires non-empty"):
libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy)


def test_series_bin_grouper():
obj = Series(np.random.randn(10))
dummy = obj[:0]
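As a user-level illustration (an assumed reproduction, not taken from this PR), an aggregation over an empty Series is the kind of case the diff comments note reaching agg_series with len(obj) == 0; after this change it goes straight to the pure-python path rather than relying on the fast path raising.

```python
import pandas as pd

ser = pd.Series([], dtype="float64")
# zero rows: agg_series sees len(obj) == 0 and uses the pure-python fallback
out = ser.groupby(ser.values).agg(lambda x: x.sum())
print(out)  # empty result, no "No result." ValueError involved
```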