From b65b3736ee464bcefe996b16ffff6848373de08f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Nov 2019 16:33:49 -0800 Subject: [PATCH 1/5] pre-empt ValueError by checking input length --- pandas/_libs/reduction.pyx | 8 +++++++- pandas/core/groupby/ops.py | 19 ++++++++++++------- pandas/tests/groupby/test_bin_groupby.py | 9 +++++++++ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 60c65a22603cd..d9dfdda0de035 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -325,6 +325,10 @@ cdef class SeriesGrouper: # safer obj._get_values(slice(None, 0)) assert dummy is not None + if len(series) == 0: + # get_result would never assign `result` + raise ValueError("SeriesGrouper requires non-empty `series`") + self.labels = labels self.f = f @@ -428,7 +432,9 @@ cdef class SeriesGrouper: vslider.reset() if result is None: - raise ValueError("No result.") + # This should no longer be reachable since we check for empty + # series in the constructor. + raise AssertionError("`result` has not been assigned.") if result.dtype == np.object_: result = maybe_convert_objects(result) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e438db6c620ec..0f6fe529b3615 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -583,7 +583,12 @@ def _transform( return result def agg_series(self, obj: Series, func): - if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": + if len(obj) == 0: + # SeriesGrouper would raise if we were to call _aggregate_series_fast + # 2019-11-09: 3 tests get here with len(obj) == 0 + return self._aggregate_series_pure_python(obj, func) + + elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # TODO: can we get a performant workaround for EAs backed by ndarray? @@ -597,10 +602,7 @@ def agg_series(self, obj: Series, func): try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "No result." in str(err): - # raised in libreduction - pass - elif "Function does not reduce" in str(err): + if "Function does not reduce" in str(err): # raised in libreduction pass else: @@ -608,8 +610,10 @@ def agg_series(self, obj: Series, func): return self._aggregate_series_pure_python(obj, func) def _aggregate_series_fast(self, obj, func): - # At this point we have already checked that obj.index is not a MultiIndex - # and that obj is backed by an ndarray, not ExtensionArray + # At this point we have already checked that + # - obj.index is not a MultiIndex + # - obj is backed by an ndarray, not ExtensionArray + # - len(obj) > 0 func = self._is_builtin_func(func) group_index, _, ngroups = self.group_info @@ -797,6 +801,7 @@ def groupings(self): ] def agg_series(self, obj: Series, func): + # 2019-11-09: 3 tests get here with len(obj) == 0 if is_extension_array_dtype(obj.dtype): # pre-empty SeriesBinGrouper from raising TypeError # TODO: watch out, this can return None diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 0e7a66769d2d4..589deec82a0cc 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -25,6 +25,15 @@ def test_series_grouper(): tm.assert_almost_equal(counts, exp_counts) +def test_series_grouper_requires_nonempty(): + obj = Series(np.random.randn(10)) + dummy = obj[:0] + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + with pytest.raises(ValueError, match="requires non-empty"): + libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) + + def test_series_bin_grouper(): obj = Series(np.random.randn(10)) dummy = obj[:0] From c2558504bd740a81723757eb8a11e050c72d5e61 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Nov 2019 18:37:49 -0800 Subject: [PATCH 2/5] dummy ot force ci From 06bac1895553125ebb15b5a1361650122d883e5e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Nov 2019 09:45:39 -0800 Subject: [PATCH 3/5] update per comments --- pandas/tests/groupby/test_bin_groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 589deec82a0cc..4ede6b165c691 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -25,12 +25,13 @@ def test_series_grouper(): tm.assert_almost_equal(counts, exp_counts) -def test_series_grouper_requires_nonempty(): +def test_series_grouper_requires_nonempty_raises(): + # GH#29500 obj = Series(np.random.randn(10)) dummy = obj[:0] labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - with pytest.raises(ValueError, match="requires non-empty"): + with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) From e4e96209610431488e1c970cf0028b65fc7c5941 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Nov 2019 09:46:10 -0800 Subject: [PATCH 4/5] update per comments --- pandas/core/groupby/ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0f6fe529b3615..df39ec59835a5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -585,7 +585,6 @@ def _transform( def agg_series(self, obj: Series, func): if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast - # 2019-11-09: 3 tests get here with len(obj) == 0 return self._aggregate_series_pure_python(obj, func) elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": @@ -801,7 +800,6 @@ def groupings(self): ] def agg_series(self, obj: Series, func): - # 2019-11-09: 3 tests get here with len(obj) == 0 if is_extension_array_dtype(obj.dtype): # pre-empty SeriesBinGrouper from raising TypeError # TODO: watch out, this can return None From e02ffd231140dc042a93c2b434d91875a43016c0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 10 Nov 2019 08:58:23 -0800 Subject: [PATCH 5/5] change check to assertion --- pandas/_libs/reduction.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index d9dfdda0de035..55e866dea9269 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -431,10 +431,9 @@ cdef class SeriesGrouper: islider.reset() vslider.reset() - if result is None: - # This should no longer be reachable since we check for empty - # series in the constructor. - raise AssertionError("`result` has not been assigned.") + # We check for empty series in the constructor, so should always + # have result initialized by this point. + assert result is not None, "`result` has not been assigned." if result.dtype == np.object_: result = maybe_convert_objects(result)