From 67c1c8dd59e35637297e37d6ac3db42213cad444 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 28 May 2023 11:19:13 -0400 Subject: [PATCH 1/2] POC: Don't special case Python builtin and NumPy functions --- pandas/core/apply.py | 10 ++++-- pandas/core/common.py | 34 +++++++++++++++++++ pandas/tests/apply/test_frame_apply.py | 5 +-- pandas/tests/groupby/aggregate/test_cython.py | 5 ++- pandas/tests/groupby/test_apply.py | 7 ++-- pandas/tests/groupby/test_categorical.py | 16 +++------ pandas/tests/groupby/test_function.py | 25 ++++++++------ pandas/tests/groupby/test_groupby.py | 18 ++++++---- pandas/tests/groupby/test_raises.py | 6 ++-- .../tests/groupby/transform/test_transform.py | 20 +++++------ pandas/tests/resample/test_resample_api.py | 16 +++++---- pandas/tests/reshape/test_pivot.py | 4 +-- pandas/tests/window/test_api.py | 14 ++++---- 13 files changed, 112 insertions(+), 68 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c03f1a268906e..346fb9c822fc2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1105,10 +1105,14 @@ def agg(self): # we cannot FIRST try the vectorized evaluation, because # then .agg and .apply would have different semantics if the # operation is actually defined on the Series, e.g. str - try: - result = self.obj.apply(f) - except (ValueError, AttributeError, TypeError): + has_cython_func = f in com._orig_cython_table + if has_cython_func and not self.args and not self.kwargs: result = f(self.obj) + else: + try: + result = self.obj.apply(f) + except (ValueError, AttributeError, TypeError): + result = f(self.obj) return result diff --git a/pandas/core/common.py b/pandas/core/common.py index ee8fe220698b5..d23e7581ea3f9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -560,12 +560,46 @@ def require_length_match(data, index: Index) -> None: # whereas np.min and np.max (which directly call obj.min and obj.max) # default to axis=None. 
_builtin_table = { + # builtins.sum: np.sum, + # builtins.max: np.maximum.reduce, + # builtins.min: np.minimum.reduce, +} + +_orig_builtin_table = { builtins.sum: np.sum, builtins.max: np.maximum.reduce, builtins.min: np.minimum.reduce, } _cython_table = { + # builtins.sum: "sum", + # builtins.max: "max", + # builtins.min: "min", + # np.all: "all", + # np.any: "any", + # np.sum: "sum", + # np.nansum: "sum", + # np.mean: "mean", + # np.nanmean: "mean", + # np.prod: "prod", + # np.nanprod: "prod", + # np.std: "std", + # np.nanstd: "std", + # np.var: "var", + # np.nanvar: "var", + # np.median: "median", + # np.nanmedian: "median", + # np.max: "max", + # np.nanmax: "max", + # np.min: "min", + # np.nanmin: "min", + # np.cumprod: "cumprod", + # np.nancumprod: "cumprod", + # np.cumsum: "cumsum", + # np.nancumsum: "cumsum", +} + +_orig_cython_table = { builtins.sum: "sum", builtins.max: "max", builtins.min: "min", diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index fc8b57d26a5be..bac38e319a6ac 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1502,13 +1502,14 @@ def foo2(x, b=2, c=0): def test_agg_std(): df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"]) + expected_value = 1.632993161855452 result = df.agg(np.std) - expected = Series({"A": 2.0, "B": 2.0}, dtype=float) + expected = Series({"A": expected_value, "B": expected_value}, dtype=float) tm.assert_series_equal(result, expected) result = df.agg([np.std]) - expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"]) + expected = DataFrame({"A": expected_value, "B": expected_value}, index=["std"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 2fb7c8eb03bb0..9168daf23cc64 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -181,7 +181,10 @@ def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) + kwargs = {"ddof": 0} if op == "var" else {} + result = df.groupby(labels)._cython_agg_general( + op, alt=None, numeric_only=True, **kwargs + ) expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 832192d8a33e6..cce810aa11807 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1072,17 +1072,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py 
b/pandas/tests/groupby/test_categorical.py index c0704d9684574..c75460690e114 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1234,8 +1234,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): ).sortlevel() expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") - if operation == "agg": - expected = expected.fillna(0, downcast="infer") + # if operation == "agg": + # expected = expected.fillna(0, downcast="infer") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1676,21 +1676,15 @@ def test_categorical_transform(): "OnTheWay", "Waiting", ], - "last_status": [ - "Delivered", - "Delivered", - "Delivered", - "OnTheWay", - "OnTheWay", - "Waiting", - ], + # max doesn't take into account Categorical dtype + "last_status": "Waiting", } ) expected["status"] = expected["status"].astype(delivery_status_type) # .transform(max) should preserve ordered categoricals - expected["last_status"] = expected["last_status"].astype(delivery_status_type) + # expected["last_status"] = expected["last_status"].astype(delivery_status_type) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 98fce9d668e44..af8de861296ef 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -58,7 +58,7 @@ def test_intercept_builtin_sum(): result = grouped.agg(builtins.sum) result2 = grouped.apply(builtins.sum) - expected = grouped.sum() + expected = Series({0: 1.0, 1: 2.0, 2: np.nan}) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) @@ -74,17 +74,18 @@ def test_builtins_apply(keys, f): fname = f.__name__ - warn = None if f is not sum else FutureWarning - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + if fname == "sum": + with pytest.raises(TypeError, match="unsupported operand type"): + gb.apply(f) + else: result = gb.apply(f) - ngroups = len(df.drop_duplicates(subset=keys)) - - assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" - assert result.shape == (ngroups, 3), assert_msg + expected = Series({idx: f(group) for idx, group in gb}) + expected.index.names = keys if isinstance(keys, list) else [keys] + tm.assert_series_equal(result, expected) npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) + result = gb.apply(npfunc) + expected = gb.apply(lambda x: getattr(x, fname)()).astype(float) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(None): @@ -683,7 +684,11 @@ def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = getattr(df.groupby(labels), op)() + if op in ("std", "var"): + kwargs = {"ddof": 0} + else: + kwargs = {} + result = getattr(df.groupby(labels), op)(**kwargs) expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0c6661b49d917..44143bdb8b7e2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -82,7 +82,13 @@ def test_basic_aggregations(dtype): tm.assert_series_equal(agged, grouped.agg(np.mean)) # shorthand 
tm.assert_series_equal(agged, grouped.mean()) - tm.assert_series_equal(grouped.agg(np.sum), grouped.sum()) + + result = grouped.agg(np.sum) + expected = grouped.sum() + if dtype == "int32": + # NumPy sums int32 to int64 + expected = expected.astype("int64") + tm.assert_series_equal(result, expected) expected = grouped.apply(lambda x: x * x.sum()) transformed = grouped.transform(lambda x: x * x.sum()) @@ -753,11 +759,8 @@ def test_groupby_as_index_agg(df): gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - res = gr.apply(sum) - with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - alt = df.groupby(ts).apply(sum) + res = gr.apply(np.sum) + alt = df.groupby(ts).apply(np.sum) tm.assert_frame_equal(res, alt) for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: @@ -923,9 +926,10 @@ def test_raises_on_nuisance(df): df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - msg = "datetime64 type does not support sum operations" + msg = "does not support reduction 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg(np.sum) + msg = "datetime64 type does not support sum operations" with pytest.raises(TypeError, match=msg): grouped.sum() diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6fb903b02b62f..499b06805f4e6 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -329,7 +329,7 @@ def test_groupby_raises_datetime_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "datetime64 type does not support sum operations"), + np.sum: (TypeError, "does not support reduction"), np.mean: (None, ""), }[groupby_func_np] @@ -519,10 +519,10 @@ def test_groupby_raises_category_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "category type does not support sum operations"), + np.sum: (TypeError, "category does not support reduction"), np.mean: ( TypeError, - "category dtype does not support aggregation 'mean'", + "category does not support reduction", ), }[groupby_func_np] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 09b24284d3b37..67e1403aca380 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -471,22 +471,22 @@ def test_series_fast_transform_date(): tm.assert_series_equal(result, expected) -def test_transform_length(): +@pytest.mark.parametrize("op", [sum, np.nansum]) +def test_transform_length(op): # GH 9697 df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) - expected = Series([3.0] * 4) - - def nsum(x): - return np.nansum(x) + if op is sum: + values = [3.0, 3.0, np.nan, np.nan] + else: + values = [3.0, 3.0, 3.0, 3.0] + expected = Series(values, name="col2") results = [ - df.groupby("col1").transform(sum)["col2"], - df.groupby("col1")["col2"].transform(sum), - df.groupby("col1").transform(nsum)["col2"], - df.groupby("col1")["col2"].transform(nsum), + df.groupby("col1").transform(op)["col2"], + df.groupby("col1")["col2"].transform(op), ] for result in results: - tm.assert_series_equal(result, expected, check_names=False) + tm.assert_series_equal(result, expected) def test_transform_coercion(): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 
793e31295b8c3..7e7cbb6594d53 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -382,19 +382,21 @@ def test_agg(): ] a_mean = r["A"].mean() - a_std = r["A"].std() + a_std_ddof0 = r["A"].std(ddof=0) + a_std_ddof1 = r["A"].std(ddof=1) a_sum = r["A"].sum() b_mean = r["B"].mean() - b_std = r["B"].std() + b_std_ddof0 = r["B"].std(ddof=0) + b_std_ddof1 = r["B"].std(ddof=1) b_sum = r["B"].sum() - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected = pd.concat([a_mean, a_std_ddof0, b_mean, b_std_ddof0], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: # In case 2, "date" is an index and a column, so get included in the agg if t == cases[2]: date_mean = t["date"].mean() - date_std = t["date"].std() + date_std = t["date"].std(ddof=0) exp = pd.concat([date_mean, date_std, expected], axis=1) exp.columns = pd.MultiIndex.from_product( [["date", "A", "B"], ["mean", "std"]] @@ -405,7 +407,7 @@ def test_agg(): result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, b_std], axis=1) + expected = pd.concat([a_mean, b_std_ddof0], axis=1) for t in cases: result = t.aggregate({"A": np.mean, "B": np.std}) tm.assert_frame_equal(result, expected, check_like=True) @@ -416,7 +418,7 @@ def test_agg(): result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std)) tm.assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([a_mean, a_std], axis=1) + expected = pd.concat([a_mean, a_std_ddof1], axis=1) expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) for t in cases: result = t.aggregate({"A": ["mean", "std"]}) @@ -449,7 +451,7 @@ def test_agg(): } ) - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected = pd.concat([a_mean, a_std_ddof1, b_mean, b_std_ddof1], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] ) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b40f0f7a45263..1896db3ce9294 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2023,10 +2023,7 @@ def test_pivot_string_as_func(self): [ ("sum", np.sum), ("mean", np.mean), - ("std", np.std), (["sum", "mean"], [np.sum, np.mean]), - (["sum", "std"], [np.sum, np.std]), - (["std", "mean"], [np.std, np.mean]), ], ) def test_pivot_string_func_vs_func(self, f, f_numpy, data): @@ -2035,6 +2032,7 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data): data = data.drop(columns="C") result = pivot_table(data, index="A", columns="B", aggfunc=f) expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) + tm.assert_frame_equal(result, expected) @pytest.mark.slow diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index d6cca5061671b..198d90d746336 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -80,23 +80,25 @@ def test_agg(step): r = df.rolling(window=3, step=step) a_mean = r["A"].mean() - a_std = r["A"].std() + a_std_ddof0 = r["A"].std(ddof=0) + a_std_ddof1 = r["A"].std(ddof=1) a_sum = r["A"].sum() b_mean = r["B"].mean() - b_std = r["B"].std() + b_std_ddof0 = r["B"].std(ddof=0) + b_std_ddof1 = r["B"].std(ddof=1) result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected = concat([a_mean, a_std_ddof0, b_mean, b_std_ddof0], axis=1) 
expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) result = r.aggregate({"A": np.mean, "B": np.std}) - expected = concat([a_mean, b_std], axis=1) + expected = concat([a_mean, b_std_ddof0], axis=1) tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({"A": ["mean", "std"]}) - expected = concat([a_mean, a_std], axis=1) + expected = concat([a_mean, a_std_ddof1], axis=1) expected.columns = MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) tm.assert_frame_equal(result, expected) @@ -116,7 +118,7 @@ def test_agg(step): ) result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected = concat([a_mean, a_std_ddof1, b_mean, b_std_ddof1], axis=1) exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] expected.columns = MultiIndex.from_tuples(exp_cols) From 59b06b53b0aa2687a698f90587a773709d51fc45 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 28 May 2023 11:45:30 -0400 Subject: [PATCH 2/2] cleanup implementation --- pandas/core/apply.py | 32 ++--- pandas/core/common.py | 58 --------- pandas/core/groupby/generic.py | 6 - pandas/core/groupby/groupby.py | 5 - pandas/core/resample.py | 3 - pandas/tests/apply/test_invalid_arg.py | 12 +- pandas/tests/apply/test_str.py | 159 ------------------------- 7 files changed, 19 insertions(+), 256 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 346fb9c822fc2..bacb1f2992616 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -159,10 +159,7 @@ def agg(self) -> DataFrame | Series | None: Result of aggregation, or None if agg cannot be performed by this method. """ - obj = self.obj arg = self.f - args = self.args - kwargs = self.kwargs if isinstance(arg, str): return self.apply_str() @@ -173,11 +170,6 @@ def agg(self) -> DataFrame | Series | None: # we require a list, but not a 'str' return self.agg_list_like() - if callable(arg): - f = com.get_cython_func(arg) - if f and not args and not kwargs: - return getattr(obj, f)() - # caller can react return None @@ -283,11 +275,6 @@ def transform_str_or_callable(self, func) -> DataFrame | Series: if isinstance(func, str): return self._apply_str(obj, func, *args, **kwargs) - if not args and not kwargs: - f = com.get_cython_func(func) - if f: - return getattr(obj, f)() - # Two possible ways to use a UDF - apply or call directly try: return obj.apply(func, args=args, **kwargs) @@ -1097,18 +1084,19 @@ def agg(self): # string, list-like, and dict-like are entirely handled in super assert callable(f) - # try a regular apply, this evaluates lambdas - # row-by-row; however if the lambda is expected a Series - # expression, e.g.: lambda x: x-x.quantile(0.25) - # this will fail, so we can try a vectorized evaluation - - # we cannot FIRST try the vectorized evaluation, because - # then .agg and .apply would have different semantics if the - # operation is actually defined on the Series, e.g. 
str - has_cython_func = f in com._orig_cython_table + has_cython_func = f in com._cython_table if has_cython_func and not self.args and not self.kwargs: + # previous versions would vectorize NumPy functions result = f(self.obj) else: + # try a regular apply, this evaluates lambdas + # row-by-row; however if the lambda is expected a Series + # expression, e.g.: lambda x: x-x.quantile(0.25) + # this will fail, so we can try a vectorized evaluation + + # we cannot FIRST try the vectorized evaluation, because + # then .agg and .apply would have different semantics if the + # operation is actually defined on the Series, e.g. str try: result = self.obj.apply(f) except (ValueError, AttributeError, TypeError): diff --git a/pandas/core/common.py b/pandas/core/common.py index d23e7581ea3f9..1ec27bb39be92 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -556,50 +556,7 @@ def require_length_match(data, index: Index) -> None: ) -# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0, -# whereas np.min and np.max (which directly call obj.min and obj.max) -# default to axis=None. -_builtin_table = { - # builtins.sum: np.sum, - # builtins.max: np.maximum.reduce, - # builtins.min: np.minimum.reduce, -} - -_orig_builtin_table = { - builtins.sum: np.sum, - builtins.max: np.maximum.reduce, - builtins.min: np.minimum.reduce, -} - _cython_table = { - # builtins.sum: "sum", - # builtins.max: "max", - # builtins.min: "min", - # np.all: "all", - # np.any: "any", - # np.sum: "sum", - # np.nansum: "sum", - # np.mean: "mean", - # np.nanmean: "mean", - # np.prod: "prod", - # np.nanprod: "prod", - # np.std: "std", - # np.nanstd: "std", - # np.var: "var", - # np.nanvar: "var", - # np.median: "median", - # np.nanmedian: "median", - # np.max: "max", - # np.nanmax: "max", - # np.min: "min", - # np.nanmin: "min", - # np.cumprod: "cumprod", - # np.nancumprod: "cumprod", - # np.cumsum: "cumsum", - # np.nancumsum: "cumsum", -} - -_orig_cython_table = { builtins.sum: "sum", builtins.max: "max", builtins.min: "min", @@ -628,21 +585,6 @@ def require_length_match(data, index: Index) -> None: } -def get_cython_func(arg: Callable) -> str | None: - """ - if we define an internal function for this argument, return it - """ - return _cython_table.get(arg) - - -def is_builtin_func(arg): - """ - if we define a builtin function for this argument, return it, - otherwise return the arg - """ - return _builtin_table.get(arg, arg) - - def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: """ If a name is missing then replace it by level_n, where n is the count diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 37ef04f17a2e5..f33f657001cb7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -251,10 +251,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return ret else: - cyfunc = com.get_cython_func(func) - if cyfunc and not args and not kwargs: - return getattr(self, cyfunc)() - if self.ngroups == 0: # e.g. 
test_evaluate_with_empty_groups without any groups to # iterate over, we have no output on which to do dtype @@ -297,7 +293,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): - func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) obj = self._obj_with_exclusions @@ -1463,7 +1458,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): - func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) if self.ngroups == 0: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bdab641719ded..f5109159fa5b6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1459,8 +1459,6 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): ) ) def apply(self, func, *args, **kwargs) -> NDFrameT: - func = com.is_builtin_func(func) - if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1659,9 +1657,6 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine_kwargs=engine_kwargs, **kwargs ) - # optimized transforms - func = com.get_cython_func(func) or func - if not isinstance(func, str): return self._transform_general(func, *args, **kwargs) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f8adb2332609b..bdef45a39de5f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -43,7 +43,6 @@ import pandas.core.algorithms as algos from pandas.core.apply import ResamplerWindowApply from pandas.core.base import PandasObject -import pandas.core.common as com from pandas.core.generic import ( NDFrame, _shared_docs, @@ -1414,7 +1413,6 @@ def _downsample(self, how, **kwargs): how : string / cython mapped function **kwargs : kw args passed to how function """ - how = com.get_cython_func(how) or how ax = self.ax # Excludes `on` column when provided @@ -1566,7 +1564,6 @@ def _downsample(self, how, **kwargs): if self.kind == "timestamp": return super()._downsample(how, **kwargs) - how = com.get_cython_func(how) or how ax = self.ax if is_subperiod(ax.freq, self.freq): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index d75b784302676..8dfce965e12ed 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -243,9 +243,15 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): ) def test_agg_cython_table_raises_series(series, func, expected): # GH21224 - msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" - if func == "median" or func is np.nanmedian or func is np.median: - msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + msg = "|".join( + [ + "Cannot convert", + "[Cc]ould not convert", + "can't multiply sequence by non-int of type", + "not supported for the input types", + "unsupported operand type", + ] + ) with pytest.raises(expected, match=msg): # e.g. 
Series('a b'.split()).cumprod() will raise diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 64189fae5f578..23d578516ae4e 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -1,11 +1,8 @@ -from itertools import chain import operator import numpy as np import pytest -from pandas.core.dtypes.common import is_number - from pandas import ( DataFrame, Series, @@ -86,162 +83,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("sum", 0), - ("max", np.nan), - ("min", np.nan), - ("all", True), - ("any", False), - ("mean", np.nan), - ("prod", 1), - ("std", np.nan), - ("var", np.nan), - ("median", np.nan), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("sum", 6), - ("max", 3), - ("min", 1), - ("all", True), - ("any", True), - ("mean", 2), - ("prod", 6), - ("std", 1), - ("var", 1), - ("median", 2), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), - [ - ("sum", "abc"), - ("max", "c"), - ("min", "a"), - ("all", True), - ("any", True), - ], - ), - ), -) -def test_agg_cython_table_series(series, func, expected): - # GH21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = series.agg(func) - if is_number(expected): - assert np.isclose(result, expected, equal_nan=True) - else: - assert result == expected - - -@pytest.mark.parametrize( - "series, func, expected", - chain( - tm.get_cython_table_params( - Series(dtype=np.float64), - [ - ("cumprod", Series([], dtype=np.float64)), - ("cumsum", Series([], dtype=np.float64)), - ], - ), - tm.get_cython_table_params( - Series([np.nan, 1, 2, 3]), - [ - ("cumprod", Series([np.nan, 1, 2, 6])), - ("cumsum", Series([np.nan, 1, 3, 6])), - ], - ), - tm.get_cython_table_params( - Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] - ), - ), -) -def test_agg_cython_table_transform_series(series, func, expected): - # GH21224 - # test transforming functions in - # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - result = series.agg(func) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), - [ - ("sum", Series(dtype="float64")), - ("max", Series(dtype="float64")), - ("min", Series(dtype="float64")), - ("all", Series(dtype=bool)), - ("any", Series(dtype=bool)), - ("mean", Series(dtype="float64")), - ("prod", Series(dtype="float64")), - ("std", Series(dtype="float64")), - ("var", Series(dtype="float64")), - ("median", Series(dtype="float64")), - ], - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("sum", Series([1.0, 3])), - ("max", Series([1.0, 2])), - ("min", Series([1.0, 1])), - ("all", Series([True, True])), - ("any", Series([True, True])), - ("mean", Series([1, 1.5])), - ("prod", Series([1.0, 2])), - ("std", Series([np.nan, 0.707107])), - ("var", Series([np.nan, 0.5])), - ("median", Series([1, 1.5])), - ], - ), - ), -) -def test_agg_cython_table_frame(df, func, expected, axis): - # GH 21224 - # test reducing functions in - # pandas.core.base.SelectionMixin._cython_table - result = df.agg(func, axis=axis) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "df, func, expected", - chain( - tm.get_cython_table_params( - DataFrame(), [("cumprod", DataFrame()), ("cumsum", 
DataFrame())] - ), - tm.get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), - [ - ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), - ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), - ], - ), - ), -) -def test_agg_cython_table_transform_frame(df, func, expected, axis): - # GH 21224 - # test transforming functions in - # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if axis in ("columns", 1): - # operating blockwise doesn't let us preserve dtypes - expected = expected.astype("float64") - - result = df.agg(func, axis=axis) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_groupby_kernel_series(request, string_series, op): # GH 35964
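
Illustration of the behavior change that the updated tests encode — a minimal sketch run against this branch; names and values are taken from test_agg_std in pandas/tests/apply/test_frame_apply.py. On released pandas versions, df.agg(np.std) is still translated to DataFrame.std() through the lookup tables these patches remove, so the last print differs there.

    import numpy as np
    import pandas as pd

    # Same frame as test_agg_std: columns A = [0, 2, 4], B = [1, 3, 5].
    df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])

    # pandas reductions default to the sample statistic (ddof=1) ...
    print(df.std())          # A    2.0    B    2.0

    # ... while np.std defaults to the population statistic (ddof=0).
    print(np.std(df["A"]))   # 1.6329931618554521

    # Before these patches, df.agg(np.std) was mapped to df.std() and returned
    # 2.0 per column; with the special-casing removed, the updated test expects
    # np.std to be applied as-is, i.e. ~1.633 per column (ddof=0).
    print(df.agg(np.std))

The same ddof=0 vs. ddof=1 distinction is what drives the std-related updates in the resample, rolling-window, and groupby tests above.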