diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 34b91823abc09..79bb32c54d832 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -112,7 +112,7 @@ Other enhancements - :meth:`SeriesGroupby.transform` and :meth:`DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) - Added :meth:`ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) -- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`). +- Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`). - Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) - Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`) - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f98a08810c3ff..83a3b29bfd7f0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -81,6 +81,7 @@ def frame_apply( axis: Axis = 0, raw: bool = False, result_type: str | None = None, + by_row: Literal[False, "compat"] = "compat", args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +101,7 @@ def frame_apply( func, raw=raw, result_type=result_type, + by_row=by_row, args=args, kwargs=kwargs, ) @@ -115,11 +117,16 @@ def __init__( raw: bool, result_type: str | None, *, + by_row: Literal[False, "compat", "_compat"] = "compat", args, kwargs, 
) -> None: self.obj = obj self.raw = raw + + assert by_row is False or by_row in ["compat", "_compat"] + self.by_row = by_row + self.args = args or () self.kwargs = kwargs or {} @@ -304,7 +311,14 @@ def agg_or_apply_list_like( func = cast(List[AggFuncTypeBase], self.func) kwargs = self.kwargs if op_name == "apply": - kwargs = {**kwargs, "by_row": False} + if isinstance(self, FrameApply): + by_row = self.by_row + + elif isinstance(self, SeriesApply): + by_row = "_compat" if self.by_row else False + else: + by_row = False + kwargs = {**kwargs, "by_row": by_row} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -397,7 +411,10 @@ def agg_or_apply_dict_like( obj = self.obj func = cast(AggFuncTypeDict, self.func) - kwargs = {"by_row": False} if op_name == "apply" else {} + kwargs = {} + if op_name == "apply": + by_row = "_compat" if self.by_row else False + kwargs.update({"by_row": by_row}) if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -678,6 +695,23 @@ def agg_axis(self) -> Index: class FrameApply(NDFrameApply): obj: DataFrame + def __init__( + self, + obj: AggObjType, + func: AggFuncType, + raw: bool, + result_type: str | None, + *, + by_row: Literal[False, "compat"] = False, + args, + kwargs, + ) -> None: + if by_row is not False and by_row != "compat": + raise ValueError(f"by_row={by_row} not allowed") + super().__init__( + obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs + ) + # --------------------------------------------------------------- # Abstract Methods @@ -1067,7 +1101,7 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: class SeriesApply(NDFrameApply): obj: Series axis: AxisInt = 0 - by_row: bool # only relevant for apply() + by_row: Literal[False, "compat", "_compat"] # only relevant for apply() def __init__( self, @@ -1075,7 +1109,7 @@ def __init__( func: AggFuncType, *, convert_dtype: bool | 
lib.NoDefault = lib.no_default, - by_row: bool = True, + by_row: Literal[False, "compat", "_compat"] = "compat", args, kwargs, ) -> None: @@ -1090,13 +1124,13 @@ def __init__( stacklevel=find_stack_level(), ) self.convert_dtype = convert_dtype - self.by_row = by_row super().__init__( obj, func, raw=False, result_type=None, + by_row=by_row, args=args, kwargs=kwargs, ) @@ -1115,6 +1149,9 @@ def apply(self) -> DataFrame | Series: # if we are a string, try to dispatch return self.apply_str() + if self.by_row == "_compat": + return self.apply_compat() + # self.func is Callable return self.apply_standard() @@ -1149,6 +1186,28 @@ def apply_empty_result(self) -> Series: obj, method="apply" ) + def apply_compat(self): + """compat apply method for funcs in listlikes and dictlikes. + + Used for each callable when giving listlikes and dictlikes of callables to + apply. Needed for compatibility with Pandas < v2.1. + + .. versionadded:: 2.1.0 + """ + obj = self.obj + func = self.func + + if callable(func): + f = com.get_cython_func(func) + if f and not self.args and not self.kwargs: + return obj.apply(func, by_row=False) + + try: + result = obj.apply(func, by_row="compat") + except (ValueError, AttributeError, TypeError): + result = obj.apply(func, by_row=False) + return result + def apply_standard(self) -> DataFrame | Series: # caller is responsible for ensuring that f is Callable func = cast(Callable, self.func) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 72d586964b524..ae43a44d68f1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9634,6 +9634,7 @@ def apply( raw: bool = False, result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), + by_row: Literal[False, "compat"] = "compat", **kwargs, ): """ @@ -9682,6 +9683,17 @@ def apply( args : tuple Positional arguments to pass to `func` in addition to the array/series. 
+ by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try to call apply again with + ``by_row="compat"`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + .. versionadded:: 2.1.0 **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -9781,6 +9793,7 @@ def apply( axis=axis, raw=raw, result_type=result_type, + by_row=by_row, args=args, kwargs=kwargs, ) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8785e2fb65fb8..e59a4cfc3fcc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4538,7 +4538,7 @@ def apply( convert_dtype: bool | lib.NoDefault = lib.no_default, args: tuple[Any, ...] = (), *, - by_row: bool = True, + by_row: Literal[False, "compat"] = "compat", **kwargs, ) -> DataFrame | Series: """ @@ -4562,14 +4562,20 @@ def apply( preserved for some extension array dtypes, such as Categorical. .. deprecated:: 2.1.0 - The convert_dtype has been deprecated. Do ``ser.astype(object).apply()`` + ``convert_dtype`` has been deprecated. Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``. args : tuple Positional arguments passed to func after the series value. - by_row : bool, default True + by_row : False or "compat", default "compat" + If ``"compat"`` and func is a callable, func will be passed each element of + the Series, like ``Series.map``. If func is a list or dict of + callables, will first try to translate each func into pandas methods. If + that doesn't work, will try to call apply again with ``by_row="compat"`` + and if that fails, will call apply again with ``by_row=False`` + (backward compatible). 
If False, the func will be passed the whole Series at once. - If True, will func will be passed each element of the Series, like - Series.map (backward compatible). + + ``by_row`` has no effect when ``func`` is a string. .. versionadded:: 2.1.0 **kwargs diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 99fc393ff82c5..5681167cd54f9 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -667,6 +667,50 @@ def test_infer_row_shape(): assert result == (6, 2) +@pytest.mark.parametrize( + "ops, by_row, expected", + [ + ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})), + ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})), + ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})), + ({"a": lambda x: x.sum()}, False, Series({"a": 3})), + ( + {"a": ["sum", np.sum, lambda x: x.sum()]}, + "compat", + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + {"a": ["sum", np.sum, lambda x: x.sum()]}, + False, + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})), + ({"a": lambda x: 1}, False, Series({"a": 1})), + ], +) +def test_dictlike_lambda(ops, by_row, expected): + # GH53601 + df = DataFrame({"a": [1, 2]}) + result = df.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + {"a": lambda x: x + 1}, + {"a": lambda x: x.sum()}, + {"a": ["sum", np.sum, lambda x: x.sum()]}, + {"a": lambda x: 1}, + ], +) +def test_dictlike_lambda_raises(ops): + # GH53601 + df = DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="by_row=True not allowed"): + df.apply(ops, by_row=True) + + def test_with_dictlike_columns(): # GH 17602 df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) @@ -716,6 +760,58 @@ def test_with_dictlike_columns_with_infer(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "ops, by_row, expected", + [ + 
([lambda x: x + 1], "compat", DataFrame({("a", ""): [2, 3]})), + ([lambda x: x + 1], False, DataFrame({("a", ""): [2, 3]})), + ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=[""])), + ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=[""])), + ( + ["sum", np.sum, lambda x: x.sum()], + "compat", + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + ["sum", np.sum, lambda x: x.sum()], + False, + DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", ""]), + ), + ( + [lambda x: x + 1, lambda x: 3], + "compat", + DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["", ""]]), + ), + ( + [lambda x: 2, lambda x: 3], + False, + DataFrame({"a": [2, 3]}, ["", ""]), + ), + ], +) +def test_listlike_lambda(ops, by_row, expected): + # GH53601 + df = DataFrame({"a": [1, 2]}) + result = df.apply(ops, by_row=by_row) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "ops", + [ + [lambda x: x + 1], + [lambda x: x.sum()], + ["sum", np.sum, lambda x: x.sum()], + [lambda x: x + 1, lambda x: 3], + ], +) +def test_listlike_lambda_raises(ops): + # GH53601 + df = DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="by_row=True not allowed"): + df.apply(ops, by_row=True) + + def test_with_listlike_columns(): # GH 17348 df = DataFrame( diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 3e0ff19ae4c1a..9002a5f85cba6 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -14,7 +14,7 @@ from pandas.tests.apply.common import series_transform_kernels -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[False, "compat"]) def by_row(request): return request.param @@ -69,12 +69,7 @@ def test_apply_map_same_length_inference_bug(): def f(x): return (x, x + 1) - result = s.apply(f, by_row=True) - expected = s.map(f) - tm.assert_series_equal(result, expected) - - s = Series([1, 2, 3]) - result = s.apply(f, by_row=by_row) + result = s.apply(f, 
by_row="compat") expected = s.map(f) tm.assert_series_equal(result, expected) @@ -87,7 +82,7 @@ def func(x): return x if x > 0 else np.nan with tm.assert_produces_warning(FutureWarning): - ser.apply(func, convert_dtype=convert_dtype, by_row=True) + ser.apply(func, convert_dtype=convert_dtype, by_row="compat") def test_apply_args(): @@ -161,7 +156,7 @@ def test_apply_box(): s = Series(vals) assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row=True) + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) @@ -171,7 +166,7 @@ def test_apply_box(): ] s = Series(vals) assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row=True) + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) @@ -179,7 +174,7 @@ def test_apply_box(): vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = Series(vals) assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row=True) + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") exp = Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) @@ -187,7 +182,7 @@ def test_apply_box(): vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row=True) + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") exp = Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) @@ -397,7 +392,7 @@ def test_demo(): @pytest.mark.parametrize("func", [str, lambda x: str(x)]) def 
test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row): - # test that we are evaluating row-by-row first if by_row=True + # test that we are evaluating row-by-row first if by_row="compat" # else vectorized evaluation result = string_series.apply(func, by_row=by_row) @@ -435,7 +430,7 @@ def test_with_nested_series(datetime_series, op_name): tm.assert_frame_equal(result, expected) -def test_replicate_describe(string_series, by_row): +def test_replicate_describe(string_series): # this also tests a result set that is all scalars expected = string_series.describe() result = string_series.apply( @@ -449,7 +444,6 @@ def test_replicate_describe(string_series, by_row): "75%": lambda x: x.quantile(0.75), "max": "max", }, - by_row=by_row, ) tm.assert_series_equal(result, expected) @@ -467,7 +461,7 @@ def test_reduce(string_series): @pytest.mark.parametrize( "how, kwds", - [("agg", {}), ("apply", {"by_row": True}), ("apply", {"by_row": False})], + [("agg", {}), ("apply", {"by_row": "compat"}), ("apply", {"by_row": False})], ) def test_non_callable_aggregates(how, kwds): # test agg using non-callable series attributes @@ -526,7 +520,7 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): @pytest.mark.parametrize( - "by_row, expected", [(True, Series(np.ones(30), dtype="int64")), (False, 1)] + "by_row, expected", [("compat", Series(np.ones(30), dtype="int64")), (False, 1)] ) def test_apply_scalar_on_date_time_index_aware_series(by_row, expected): # GH 25959 @@ -561,7 +555,7 @@ def test_apply_to_timedelta(by_row): ) @pytest.mark.parametrize( "how, kwargs", - [["agg", {}], ["apply", {"by_row": True}], ["apply", {"by_row": False}]], + [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]], ) def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): # GH 39140 @@ -582,7 +576,7 @@ def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): ) @pytest.mark.parametrize( "how, kwargs", - [["agg", {}], 
["apply", {"by_row": True}], ["apply", {"by_row": False}]], + [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]], ) def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row): # GH 39140 @@ -617,7 +611,7 @@ def test_apply_listlike_transformer(string_series, ops, names, by_row): ([lambda x: x.sum()], Series([6], index=[""])), ], ) -def test_apply_listlike_lambda(ops, expected, by_row=by_row): +def test_apply_listlike_lambda(ops, expected, by_row): # GH53400 ser = Series([1, 2, 3]) result = ser.apply(ops, by_row=by_row)