From 75ce829603734ea1f1d878776964619079d6f441 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 25 May 2023 09:23:03 +0100 Subject: [PATCH 01/11] REF: Decouple Series.apply from Series.agg --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/apply.py | 57 ++++++++++++++++++------- pandas/core/series.py | 16 ++++++- pandas/tests/apply/test_series_apply.py | 14 ++++++ 4 files changed, 72 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..9e061569d8953 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -100,6 +100,7 @@ Other enhancements - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) +- Added a new parameter ``array_ops_only`` to :meth:`Series.apply`. When set to ``True`` the supplied callables will always operate on the whole Series (:issue:`xxxxx`). - .. --------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c03f1a268906e..00dcdca02f7ac 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -16,6 +16,7 @@ Iterable, Iterator, List, + Literal, Sequence, cast, ) @@ -302,6 +303,9 @@ def agg_list_like(self) -> DataFrame | Series: ------- Result of aggregation. """ + return self._apply_list_like(op_name="agg") + + def _apply_list_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Series: from pandas.core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, @@ -310,6 +314,9 @@ def agg_list_like(self) -> DataFrame | Series: obj = self.obj arg = cast(List[AggFuncTypeBase], self.f) + kwargs = self.kwargs + if op_name == "apply": + kwargs = {**kwargs, "array_ops_only": True} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -327,8 +334,6 @@ def agg_list_like(self) -> DataFrame | Series: keys = [] is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) - is_ser_or_df = isinstance(obj, (ABCDataFrame, ABCSeries)) - this_args = [self.axis, *self.args] if is_ser_or_df else self.args context_manager: ContextManager if is_groupby: @@ -337,12 +342,19 @@ def agg_list_like(self) -> DataFrame | Series: context_manager = com.temp_setattr(obj, "as_index", True) else: context_manager = nullcontext() + + def include_axis(colg) -> bool: + return isinstance(colg, ABCDataFrame) or ( + isinstance(colg, ABCSeries) and op_name == "agg" + ) + with context_manager: # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - new_res = colg.aggregate(a, *this_args, **self.kwargs) + args = [self.axis, *self.args] if include_axis(colg) else self.args + new_res = getattr(colg, op_name)(a, *args, **kwargs) results.append(new_res) # make sure we find a good name @@ -353,7 +365,8 @@ def agg_list_like(self) -> DataFrame | Series: indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - new_res = colg.aggregate(arg, *this_args, **self.kwargs) + args = [self.axis, *self.args] if include_axis(colg) else self.args + new_res = getattr(colg, op_name)(arg, *args, **kwargs) results.append(new_res) indices.append(index) keys = selected_obj.columns.take(indices) @@ -380,6 +393,9 @@ def agg_dict_like(self) -> DataFrame | Series: ------- Result of aggregation. """ + return self._apply_dict_like(op_name="agg") + + def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Series: from pandas import Index from pandas.core.groupby.generic import ( DataFrameGroupBy, @@ -387,8 +403,11 @@ def agg_dict_like(self) -> DataFrame | Series: ) from pandas.core.reshape.concat import concat + assert op_name in ["agg", "apply"] + obj = self.obj arg = cast(AggFuncTypeDict, self.f) + k = {"array_ops_only": True} if op_name == "apply" else {} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -401,7 +420,7 @@ def agg_dict_like(self) -> DataFrame | Series: selected_obj = obj._selected_obj selection = obj._selection - arg = self.normalize_dictlike_arg("agg", selected_obj, arg) + arg = self.normalize_dictlike_arg(op_name, selected_obj, arg) is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) context_manager: ContextManager @@ -421,7 +440,9 @@ def agg_dict_like(self) -> DataFrame | Series: if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(selection, ndim=1) - result_data = [colg.agg(how) for _, how in arg.items()] + result_data = [ + getattr(colg, op_name)(how, **k) for _, how in arg.items() + ] result_index = list(arg.keys()) elif is_non_unique_col: # key used for column selection and output @@ -436,7 +457,7 @@ def agg_dict_like(self) -> DataFrame | Series: label_to_indices[label].append(index) key_data = [ - selected_obj._ixs(indice, axis=1).agg(how) + getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **k) for label, indices in label_to_indices.items() for indice in indices ] @@ -446,7 +467,8 @@ def agg_dict_like(self) -> DataFrame | Series: else: # key used for column selection and output result_data = [ - obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + getattr(obj._gotitem(key, ndim=1), op_name)(how) + for key, how in arg.items() ] result_index = list(arg.keys()) @@ -541,7 +563,7 @@ def apply_str(self) -> DataFrame | Series: self.kwargs["axis"] = self.axis return self._apply_str(obj, f, *self.args, **self.kwargs) - def apply_multiple(self) -> DataFrame | Series: + def apply_list_or_dict_like(self) -> DataFrame | Series: """ Compute apply in case of a list-like or dict-like. @@ -557,9 +579,9 @@ def apply_multiple(self) -> DataFrame | Series: kwargs = self.kwargs if is_dict_like(func): - result = self.agg_dict_like() + result = self._apply_dict_like(op_name="apply") else: - result = self.agg_list_like() + result = self._apply_list_like(op_name="apply") result = reconstruct_and_relabel_result(result, func, **kwargs) @@ -699,8 +721,8 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" # dispatch to agg - if is_list_like(self.f): - return self.apply_multiple() + if is_list_like(self.f) or is_dict_like(self.f): + return self.apply_list_or_dict_like() # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -1041,6 +1063,7 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: class SeriesApply(NDFrameApply): obj: Series axis: AxisInt = 0 + array_ops_only: bool # only relevant for apply() def __init__( self, @@ -1048,6 +1071,7 @@ def __init__( func: AggFuncType, *, convert_dtype: bool | lib.NoDefault = lib.no_default, + array_ops_only: bool = False, args, kwargs, ) -> None: @@ -1062,6 +1086,7 @@ def __init__( stacklevel=find_stack_level(), ) self.convert_dtype = convert_dtype + self.array_ops_only = array_ops_only super().__init__( obj, @@ -1079,8 +1104,8 @@ def apply(self) -> DataFrame | Series: return self.apply_empty_result() # dispatch to agg - if is_list_like(self.f): - return self.apply_multiple() + if is_list_like(self.f) or is_dict_like(self.f): + return self.apply_list_or_dict_like() if isinstance(self.f, str): # if we are a string, try to dispatch @@ -1126,6 +1151,8 @@ def apply_standard(self) -> DataFrame | Series: if isinstance(f, np.ufunc): with np.errstate(all="ignore"): return f(obj) + elif self.array_ops_only: + return f(obj) # row-wise access # apply doesn't have a `na_action` keyword and for backward compat reasons diff --git a/pandas/core/series.py b/pandas/core/series.py index 12924a167fc46..b919a46a7abf1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4492,6 +4492,8 @@ def apply( func: AggFuncType, convert_dtype: bool | lib.NoDefault = lib.no_default, args: tuple[Any, ...] = (), + *, + array_ops_only: bool = False, **kwargs, ) -> DataFrame | Series: """ @@ -4519,6 +4521,13 @@ def apply( instead if you want ``convert_dtype=False``. args : tuple Positional arguments passed to func after the series value. + array_ops_only: bool, default False + If True, func will always operate on the whole Series. + If False, will operate on each element of the Series when given a single + callable that is not a numpy ufunc, else on the whole Series + (backward compatible). + + .. versionadded:: 2.1.0 **kwargs Additional keyword arguments passed to func. @@ -4607,7 +4616,12 @@ def apply( dtype: float64 """ return SeriesApply( - self, func, convert_dtype=convert_dtype, args=args, kwargs=kwargs + self, + func, + convert_dtype=convert_dtype, + array_ops_only=array_ops_only, + args=args, + kwargs=kwargs, ).apply() def _reindex_indexer( diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index e37006eb0a5f6..d2bb3d51431ca 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -535,6 +535,20 @@ def test_apply_listlike_transformer(string_series, ops, names): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "ops, expected", + [ + ([lambda x: x], DataFrame({"": [1, 2, 3]})), + ([lambda x: x.sum()], Series([6], index=[""])), + ], +) +def test_apply_listlike_lambda(ops, expected): + # GHxxxxxx + ser = Series([1, 2, 3]) + result = ser.apply(ops) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( "ops", [ From 52db878dedf4f0d3ae24707ca9078f80ea879722 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 26 May 2023 13:05:34 +0100 Subject: [PATCH 02/11] add GH number --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/tests/apply/test_series_apply.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9e061569d8953..f23bca905a484 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -98,9 +98,9 @@ Other enhancements - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) +- Added a new parameter ``array_ops_only`` to :meth:`Series.apply`. When set to ``True`` the supplied callables will always operate on the whole Series (:issue:`53400`). - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) -- Added a new parameter ``array_ops_only`` to :meth:`Series.apply`. When set to ``True`` the supplied callables will always operate on the whole Series (:issue:`xxxxx`). - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index d2bb3d51431ca..07fbf6ece014b 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -543,7 +543,7 @@ def test_apply_listlike_transformer(string_series, ops, names): ], ) def test_apply_listlike_lambda(ops, expected): - # GHxxxxxx + # GH53400 ser = Series([1, 2, 3]) result = ser.apply(ops) tm.assert_equal(result, expected) From fc268286411cd687e2c42bc871bd093d3dc6cdc4 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 26 May 2023 13:58:13 +0100 Subject: [PATCH 03/11] fix docstring --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index b919a46a7abf1..0697699bd0c70 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4521,7 +4521,7 @@ def apply( instead if you want ``convert_dtype=False``. args : tuple Positional arguments passed to func after the series value. - array_ops_only: bool, default False + array_ops_only : bool, default False If True, func will always operate on the whole Series. If False, will operate on each element of the Series when given a single callable that is not a numpy ufunc, else on the whole Series From c521691e2a6948e9eed42848876b75429bec19d2 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 1 Jun 2023 19:05:59 +0100 Subject: [PATCH 04/11] update according to comments --- pandas/core/apply.py | 14 ++++++++------ pandas/core/series.py | 7 +++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f52f1ced06c2d..f40997de1202e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -289,9 +289,11 @@ def agg_list_like(self) -> DataFrame | Series: ------- Result of aggregation. """ - return self._apply_list_like(op_name="agg") + return self.agg_or_apply_list_like(op_name="agg") - def _apply_list_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Series: + def agg_or_apply_list_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: from pandas.core.groupby.generic import ( DataFrameGroupBy, SeriesGroupBy, @@ -393,7 +395,7 @@ def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Seri obj = self.obj func = cast(AggFuncTypeDict, self.func) - k = {"array_ops_only": True} if op_name == "apply" else {} + kwds = {"array_ops_only": True} if op_name == "apply" else {} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -427,7 +429,7 @@ def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Seri # key only used for output colg = obj._gotitem(selection, ndim=1) result_data = [ - getattr(colg, op_name)(how, **k) for _, how in func.items() + getattr(colg, op_name)(how, **kwds) for _, how in func.items() ] result_index = list(func.keys()) elif is_non_unique_col: @@ -443,7 +445,7 @@ def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Seri label_to_indices[label].append(index) key_data = [ - getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **k) + getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwds) for label, indices in label_to_indices.items() for indice in indices ] @@ -567,7 +569,7 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: if is_dict_like(func): result = self._apply_dict_like(op_name="apply") else: - result = self._apply_list_like(op_name="apply") + result = self.agg_or_apply_list_like(op_name="apply") result = reconstruct_and_relabel_result(result, func, **kwargs) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0697699bd0c70..a85f553021d51 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4522,10 +4522,9 @@ def apply( args : tuple Positional arguments passed to func after the series value. array_ops_only : bool, default False - If True, func will always operate on the whole Series. - If False, will operate on each element of the Series when given a single - callable that is not a numpy ufunc, else on the whole Series - (backward compatible). + If True, the func will be passed the whole Series at once. + If False, will func will be passed each element of the Series, like + Series.map (backward compatible). .. versionadded:: 2.1.0 **kwargs From 9353f06361d273b67d104665ae7eff6a82ac9d99 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 1 Jun 2023 19:25:09 +0100 Subject: [PATCH 05/11] rename array_ops_only -> by_row --- pandas/core/apply.py | 12 ++++++------ pandas/core/series.py | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f40997de1202e..1bab70237da1f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -304,7 +304,7 @@ def agg_or_apply_list_like( func = cast(List[AggFuncTypeBase], self.func) kwargs = self.kwargs if op_name == "apply": - kwargs = {**kwargs, "array_ops_only": True} + kwargs = {**kwargs, "by_row": False} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -395,7 +395,7 @@ def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Seri obj = self.obj func = cast(AggFuncTypeDict, self.func) - kwds = {"array_ops_only": True} if op_name == "apply" else {} + kwds = {"by_row": False} if op_name == "apply" else {} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -1057,7 +1057,7 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame: class SeriesApply(NDFrameApply): obj: Series axis: AxisInt = 0 - array_ops_only: bool # only relevant for apply() + by_row: bool # only relevant for apply() def __init__( self, @@ -1065,7 +1065,7 @@ def __init__( func: AggFuncType, *, convert_dtype: bool | lib.NoDefault = lib.no_default, - array_ops_only: bool = False, + by_row: bool = True, args, kwargs, ) -> None: @@ -1080,7 +1080,7 @@ def __init__( stacklevel=find_stack_level(), ) self.convert_dtype = convert_dtype - self.array_ops_only = array_ops_only + self.by_row = by_row super().__init__( obj, @@ -1145,7 +1145,7 @@ def apply_standard(self) -> DataFrame | Series: if isinstance(func, np.ufunc): with np.errstate(all="ignore"): return func(obj, *self.args, **self.kwargs) - elif self.array_ops_only: + elif not self.by_row: return func(obj, *self.args, **self.kwargs) if self.args or self.kwargs: diff --git a/pandas/core/series.py b/pandas/core/series.py index a85f553021d51..d9e2609664802 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4493,7 +4493,7 @@ def apply( convert_dtype: bool | lib.NoDefault = lib.no_default, args: tuple[Any, ...] = (), *, - array_ops_only: bool = False, + by_row: bool = True, **kwargs, ) -> DataFrame | Series: """ @@ -4521,9 +4521,9 @@ def apply( instead if you want ``convert_dtype=False``. args : tuple Positional arguments passed to func after the series value. - array_ops_only : bool, default False - If True, the func will be passed the whole Series at once. - If False, will func will be passed each element of the Series, like + by_row : bool, default True + If False, the func will be passed the whole Series at once. + If True, will func will be passed each element of the Series, like Series.map (backward compatible). .. versionadded:: 2.1.0 @@ -4618,7 +4618,7 @@ def apply( self, func, convert_dtype=convert_dtype, - array_ops_only=array_ops_only, + by_row=by_row, args=args, kwargs=kwargs, ).apply() From e7e3433c64920c54844121e5478b0c5a658363a7 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 2 Jun 2023 06:15:23 +0100 Subject: [PATCH 06/11] rename _apply_dict_like -> agg_or_apply_dict_like --- pandas/core/apply.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 1bab70237da1f..79bbc2a9bb944 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -381,9 +381,11 @@ def agg_dict_like(self) -> DataFrame | Series: ------- Result of aggregation. """ - return self._apply_dict_like(op_name="agg") + return self.agg_or_apply_dict_like(op_name="agg") - def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Series: + def agg_or_apply_dict_like( + self, op_name: Literal["agg", "apply"] + ) -> DataFrame | Series: from pandas import Index from pandas.core.groupby.generic import ( DataFrameGroupBy, @@ -567,7 +569,7 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: kwargs = self.kwargs if is_dict_like(func): - result = self._apply_dict_like(op_name="apply") + result = self.agg_or_apply_dict_like(op_name="apply") else: result = self.agg_or_apply_list_like(op_name="apply") From 755ec0729df0e8d2229848f14d15085c3437fa49 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 3 Jun 2023 07:36:16 +0100 Subject: [PATCH 07/11] update tests --- pandas/tests/apply/test_series_apply.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 07fbf6ece014b..b5bea3c738f33 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -567,6 +567,20 @@ def test_apply_dictlike_transformer(string_series, ops): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "ops, expected", + [ + ({"a": lambda x: x}, DataFrame({"a": [1, 2, 3]})), + ({"a": lambda x: x.sum()}, Series([6], index=["a"])), + ], +) +def test_apply_dictlike_lambda(ops, expected): + # GH53400 + ser = Series([1, 2, 3]) + result = ser.apply(ops) + tm.assert_equal(result, expected) + + def test_apply_retains_column_name(): # GH 16380 df = DataFrame({"x": range(3)}, Index(range(3), name="x")) From e68c46ef3a669e7f390d132984cad7697a85ea32 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 3 Jun 2023 17:20:50 +0100 Subject: [PATCH 08/11] add tests --- pandas/tests/apply/test_series_apply.py | 72 +++++++++++++++++-------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index b5bea3c738f33..32dcf17a4c21c 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -7,6 +7,7 @@ from pandas import ( DataFrame, Index, + MultiIndex, Series, concat, timedelta_range, @@ -15,6 +16,11 @@ from pandas.tests.apply.common import series_transform_kernels +@pytest.fixture(params=[True, False]) +def by_row(request): + return request.param + + def test_series_map_box_timedelta(): # GH#11349 ser = Series(timedelta_range("1 day 1 s", periods=3, freq="h")) @@ -459,25 +465,28 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): tm.assert_frame_equal(result, exp) -def test_apply_scalar_on_date_time_index_aware_series(): +@pytest.mark.parametrize( + "by_row, expected", [(True, Series(np.ones(30), dtype="int64")), (False, 1)] +) +def test_apply_scalar_on_date_time_index_aware_series(by_row, expected): # GH 25959 # Calling apply on a localized time series should not cause an error series = tm.makeTimeSeries(nper=30).tz_localize("UTC") - result = Series(series.index).apply(lambda x: 1) - tm.assert_series_equal(result, Series(np.ones(30), dtype="int64")) + result = Series(series.index).apply(lambda x: 1, by_row=by_row) + tm.assert_equal(result, expected) -def test_apply_to_timedelta(): +def test_apply_to_timedelta(by_row): list_of_valid_strings = ["00:00:01", "00:00:02"] a = pd.to_timedelta(list_of_valid_strings) - b = Series(list_of_valid_strings).apply(pd.to_timedelta) + b = Series(list_of_valid_strings).apply(pd.to_timedelta, by_row=by_row) tm.assert_series_equal(Series(a), b) list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] a = pd.to_timedelta(list_of_strings) ser = Series(list_of_strings) - b = ser.apply(pd.to_timedelta) + b = ser.apply(pd.to_timedelta, by_row=by_row) tm.assert_series_equal(Series(a), b) @@ -490,12 +499,15 @@ def test_apply_to_timedelta(): (np.array([np.sum, np.mean]), ["sum", "mean"]), ], ) -@pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_listlike_reducer(string_series, ops, names, how): +@pytest.mark.parametrize( + "how, kwargs", + [["agg", {}], ["apply", {"by_row": True}], ["apply", {"by_row": False}]], +) +def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): # GH 39140 expected = Series({name: op(string_series) for name, op in zip(names, ops)}) expected.name = "series" - result = getattr(string_series, how)(ops) + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) @@ -508,12 +520,15 @@ def test_apply_listlike_reducer(string_series, ops, names, how): Series({"A": np.sum, "B": np.mean}), ], ) -@pytest.mark.parametrize("how", ["agg", "apply"]) -def test_apply_dictlike_reducer(string_series, ops, how): +@pytest.mark.parametrize( + "how, kwargs", + [["agg", {}], ["apply", {"by_row": True}], ["apply", {"by_row": False}]], +) +def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row): # GH 39140 expected = Series({name: op(string_series) for name, op in ops.items()}) expected.name = string_series.name - result = getattr(string_series, how)(ops) + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) @@ -526,12 +541,12 @@ def test_apply_dictlike_reducer(string_series, ops, how): (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), ], ) -def test_apply_listlike_transformer(string_series, ops, names): +def test_apply_listlike_transformer(string_series, ops, names, by_row): # GH 39140 with np.errstate(all="ignore"): expected = concat([op(string_series) for op in ops], axis=1) expected.columns = names - result = string_series.apply(ops) + result = string_series.apply(ops, by_row=by_row) tm.assert_frame_equal(result, expected) @@ -542,10 +557,10 @@ def test_apply_listlike_transformer(string_series, ops, names): ([lambda x: x.sum()], Series([6], index=[""])), ], ) -def test_apply_listlike_lambda(ops, expected): +def test_apply_listlike_lambda(ops, expected, by_row=by_row): # GH53400 ser = Series([1, 2, 3]) - result = ser.apply(ops) + result = ser.apply(ops, by_row=by_row) tm.assert_equal(result, expected) @@ -558,37 +573,48 @@ def test_apply_listlike_lambda(ops, expected): Series({"A": np.sqrt, "B": np.exp}), ], ) -def test_apply_dictlike_transformer(string_series, ops): +def test_apply_dictlike_transformer(string_series, ops, by_row): # GH 39140 with np.errstate(all="ignore"): expected = concat({name: op(string_series) for name, op in ops.items()}) expected.name = string_series.name - result = string_series.apply(ops) + result = string_series.apply(ops, by_row=by_row) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "ops, expected", [ - ({"a": lambda x: x}, DataFrame({"a": [1, 2, 3]})), + ( + {"a": lambda x: x}, + Series([1, 2, 3], index=MultiIndex.from_arrays([["a"] * 3, range(3)])), + ), ({"a": lambda x: x.sum()}, Series([6], index=["a"])), ], ) -def test_apply_dictlike_lambda(ops, expected): +def test_apply_dictlike_lambda(ops, by_row, expected): # GH53400 ser = Series([1, 2, 3]) - result = ser.apply(ops) + result = ser.apply(ops, by_row=by_row) tm.assert_equal(result, expected) -def test_apply_retains_column_name(): +def test_apply_retains_column_name(by_row): # GH 16380 df = DataFrame({"x": range(3)}, Index(range(3), name="x")) func = lambda x: Series(range(x + 1), Index(range(x + 1), name="y")) + + if not by_row: + # GH53400 + msg = "'Series' object cannot be interpreted as an integer" + with pytest.raises(TypeError, match=msg): + df.x.apply(func, by_row=by_row) + return + msg = "Returning a DataFrame from Series.apply when the supplied function" with tm.assert_produces_warning(FutureWarning, match=msg): # GH52123 - result = df.x.apply(func) + result = df.x.apply(func, by_row=by_row) expected = DataFrame( [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], columns=Index(range(3), name="y"), From 9af24b2022440bb9fe864f877d2ed11987501db2 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 4 Jun 2023 10:06:02 +0100 Subject: [PATCH 09/11] add testr II --- pandas/tests/apply/test_series_apply.py | 150 +++++++++++++++--------- 1 file changed, 92 insertions(+), 58 deletions(-) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 32dcf17a4c21c..985cb5aa5b09c 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -1,5 +1,3 @@ -import math - import numpy as np import pytest @@ -21,31 +19,36 @@ def by_row(request): return request.param -def test_series_map_box_timedelta(): +def test_series_map_box_timedelta(by_row): # GH#11349 ser = Series(timedelta_range("1 day 1 s", periods=3, freq="h")) def f(x): - return x.total_seconds() + return x.total_seconds() if by_row else x.dt.total_seconds() - result = ser.apply(f) + result = ser.apply(f, by_row=by_row) - tm.assert_series_equal(result, ser.map(f)) + expected = ser.map(lambda x: x.total_seconds()) + tm.assert_series_equal(result, expected) expected = Series([86401.0, 90001.0, 93601.0]) tm.assert_series_equal(result, expected) -def test_apply(datetime_series): +def test_apply(datetime_series, by_row): + result = datetime_series.apply(np.sqrt, by_row=by_row) with np.errstate(all="ignore"): - tm.assert_series_equal(datetime_series.apply(np.sqrt), np.sqrt(datetime_series)) + expected = np.sqrt(datetime_series) + tm.assert_series_equal(result, expected) - # element-wise apply - tm.assert_series_equal(datetime_series.apply(math.exp), np.exp(datetime_series)) + # element-wise apply (ufunc) + result = datetime_series.apply(np.exp, by_row=by_row) + expected = np.exp(datetime_series) + tm.assert_series_equal(result, expected) # empty series s = Series(dtype=object, name="foo", index=Index([], name="bar")) - rs = s.apply(lambda x: x) + rs = s.apply(lambda x: x, by_row=by_row) tm.assert_series_equal(s, rs) # check all metadata (GH 9322) @@ -56,7 +59,7 @@ def test_apply(datetime_series): # index but no data s = Series(index=[1, 2, 3], dtype=np.float64) - rs = s.apply(lambda x: x) + rs = s.apply(lambda x: x, by_row=by_row) tm.assert_series_equal(s, rs) @@ -66,12 +69,12 @@ def test_apply_map_same_length_inference_bug(): def f(x): return (x, x + 1) - result = s.apply(f) + result = s.apply(f, by_row=True) expected = s.map(f) tm.assert_series_equal(result, expected) s = Series([1, 2, 3]) - result = s.apply(f) + result = s.apply(f, by_row=by_row) expected = s.map(f) tm.assert_series_equal(result, expected) @@ -84,7 +87,7 @@ def func(x): return x if x > 0 else np.nan with tm.assert_produces_warning(FutureWarning): - ser.apply(func, convert_dtype=convert_dtype) + ser.apply(func, convert_dtype=convert_dtype, by_row=True) def test_apply_args(): @@ -130,14 +133,20 @@ def foo2(x, b=2, c=0): tm.assert_frame_equal(result, expected) -def test_series_apply_map_box_timestamps(): +def test_series_apply_map_box_timestamps(by_row): # GH#2689, GH#2627 ser = Series(pd.date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) - result = ser.apply(func) + if not by_row: + msg = "Series' object has no attribute 'hour'" + with pytest.raises(AttributeError, match=msg): + ser.apply(func, by_row=by_row) + return + + result = ser.apply(func, by_row=by_row) expected = ser.map(func) tm.assert_series_equal(result, expected) @@ -148,7 +157,7 @@ def test_apply_box(): s = Series(vals) assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row=True) exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) @@ -158,7 +167,7 @@ def test_apply_box(): ] s = Series(vals) assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row=True) exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) @@ -166,7 +175,7 @@ def test_apply_box(): vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = Series(vals) assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row=True) exp = Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) @@ -174,41 +183,52 @@ def test_apply_box(): vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row=True) exp = Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) -def test_apply_datetimetz(): +def test_apply_datetimetz(by_row): values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( "Asia/Tokyo" ) s = Series(values, name="XX") - result = s.apply(lambda x: x + pd.offsets.Day()) + result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row) exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") tm.assert_series_equal(result, exp) - result = s.apply(lambda x: x.hour) - exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64) + result = s.apply(lambda x: x.hour if by_row else x.dt.hour, by_row=by_row) + exp = Series(list(range(24)) + [0], name="XX", dtype="int64" if by_row else "int32") tm.assert_series_equal(result, exp) # not vectorized def f(x): - return str(x.tz) + return str(x.tz) if by_row else str(x.dt.tz) - result = s.apply(f) - exp = Series(["Asia/Tokyo"] * 25, name="XX") - tm.assert_series_equal(result, exp) + result = s.apply(f, by_row=by_row) + if by_row: + exp = Series(["Asia/Tokyo"] * 25, name="XX") + tm.assert_series_equal(result, exp) + else: + result == "Asia/Tokyo" -def test_apply_categorical(): +def test_apply_categorical(by_row): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) ser = Series(values, name="XX", index=list("abcdefg")) - result = ser.apply(lambda x: x.lower()) + + if not by_row: + msg = "Series' object has no attribute 'lower" + with pytest.raises(AttributeError, match=msg): + ser.apply(lambda x: x.lower(), by_row=by_row) + assert ser.apply(lambda x: "A", by_row=by_row) == "A" + return + + result = ser.apply(lambda x: x.lower(), by_row=by_row) # should be categorical dtype when the number of categories are # the same @@ -224,24 +244,30 @@ def test_apply_categorical(): @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) -def test_apply_categorical_with_nan_values(series): +def test_apply_categorical_with_nan_values(series, by_row): # GH 20714 bug fixed in: GH 24275 s = Series(series, dtype="category") - result = s.apply(lambda x: x.split("-")[0]) + if not by_row: + msg = "'Series' object has no attribute 'split'" + with pytest.raises(AttributeError, match=msg): + s.apply(lambda x: x.split("-")[0], by_row=by_row) + return + + result = s.apply(lambda x: x.split("-")[0], by_row=by_row) result = result.astype(object) expected = Series(["1", "1", np.NaN], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) -def test_apply_empty_integer_series_with_datetime_index(): +def test_apply_empty_integer_series_with_datetime_index(by_row): # GH 21245 s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) - result = s.apply(lambda x: x) + result = s.apply(lambda x: x, by_row=by_row) tm.assert_series_equal(result, s) -def test_transform(string_series): +def test_transform(string_series, by_row): # transforming functions with np.errstate(all="ignore"): @@ -249,17 +275,17 @@ def test_transform(string_series): f_abs = np.abs(string_series) # ufunc - result = string_series.apply(np.sqrt) + result = string_series.apply(np.sqrt, by_row=by_row) expected = f_sqrt.copy() tm.assert_series_equal(result, expected) # list-like - result = string_series.apply([np.sqrt]) + result = string_series.apply([np.sqrt], by_row=by_row) expected = f_sqrt.to_frame().copy() expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = string_series.apply(["sqrt"]) + result = string_series.apply(["sqrt"], by_row=by_row) tm.assert_frame_equal(result, expected) # multiple items in list @@ -267,7 +293,7 @@ def test_transform(string_series): # series and then concatting expected = concat([f_sqrt, f_abs], axis=1) expected.columns = ["sqrt", "absolute"] - result = string_series.apply([np.sqrt, np.abs]) + result = string_series.apply([np.sqrt, np.abs], by_row=by_row) tm.assert_frame_equal(result, expected) # dict, provide renaming @@ -275,7 +301,7 @@ def test_transform(string_series): expected.columns = ["foo", "bar"] expected = expected.unstack().rename("series") - result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}, by_row=by_row) tm.assert_series_equal(result.reindex_like(expected), expected) @@ -352,16 +378,17 @@ def test_demo(): tm.assert_series_equal(result, expected) -def test_agg_apply_evaluate_lambdas_the_same(string_series): - # test that we are evaluating row-by-row first - # before vectorized evaluation - result = string_series.apply(lambda x: str(x)) - expected = string_series.agg(lambda x: str(x)) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("func", [str, lambda x: str(x)]) +def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row): + # test that we are evaluating row-by-row first if by_row=True + # else vectorized evaluation + result = string_series.apply(func, by_row=by_row) - result = string_series.apply(str) - expected = string_series.agg(str) - tm.assert_series_equal(result, expected) + if by_row: + expected = string_series.map(func) + tm.assert_series_equal(result, expected) + else: + assert result == str(string_series) def test_with_nested_series(datetime_series): @@ -382,7 +409,7 @@ def test_with_nested_series(datetime_series): tm.assert_frame_equal(result, expected) -def test_replicate_describe(string_series): +def test_replicate_describe(string_series, by_row): # this also tests a result set that is all scalars expected = string_series.describe() result = string_series.apply( @@ -395,7 +422,8 @@ def test_replicate_describe(string_series): "50%": "median", "75%": lambda x: x.quantile(0.75), "max": "max", - } + }, + by_row=by_row, ) tm.assert_series_equal(result, expected) @@ -411,27 +439,33 @@ def test_reduce(string_series): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("how", ["agg", "apply"]) -def test_non_callable_aggregates(how): +@pytest.mark.parametrize( + "how, kwds", + [("agg", {}), ("apply", {"by_row": True}), ("apply", {"by_row": False})], +) +def test_non_callable_aggregates(how, kwds): # test agg using non-callable series attributes # GH 39116 - expand to apply s = Series([1, 2, None]) # Calling agg w/ just a string arg same as calling s.arg - result = getattr(s, how)("size") + result = getattr(s, how)("size", **kwds) expected = s.size assert result == expected # test when mixed w/ callable reducers - result = getattr(s, how)(["size", "count", "mean"]) + result = getattr(s, how)(["size", "count", "mean"], **kwds) expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) tm.assert_series_equal(result, expected) + result = getattr(s, how)({"size": "size", "count": "count", "mean": "mean"}, **kwds) + tm.assert_series_equal(result, expected) + -def test_series_apply_no_suffix_index(): +def test_series_apply_no_suffix_index(by_row): # GH36189 s = Series([4] * 3) - result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], by_row=by_row) expected = Series([12, 12, 12], index=["sum", "", ""]) tm.assert_series_equal(result, expected) From af0417d19de1ab4c3cc30dfccd5475926bd65047 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 4 Jun 2023 15:36:20 +0100 Subject: [PATCH 10/11] update according to comments --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/apply.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 3ca6fc92183bc..9f2171a6aba9a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -101,7 +101,7 @@ Other enhancements - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) -- Added a new parameter ``array_ops_only`` to :meth:`Series.apply`. When set to ``True`` the supplied callables will always operate on the whole Series (:issue:`53400`). +- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`). - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a6b2492473886..37c6288e53cb3 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -716,8 +716,8 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" - # dispatch to agg - if is_list_like(self.func) or is_dict_like(self.func): + # dispatch to handle list-like or dict-like + if is_list_like(self.func): return self.apply_list_or_dict_like() # all empty @@ -1105,8 +1105,8 @@ def apply(self) -> DataFrame | Series: if len(obj) == 0: return self.apply_empty_result() - # dispatch to agg - if is_list_like(self.func) or is_dict_like(self.func): + # dispatch to handle list-like or dict-like + if is_list_like(self.func): return self.apply_list_or_dict_like() if isinstance(self.func, str): From 85649683aecbd4f30e558b318a0ae35eef188cba Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 4 Jun 2023 17:13:32 +0100 Subject: [PATCH 11/11] kwds -> kwargs --- pandas/core/apply.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 37c6288e53cb3..007dd2bb2a89d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -397,7 +397,7 @@ def agg_or_apply_dict_like( obj = self.obj func = cast(AggFuncTypeDict, self.func) - kwds = {"by_row": False} if op_name == "apply" else {} + kwargs = {"by_row": False} if op_name == "apply" else {} if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") @@ -430,14 +430,14 @@ def agg_or_apply_dict_like( if is_groupby: engine = self.kwargs.get("engine", None) engine_kwargs = self.kwargs.get("engine_kwargs", None) - kwds.update({"engine": engine, "engine_kwargs": engine_kwargs}) + kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs}) with context_manager: if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(selection, ndim=1) result_data = [ - getattr(colg, op_name)(how, **kwds) for _, how in func.items() + getattr(colg, op_name)(how, **kwargs) for _, how in func.items() ] result_index = list(func.keys()) elif is_non_unique_col: @@ -453,7 +453,9 @@ def agg_or_apply_dict_like( label_to_indices[label].append(index) key_data = [ - getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwds) + getattr(selected_obj._ixs(indice, axis=1), op_name)( + how, **kwargs + ) for label, indices in label_to_indices.items() for indice in indices ] @@ -463,7 +465,7 @@ def agg_or_apply_dict_like( else: # key used for column selection and output result_data = [ - getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwds) + getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs) for key, how in func.items() ] result_index = list(func.keys())