Skip to content

BUG: fix Series.apply(..., by_row), v2. #53601

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 29, 2023
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Other enhancements
- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
- :meth:`SeriesGroupby.transform` and :meth:`DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`)
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
- Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`).
- Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
Expand Down
74 changes: 69 additions & 5 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def frame_apply(
axis: Axis = 0,
raw: bool = False,
result_type: str | None = None,
by_row: Literal[False, "compat"] = "compat",
args=None,
kwargs=None,
) -> FrameApply:
Expand All @@ -100,6 +101,7 @@ def frame_apply(
func,
raw=raw,
result_type=result_type,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand All @@ -115,11 +117,16 @@ def __init__(
raw: bool,
result_type: str | None,
*,
by_row: bool | Literal["compat"] = True,
args,
kwargs,
) -> None:
self.obj = obj
self.raw = raw

assert isinstance(by_row, bool) or by_row == "compat"
self.by_row = by_row

self.args = args or ()
self.kwargs = kwargs or {}

Expand Down Expand Up @@ -304,7 +311,14 @@ def agg_or_apply_list_like(
func = cast(List[AggFuncTypeBase], self.func)
kwargs = self.kwargs
if op_name == "apply":
kwargs = {**kwargs, "by_row": False}
if isinstance(self, FrameApply):
by_row = self.by_row

elif isinstance(self, SeriesApply):
by_row = "compat" if self.by_row else False
else:
by_row = False
kwargs = {**kwargs, "by_row": by_row}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand Down Expand Up @@ -397,7 +411,15 @@ def agg_or_apply_dict_like(

obj = self.obj
func = cast(AggFuncTypeDict, self.func)
kwargs = {"by_row": False} if op_name == "apply" else {}
kwargs = {}
if op_name == "apply":
if isinstance(self, FrameApply):
by_row = self.by_row
elif isinstance(self, SeriesApply) and self.by_row:
by_row = "compat"
else:
by_row = False
kwargs.update({"by_row": by_row})

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand Down Expand Up @@ -678,6 +700,23 @@ def agg_axis(self) -> Index:
class FrameApply(NDFrameApply):
obj: DataFrame

def __init__(
self,
obj: AggObjType,
func: AggFuncType,
raw: bool,
result_type: str | None,
*,
by_row: Literal[False, "compat"] = False,
args,
kwargs,
) -> None:
if by_row is not False and by_row != "compat":
raise NotImplementedError(f"by_row={by_row} not implemented")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the docs, I think NotImplementedError signifies the implementation is currently incomplete, and that users can expect this to be supported once we "get around to it". Can this be a ValueError instead?

This comment was marked as resolved.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed this comment somehow. I've changed it now.

super().__init__(
obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs
)

# ---------------------------------------------------------------
# Abstract Methods

Expand Down Expand Up @@ -1067,15 +1106,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
class SeriesApply(NDFrameApply):
obj: Series
axis: AxisInt = 0
by_row: bool # only relevant for apply()
by_row: bool | Literal["compat"] # only relevant for apply()

def __init__(
self,
obj: Series,
func: AggFuncType,
*,
convert_dtype: bool | lib.NoDefault = lib.no_default,
by_row: bool = True,
by_row: bool | Literal["compat"] = True,
args,
kwargs,
) -> None:
Expand All @@ -1090,13 +1129,13 @@ def __init__(
stacklevel=find_stack_level(),
)
self.convert_dtype = convert_dtype
self.by_row = by_row

super().__init__(
obj,
func,
raw=False,
result_type=None,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand All @@ -1115,6 +1154,9 @@ def apply(self) -> DataFrame | Series:
# if we are a string, try to dispatch
return self.apply_str()

if self.by_row == "compat":
return self.apply_compat()

# self.func is Callable
return self.apply_standard()

Expand Down Expand Up @@ -1149,6 +1191,28 @@ def apply_empty_result(self) -> Series:
obj, method="apply"
)

def apply_compat(self):
    """Apply ``self.func`` in backwards-compatible ("compat") mode.

    Used for each callable when giving listlikes and dictlikes of callables to
    apply. Needed for compatibility with pandas < v2.1.

    .. versionadded:: 2.1.0
    """
    target = self.obj
    fn = self.func

    # A callable that ``com.get_cython_func`` recognises, called without extra
    # args/kwargs, is handed the whole object at once.
    if callable(fn):
        cython_fn = com.get_cython_func(fn)
        if cython_fn and not self.args and not self.kwargs:
            return target.apply(fn, by_row=False)

    # Otherwise try the by-row call first; if it raises one of the errors
    # below, retry with the whole object.
    try:
        return target.apply(fn, by_row=True)
    except (ValueError, AttributeError, TypeError):
        return target.apply(fn, by_row=False)

def apply_standard(self) -> DataFrame | Series:
# caller is responsible for ensuring that f is Callable
func = cast(Callable, self.func)
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9611,6 +9611,7 @@ def apply(
raw: bool = False,
result_type: Literal["expand", "reduce", "broadcast"] | None = None,
args=(),
by_row: Literal["compat", False] = "compat",
**kwargs,
):
"""
Expand Down Expand Up @@ -9659,6 +9660,19 @@ def apply(
args : tuple
Positional arguments to pass to `func` in addition to the
array/series.
by_row : False or "compat", default "compat"
If "compat", the func will first, if possible, be translated into a pandas
method (e.g. ``Series().apply(np.sum)`` will be translated to
``Series().sum()``). If that doesn't work, apply will be tried again with
``by_row=True``, and if that fails, apply will be called again with
``by_row=False``.
If False, the funcs will be passed the whole Series at once.
``by_row`` only has effect when ``func`` is a listlike or dictlike of funcs
and the func isn't a string.
``by_row=True`` has not been implemented, and will raise a
``NotImplementedError``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it'd be good to have the callout on this only applying to list/dict-likes at the beginning, and adding in that this is compatible with previous versions. What do you think about being more vague about the compat behavior instead of trying to detail it out? Something like

        by_row : False or "compat", default "compat"
            Only has an effect when ``func`` is a listlike or dictlike of funcs
            on the values that aren't NumPy functions (e.g. ``np.sum``) or 
            string-aliases for operations (e.g. ``"sum"``). 
            "compat" is backwards compatible with previous versions and will
            sometimes operate by row and sometimes operate on the whole Series at once.
            If False, the funcs will be passed the whole Series at once.

I'm also okay with keeping the more detailed description of compat if you prefer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My preference is for the other version, if that's ok. I changed it a bit though.


.. versionadded:: 2.1.0
**kwargs
Additional keyword arguments to pass as keywords arguments to
`func`.
Expand Down Expand Up @@ -9758,6 +9772,7 @@ def apply(
axis=axis,
raw=raw,
result_type=result_type,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4509,7 +4509,7 @@ def apply(
convert_dtype: bool | lib.NoDefault = lib.no_default,
args: tuple[Any, ...] = (),
*,
by_row: bool = True,
by_row: bool | Literal["compat"] = True,
**kwargs,
) -> DataFrame | Series:
"""
Expand Down Expand Up @@ -4537,10 +4537,17 @@ def apply(
instead if you want ``convert_dtype=False``.
args : tuple
Positional arguments passed to func after the series value.
by_row : bool, default True
by_row : bool or "compat", default True
If False, the func will be passed the whole Series at once.
If True, the func will be passed each element of the Series, like
Series.map (backward compatible).
``Series.map`` (backward compatible).
If "compat", the func will first, if possible, be translated into a pandas
method (e.g. ``Series().apply(np.sum)`` will be translated to
``Series().sum()``). If that doesn't work, apply will be tried again with
``by_row=True``, and if that fails, apply will be called again with
``by_row=False``. Added for backwards compatibility; should not be used
directly.
``by_row`` has no effect when ``func`` is a string.

.. versionadded:: 2.1.0
**kwargs
Expand Down
96 changes: 96 additions & 0 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,50 @@ def test_infer_row_shape():
assert result == (6, 2)


@pytest.mark.parametrize(
    "ops, by_row, expected",
    [
        ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})),
        ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})),
        ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})),
        ({"a": lambda x: x.sum()}, False, Series({"a": 3})),
        (
            {"a": ["sum", np.sum, lambda x: x.sum()]},
            "compat",
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        (
            {"a": ["sum", np.sum, lambda x: x.sum()]},
            False,
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})),
        ({"a": lambda x: 1}, False, Series({"a": 1})),
    ],
)
def test_dictlike_lambda(ops, by_row, expected):
    # GH53601
    # A dict-like of funcs is applied to a one-column frame; each case pins
    # the result for by_row="compat" or by_row=False.
    frame = DataFrame({"a": [1, 2]})
    tm.assert_equal(frame.apply(ops, by_row=by_row), expected)


@pytest.mark.parametrize(
    "ops",
    [
        {"a": lambda x: x + 1},
        {"a": lambda x: x.sum()},
        {"a": ["sum", np.sum, lambda x: x.sum()]},
        {"a": lambda x: 1},
    ],
)
def test_dictlike_lambda_raises(ops):
    # GH53601
    # by_row=True is rejected by DataFrame.apply for dict-likes of funcs.
    frame = DataFrame({"a": [1, 2]})
    msg = "by_row=True not implemented"
    with pytest.raises(NotImplementedError, match=msg):
        frame.apply(ops, by_row=True)


def test_with_dictlike_columns():
# GH 17602
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
Expand Down Expand Up @@ -716,6 +760,58 @@ def test_with_dictlike_columns_with_infer():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "ops, by_row, expected",
    [
        ([lambda x: x + 1], "compat", DataFrame({("a", "<lambda>"): [2, 3]})),
        ([lambda x: x + 1], False, DataFrame({("a", "<lambda>"): [2, 3]})),
        ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=["<lambda>"])),
        ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=["<lambda>"])),
        (
            ["sum", np.sum, lambda x: x.sum()],
            "compat",
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        (
            ["sum", np.sum, lambda x: x.sum()],
            False,
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        (
            [lambda x: x + 1, lambda x: 3],
            "compat",
            DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["<lambda>", "<lambda>"]]),
        ),
        (
            [lambda x: 2, lambda x: 3],
            False,
            DataFrame({"a": [2, 3]}, ["<lambda>", "<lambda>"]),
        ),
    ],
)
def test_listlike_lambda(ops, by_row, expected):
    # GH53601
    # A list-like of funcs is applied to a one-column frame; each case pins
    # the result for by_row="compat" or by_row=False.
    frame = DataFrame({"a": [1, 2]})
    tm.assert_equal(frame.apply(ops, by_row=by_row), expected)


@pytest.mark.parametrize(
    "ops",
    [
        [lambda x: x + 1],
        [lambda x: x.sum()],
        ["sum", np.sum, lambda x: x.sum()],
        [lambda x: x + 1, lambda x: 3],
    ],
)
def test_listlike_lambda_raises(ops):
    # GH53601
    # by_row=True is rejected by DataFrame.apply for list-likes of funcs.
    frame = DataFrame({"a": [1, 2]})
    msg = "by_row=True not implemented"
    with pytest.raises(NotImplementedError, match=msg):
        frame.apply(ops, by_row=True)


def test_with_listlike_columns():
# GH 17348
df = DataFrame(
Expand Down
10 changes: 2 additions & 8 deletions pandas/tests/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,6 @@ def f(x):
expected = s.map(f)
tm.assert_series_equal(result, expected)

s = Series([1, 2, 3])
result = s.apply(f, by_row=by_row)
expected = s.map(f)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("convert_dtype", [True, False])
def test_apply_convert_dtype_deprecated(convert_dtype):
Expand Down Expand Up @@ -435,7 +430,7 @@ def test_with_nested_series(datetime_series, op_name):
tm.assert_frame_equal(result, expected)


def test_replicate_describe(string_series, by_row):
def test_replicate_describe(string_series):
# this also tests a result set that is all scalars
expected = string_series.describe()
result = string_series.apply(
Expand All @@ -449,7 +444,6 @@ def test_replicate_describe(string_series, by_row):
"75%": lambda x: x.quantile(0.75),
"max": "max",
},
by_row=by_row,
)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -617,7 +611,7 @@ def test_apply_listlike_transformer(string_series, ops, names, by_row):
([lambda x: x.sum()], Series([6], index=["<lambda>"])),
],
)
def test_apply_listlike_lambda(ops, expected, by_row=by_row):
def test_apply_listlike_lambda(ops, expected, by_row):
# GH53400
ser = Series([1, 2, 3])
result = ser.apply(ops, by_row=by_row)
Expand Down