Skip to content

BUG: fix Series.apply(..., by_row), v2. #53601

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 29, 2023
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ Other enhancements
- :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`)
- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
- Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`).
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
Expand Down
57 changes: 52 additions & 5 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def frame_apply(
axis: Axis = 0,
raw: bool = False,
result_type: str | None = None,
by_row: bool | Literal["compat"] = "compat",
args=None,
kwargs=None,
) -> FrameApply:
Expand All @@ -100,6 +101,7 @@ def frame_apply(
func,
raw=raw,
result_type=result_type,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand All @@ -115,11 +117,16 @@ def __init__(
raw: bool,
result_type: str | None,
*,
by_row: bool | Literal["compat"] = True,
args,
kwargs,
) -> None:
self.obj = obj
self.raw = raw

assert isinstance(by_row, bool) or by_row == "compat"
self.by_row = by_row

self.args = args or ()
self.kwargs = kwargs or {}

Expand Down Expand Up @@ -304,7 +311,14 @@ def agg_or_apply_list_like(
func = cast(List[AggFuncTypeBase], self.func)
kwargs = self.kwargs
if op_name == "apply":
kwargs = {**kwargs, "by_row": False}
if isinstance(self, FrameApply):
by_row = self.by_row

elif isinstance(self, SeriesApply):
by_row = "compat" if self.by_row else False
else:
by_row = False
kwargs = {**kwargs, "by_row": by_row}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand Down Expand Up @@ -397,7 +411,15 @@ def agg_or_apply_dict_like(

obj = self.obj
func = cast(AggFuncTypeDict, self.func)
kwargs = {"by_row": False} if op_name == "apply" else {}
kwargs = {}
if op_name == "apply":
if isinstance(self, FrameApply):
by_row = self.by_row
elif isinstance(self, SeriesApply) and self.by_row:
by_row = "compat"
else:
by_row = False
kwargs.update({"by_row": by_row})

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand Down Expand Up @@ -1067,15 +1089,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
class SeriesApply(NDFrameApply):
obj: Series
axis: AxisInt = 0
by_row: bool # only relevant for apply()
by_row: bool | Literal["compat"] # only relevant for apply()

def __init__(
self,
obj: Series,
func: AggFuncType,
*,
convert_dtype: bool | lib.NoDefault = lib.no_default,
by_row: bool = True,
by_row: bool | Literal["compat"] = True,
args,
kwargs,
) -> None:
Expand All @@ -1090,13 +1112,13 @@ def __init__(
stacklevel=find_stack_level(),
)
self.convert_dtype = convert_dtype
self.by_row = by_row

super().__init__(
obj,
func,
raw=False,
result_type=None,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand All @@ -1115,6 +1137,9 @@ def apply(self) -> DataFrame | Series:
# if we are a string, try to dispatch
return self.apply_str()

if self.by_row == "compat":
return self.apply_compat()

# self.func is Callable
return self.apply_standard()

Expand Down Expand Up @@ -1149,6 +1174,28 @@ def apply_empty_result(self) -> Series:
obj, method="apply"
)

def apply_compat(self):
    """Compat apply method.

    Used for each callable when giving listlikes and dictlikes of callables to
    apply. Needed for compatibility with pandas < v2.1.

    .. versionadded:: 2.1.0
    """
    obj = self.obj
    func = self.func

    # Fast path: a plain callable with a known cython/pandas equivalent
    # (e.g. np.sum) is dispatched on the whole Series. Only safe when there
    # are no extra args/kwargs to forward to the callable.
    if callable(func):
        f = com.get_cython_func(func)
        if f and not self.args and not self.kwargs:
            return obj.apply(func, by_row=False)

    # Pre-2.1 behavior: try elementwise application first; if that raises,
    # fall back to passing the whole Series to the callable at once.
    try:
        result = obj.apply(func, by_row=True)
    except (ValueError, AttributeError, TypeError):
        result = obj.apply(func, by_row=False)
    return result

def apply_standard(self) -> DataFrame | Series:
# caller is responsible for ensuring that f is Callable
func = cast(Callable, self.func)
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9604,6 +9604,7 @@ def apply(
raw: bool = False,
result_type: Literal["expand", "reduce", "broadcast"] | None = None,
args=(),
by_row: Literal["compat", False] = "compat",
**kwargs,
):
"""
Expand Down Expand Up @@ -9652,6 +9653,18 @@ def apply(
args : tuple
Positional arguments to pass to `func` in addition to the
array/series.
by_row : bool or "compat", default "compat"
If "compat", will if possible first translate the func into pandas
methods (e.g. ``Series().apply(np.sum)`` will be translated to
``Series().sum()``). If that doesn't work, will try call to apply again with
``by_row=True`` and if that fails, will call apply again with
``by_row=False``
If True, gives the same results as for "compat".
If False, the funcs will be passed the whole Series at once.
``by_row`` only has effect when ``func`` is a listlike or dictlike of funcs
and the func isn't a string.

.. versionadded:: 2.1.0
**kwargs
Additional keyword arguments to pass as keywords arguments to
`func`.
Expand Down Expand Up @@ -9751,6 +9764,7 @@ def apply(
axis=axis,
raw=raw,
result_type=result_type,
by_row=by_row,
args=args,
kwargs=kwargs,
)
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4503,7 +4503,7 @@ def apply(
convert_dtype: bool | lib.NoDefault = lib.no_default,
args: tuple[Any, ...] = (),
*,
by_row: bool = True,
by_row: bool | Literal["compat"] = True,
**kwargs,
) -> DataFrame | Series:
"""
Expand Down Expand Up @@ -4531,10 +4531,17 @@ def apply(
instead if you want ``convert_dtype=False``.
args : tuple
Positional arguments passed to func after the series value.
by_row : bool, default True
by_row : bool or "compat", default True
If False, the func will be passed the whole Series at once.
If True, func will be passed each element of the Series, like
Series.map (backward compatible).
``Series.map`` (backward compatible).
If "compat", will if possible first translate the func into pandas
methods (e.g. ``Series().apply(np.sum)`` will be translated to
``Series().sum()``). If that doesn't work, will try call to apply again with
``by_row=True`` and if that fails, will call apply again with
``by_row=False``. Added for backwards compatibility, should not be used
directly.
``by_row`` has no effect when ``func`` is a string.

.. versionadded:: 2.1.0
**kwargs
Expand Down
66 changes: 66 additions & 0 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,34 @@ def test_infer_row_shape():
assert result == (6, 2)


@pytest.mark.parametrize(
    "ops, by_row, expected",
    [
        # elementwise dict funcs behave the same for "compat" and False
        ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})),
        ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})),
        ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})),
        ({"a": lambda x: x.sum()}, False, Series({"a": 3})),
        (
            {"a": ["sum", np.sum, lambda x: x.sum()]},
            "compat",
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        (
            {"a": ["sum", np.sum, lambda x: x.sum()]},
            False,
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        # a scalar-returning lambda differs: broadcast under "compat",
        # reduced to a Series under False
        ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})),
        ({"a": lambda x: 1}, False, Series({"a": 1})),
    ],
)
def test_dictlike_lambda(ops, by_row, expected):
    # GH53601
    frame = DataFrame({"a": [1, 2]})
    tm.assert_equal(frame.apply(ops, by_row=by_row), expected)


def test_with_dictlike_columns():
# GH 17602
df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
Expand Down Expand Up @@ -716,6 +744,44 @@ def test_with_dictlike_columns_with_infer():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "ops, by_row, expected",
    [
        # elementwise listlike funcs agree across all by_row modes
        ([lambda x: x + 1], "compat", DataFrame({("a", "<lambda>"): [2, 3]})),
        ([lambda x: x + 1], True, DataFrame({("a", "<lambda>"): [2, 3]})),
        ([lambda x: x + 1], False, DataFrame({("a", "<lambda>"): [2, 3]})),
        ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=["<lambda>"])),
        ([lambda x: x.sum()], True, DataFrame({"a": [3]}, index=["<lambda>"])),
        ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=["<lambda>"])),
        (
            ["sum", np.sum, lambda x: x.sum()],
            "compat",
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        (
            ["sum", np.sum, lambda x: x.sum()],
            False,
            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
        ),
        # scalar-returning lambdas: broadcast under "compat", reduced
        # under False
        (
            [lambda x: x + 1, lambda x: 3],
            "compat",
            DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["<lambda>", "<lambda>"]]),
        ),
        (
            [lambda x: 2, lambda x: 3],
            False,
            DataFrame({"a": [2, 3]}, ["<lambda>", "<lambda>"]),
        ),
    ],
)
def test_listlike_lambda(ops, by_row, expected):
    # GH53601
    frame = DataFrame({"a": [1, 2]})
    tm.assert_equal(frame.apply(ops, by_row=by_row), expected)


def test_with_listlike_columns():
# GH 17348
df = DataFrame(
Expand Down
10 changes: 2 additions & 8 deletions pandas/tests/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,6 @@ def f(x):
expected = s.map(f)
tm.assert_series_equal(result, expected)

s = Series([1, 2, 3])
result = s.apply(f, by_row=by_row)
expected = s.map(f)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("convert_dtype", [True, False])
def test_apply_convert_dtype_deprecated(convert_dtype):
Expand Down Expand Up @@ -422,7 +417,7 @@ def test_with_nested_series(datetime_series, op_name):
tm.assert_frame_equal(result, expected)


def test_replicate_describe(string_series, by_row):
def test_replicate_describe(string_series):
# this also tests a result set that is all scalars
expected = string_series.describe()
result = string_series.apply(
Expand All @@ -436,7 +431,6 @@ def test_replicate_describe(string_series, by_row):
"75%": lambda x: x.quantile(0.75),
"max": "max",
},
by_row=by_row,
)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -604,7 +598,7 @@ def test_apply_listlike_transformer(string_series, ops, names, by_row):
([lambda x: x.sum()], Series([6], index=["<lambda>"])),
],
)
def test_apply_listlike_lambda(ops, expected, by_row=by_row):
def test_apply_listlike_lambda(ops, expected, by_row):
# GH53400
ser = Series([1, 2, 3])
result = ser.apply(ops, by_row=by_row)
Expand Down