Skip to content

POC: Don't special case Python builtin and NumPy functions #53426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 16 additions & 24 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,7 @@ def agg(self) -> DataFrame | Series | None:
Result of aggregation, or None if agg cannot be performed by
this method.
"""
obj = self.obj
arg = self.f
args = self.args
kwargs = self.kwargs

if isinstance(arg, str):
return self.apply_str()
Expand All @@ -173,11 +170,6 @@ def agg(self) -> DataFrame | Series | None:
# we require a list, but not a 'str'
return self.agg_list_like()

if callable(arg):
f = com.get_cython_func(arg)
if f and not args and not kwargs:
return getattr(obj, f)()

# caller can react
return None

Expand Down Expand Up @@ -283,11 +275,6 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
if isinstance(func, str):
return self._apply_str(obj, func, *args, **kwargs)

if not args and not kwargs:
f = com.get_cython_func(func)
if f:
return getattr(obj, f)()

# Two possible ways to use a UDF - apply or call directly
try:
return obj.apply(func, args=args, **kwargs)
Expand Down Expand Up @@ -1097,18 +1084,23 @@ def agg(self):
# string, list-like, and dict-like are entirely handled in super
assert callable(f)

# try a regular apply, this evaluates lambdas
# row-by-row; however if the lambda is expected a Series
# expression, e.g.: lambda x: x-x.quantile(0.25)
# this will fail, so we can try a vectorized evaluation

# we cannot FIRST try the vectorized evaluation, because
# then .agg and .apply would have different semantics if the
# operation is actually defined on the Series, e.g. str
try:
result = self.obj.apply(f)
except (ValueError, AttributeError, TypeError):
has_cython_func = f in com._cython_table
if has_cython_func and not self.args and not self.kwargs:
# previous versions would vectorize NumPy functions
result = f(self.obj)
else:
# try a regular apply, this evaluates lambdas
# row-by-row; however if the lambda is expecting a Series
# expression, e.g.: lambda x: x-x.quantile(0.25)
# this will fail, so we can try a vectorized evaluation

# we cannot FIRST try the vectorized evaluation, because
# then .agg and .apply would have different semantics if the
# operation is actually defined on the Series, e.g. str
try:
result = self.obj.apply(f)
except (ValueError, AttributeError, TypeError):
result = f(self.obj)

return result

Expand Down
24 changes: 0 additions & 24 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,15 +556,6 @@ def require_length_match(data, index: Index) -> None:
)


# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0,
# whereas np.min and np.max (which directly call obj.min and obj.max)
# default to axis=None.
_builtin_table = {
builtins.sum: np.sum,
builtins.max: np.maximum.reduce,
builtins.min: np.minimum.reduce,
}

_cython_table = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can `_cython_table` also be removed?

Copy link
Member Author

@rhshadrach rhshadrach May 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to know whether the passed function is a NumPy function (maybe there is a better way to do this?) for backwards compatibility. This is because Series.apply treats these differently.

builtins.sum: "sum",
builtins.max: "max",
Expand Down Expand Up @@ -594,21 +585,6 @@ def require_length_match(data, index: Index) -> None:
}


def get_cython_func(arg: Callable) -> str | None:
"""
if we define an internal function for this argument, return it
"""
return _cython_table.get(arg)


def is_builtin_func(arg):
"""
if we define a builtin function for this argument, return it,
otherwise return the arg
"""
return _builtin_table.get(arg, arg)


def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
"""
If a name is missing then replace it by level_n, where n is the count
Expand Down
6 changes: 0 additions & 6 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
return ret

else:
cyfunc = com.get_cython_func(func)
if cyfunc and not args and not kwargs:
return getattr(self, cyfunc)()

if self.ngroups == 0:
# e.g. test_evaluate_with_empty_groups without any groups to
# iterate over, we have no output on which to do dtype
Expand Down Expand Up @@ -297,7 +293,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
agg = aggregate

def _python_agg_general(self, func, *args, **kwargs):
func = com.is_builtin_func(func)
f = lambda x: func(x, *args, **kwargs)

obj = self._obj_with_exclusions
Expand Down Expand Up @@ -1463,7 +1458,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
agg = aggregate

def _python_agg_general(self, func, *args, **kwargs):
func = com.is_builtin_func(func)
f = lambda x: func(x, *args, **kwargs)

if self.ngroups == 0:
Expand Down
5 changes: 0 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1459,8 +1459,6 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
)
)
def apply(self, func, *args, **kwargs) -> NDFrameT:
func = com.is_builtin_func(func)

if isinstance(func, str):
if hasattr(self, func):
res = getattr(self, func)
Expand Down Expand Up @@ -1659,9 +1657,6 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
func, *args, engine_kwargs=engine_kwargs, **kwargs
)

# optimized transforms
func = com.get_cython_func(func) or func

if not isinstance(func, str):
return self._transform_general(func, *args, **kwargs)

Expand Down
3 changes: 0 additions & 3 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import pandas.core.algorithms as algos
from pandas.core.apply import ResamplerWindowApply
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.generic import (
NDFrame,
_shared_docs,
Expand Down Expand Up @@ -1414,7 +1413,6 @@ def _downsample(self, how, **kwargs):
how : string / cython mapped function
**kwargs : kw args passed to how function
"""
how = com.get_cython_func(how) or how
ax = self.ax

# Excludes `on` column when provided
Expand Down Expand Up @@ -1566,7 +1564,6 @@ def _downsample(self, how, **kwargs):
if self.kind == "timestamp":
return super()._downsample(how, **kwargs)

how = com.get_cython_func(how) or how
ax = self.ax

if is_subperiod(ax.freq, self.freq):
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1502,13 +1502,14 @@ def foo2(x, b=2, c=0):

def test_agg_std():
df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])
expected_value = 1.632993161855452

result = df.agg(np.std)
expected = Series({"A": 2.0, "B": 2.0}, dtype=float)
expected = Series({"A": expected_value, "B": expected_value}, dtype=float)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The different `ddof` here will be important to communicate in a possible deprecation.

tm.assert_series_equal(result, expected)

result = df.agg([np.std])
expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
expected = DataFrame({"A": expected_value, "B": expected_value}, index=["std"])
tm.assert_frame_equal(result, expected)


Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,15 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
)
def test_agg_cython_table_raises_series(series, func, expected):
# GH21224
msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
if func == "median" or func is np.nanmedian or func is np.median:
msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
msg = "|".join(
[
"Cannot convert",
"[Cc]ould not convert",
"can't multiply sequence by non-int of type",
"not supported for the input types",
"unsupported operand type",
]
)

with pytest.raises(expected, match=msg):
# e.g. Series('a b'.split()).cumprod() will raise
Expand Down
159 changes: 0 additions & 159 deletions pandas/tests/apply/test_str.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from itertools import chain
import operator

import numpy as np
import pytest

from pandas.core.dtypes.common import is_number

from pandas import (
DataFrame,
Series,
Expand Down Expand Up @@ -86,162 +83,6 @@ def test_apply_np_transformer(float_frame, op, how):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "series, func, expected",
    chain(
        # Empty float Series: reductions return their identity / NaN
        # defaults (sum -> 0, prod -> 1, min/max/mean/std/var/median -> NaN).
        tm.get_cython_table_params(
            Series(dtype=np.float64),
            [
                ("sum", 0),
                ("max", np.nan),
                ("min", np.nan),
                ("all", True),
                ("any", False),
                ("mean", np.nan),
                ("prod", 1),
                ("std", np.nan),
                ("var", np.nan),
                ("median", np.nan),
            ],
        ),
        # Numeric Series with a leading NaN: the expected values show the
        # NaN is skipped by every reduction (e.g. sum([nan,1,2,3]) == 6).
        tm.get_cython_table_params(
            Series([np.nan, 1, 2, 3]),
            [
                ("sum", 6),
                ("max", 3),
                ("min", 1),
                ("all", True),
                ("any", True),
                ("mean", 2),
                ("prod", 6),
                ("std", 1),
                ("var", 1),
                ("median", 2),
            ],
        ),
        # Object-dtype (string) Series: only the reductions defined for
        # strings are exercised.
        tm.get_cython_table_params(
            Series("a b c".split()),
            [
                ("sum", "abc"),
                ("max", "c"),
                ("min", "a"),
                ("all", True),
                ("any", True),
            ],
        ),
    ),
)
def test_agg_cython_table_series(series, func, expected):
    """Series.agg with each reducing func/alias in the cython table
    produces the expected scalar (GH21224)."""
    # GH21224
    # test reducing functions in
    # pandas.core.base.SelectionMixin._cython_table
    result = series.agg(func)
    if is_number(expected):
        # equal_nan=True so the NaN expectations (empty-Series cases)
        # compare equal rather than failing NaN != NaN
        assert np.isclose(result, expected, equal_nan=True)
    else:
        assert result == expected


@pytest.mark.parametrize(
    "series, func, expected",
    chain(
        # Empty float Series: cumulative ops return an empty Series.
        tm.get_cython_table_params(
            Series(dtype=np.float64),
            [
                ("cumprod", Series([], dtype=np.float64)),
                ("cumsum", Series([], dtype=np.float64)),
            ],
        ),
        # Leading NaN stays NaN; subsequent values accumulate normally.
        tm.get_cython_table_params(
            Series([np.nan, 1, 2, 3]),
            [
                ("cumprod", Series([np.nan, 1, 2, 6])),
                ("cumsum", Series([np.nan, 1, 3, 6])),
            ],
        ),
        # Object dtype: cumsum concatenates the strings cumulatively.
        tm.get_cython_table_params(
            Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))]
        ),
    ),
)
def test_agg_cython_table_transform_series(series, func, expected):
    """Series.agg with the transforming cython-table funcs (cumprod,
    cumsum) returns an elementwise Series matching the input (GH21224)."""
    # GH21224
    # test transforming functions in
    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
    result = series.agg(func)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "df, func, expected",
    chain(
        # Empty DataFrame: reductions yield an empty Series of the
        # appropriate dtype (bool for all/any, float64 otherwise).
        tm.get_cython_table_params(
            DataFrame(),
            [
                ("sum", Series(dtype="float64")),
                ("max", Series(dtype="float64")),
                ("min", Series(dtype="float64")),
                ("all", Series(dtype=bool)),
                ("any", Series(dtype=bool)),
                ("mean", Series(dtype="float64")),
                ("prod", Series(dtype="float64")),
                ("std", Series(dtype="float64")),
                ("var", Series(dtype="float64")),
                ("median", Series(dtype="float64")),
            ],
        ),
        # 2x2 frame with one NaN: the expected per-column results show the
        # NaN is skipped where possible (std/var of a single value stay NaN).
        tm.get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]),
            [
                ("sum", Series([1.0, 3])),
                ("max", Series([1.0, 2])),
                ("min", Series([1.0, 1])),
                ("all", Series([True, True])),
                ("any", Series([True, True])),
                ("mean", Series([1, 1.5])),
                ("prod", Series([1.0, 2])),
                ("std", Series([np.nan, 0.707107])),
                ("var", Series([np.nan, 0.5])),
                ("median", Series([1, 1.5])),
            ],
        ),
    ),
)
def test_agg_cython_table_frame(df, func, expected, axis):
    """DataFrame.agg with each reducing cython-table func collapses the
    frame to a Series (GH 21224).

    NOTE(review): ``axis`` is not parametrized here — presumably a pytest
    fixture defined elsewhere in the test suite; verify before reuse.
    """
    # GH 21224
    # test reducing functions in
    # pandas.core.base.SelectionMixin._cython_table
    result = df.agg(func, axis=axis)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "df, func, expected",
    chain(
        # Empty frame in, empty frame out for both cumulative ops.
        tm.get_cython_table_params(
            DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
        ),
        # NaN stays NaN in its cell; the second row accumulates down
        # each column (cumsum column 1: 1 then 1+2 = 3).
        tm.get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]),
            [
                ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
                ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
            ],
        ),
    ),
)
def test_agg_cython_table_transform_frame(df, func, expected, axis):
    """DataFrame.agg with transforming cython-table funcs (cumprod,
    cumsum) preserves the frame's shape (GH 21224).

    NOTE(review): ``axis`` is not parametrized here — presumably a pytest
    fixture defined elsewhere in the test suite; verify before reuse.
    """
    # GH 21224
    # test transforming functions in
    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
    if axis in ("columns", 1):
        # operating blockwise doesn't let us preserve dtypes
        expected = expected.astype("float64")

    result = df.agg(func, axis=axis)
    tm.assert_frame_equal(result, expected)


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests should probably, eventually, be testing for the results of the NumPy functions?

Copy link
Member Author

@rhshadrach rhshadrach May 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, I wasn't sure if this was just testing compatibility between e.g. "sum" and np.sum. It could be rewritten to test NumPy functions directly.

With this test removed, we have 1308 tests (counting different parametrizations) that pass a NumPy function into agg/apply/transform (within pandas.core.apply). 306 of those are in tests.apply.

@pytest.mark.parametrize("op", series_transform_kernels)
def test_transform_groupby_kernel_series(request, string_series, op):
# GH 35964
Expand Down
Loading