Skip to content

Make Series.groupby.transform annotation more precise #459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion pandas-stubs/_typing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ from pandas.core.generic import NDFrame
from pandas.core.groupby.grouper import Grouper
from pandas.core.indexes.base import Index
from pandas.core.series import Series
from typing_extensions import TypeAlias
from typing_extensions import (
ParamSpec,
TypeAlias,
)

from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
Expand Down Expand Up @@ -123,6 +126,7 @@ JSONSerializable: TypeAlias = Union[PythonScalar, list, dict]
Axes: TypeAlias = Union[AnyArrayLike, list, dict, range, tuple]
Renamer: TypeAlias = Union[Mapping[Any, Label], Callable[[Any], Label]]
T = TypeVar("T")
P = ParamSpec("P")
FuncType: TypeAlias = Callable[..., Any]
F = TypeVar("F", bound=FuncType)
HashableT = TypeVar("HashableT", bound=Hashable)
Expand Down Expand Up @@ -202,6 +206,27 @@ S1 = TypeVar(
Interval[Timestamp],
Interval[Timedelta],
)
S2 = TypeVar(
"S2",
str,
bytes,
datetime.date,
datetime.datetime,
datetime.time,
datetime.timedelta,
bool,
int,
float,
complex,
Timestamp,
Timedelta,
np.datetime64,
Period,
Interval[int],
Interval[float],
Interval[Timestamp],
Interval[Timedelta],
)
T1 = TypeVar(
"T1", str, int, np.int64, np.uint64, np.float64, float, np.dtype[np.generic]
)
Expand Down Expand Up @@ -285,6 +310,44 @@ GroupByObjectNonScalar: TypeAlias = Union[
list[Grouper],
]
GroupByObject: TypeAlias = Union[Scalar, GroupByObjectNonScalar]
GroupByFuncStrs: TypeAlias = Literal[
# Reduction/aggregation functions
"all",
"any",
"corrwith",
"count",
"first",
"idxmax",
"idxmin",
"last",
"max",
"mean",
"median",
"min",
"nunique",
"prod",
"quantile",
"sem",
"size",
"skew",
"std",
"sum",
"var",
# Transformation functions
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is another area where I really think a static analysis tool ought to save you from yourself and prevent you from making an illogical call. In this case, we could allow aggregation functions like sum because they work with transform(); however, they make no sense to use as transformation functions.

Similar to the last PR, I am allowing it, because pandas permits it and returns a value without erroring. But I'm personally not a big fan of that.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could always bring it up as a pandas issue for discussion.

"bfill",
"cumcount",
"cummax",
"cummin",
"cumprod",
"cumsum",
"diff",
"ffill",
"fillna",
"ngroup",
"pct_change",
"rank",
"shift",
]

StataDateFormat: TypeAlias = Literal[
"tc",
Expand Down
20 changes: 18 additions & 2 deletions pandas-stubs/core/groupby/generic.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,21 @@ from pandas.core.groupby.groupby import ( # , get_groupby as get_groupby
)
from pandas.core.groupby.grouper import Grouper
from pandas.core.series import Series
from typing_extensions import TypeAlias
from typing_extensions import (
Concatenate,
TypeAlias,
)

from pandas._typing import (
S1,
S2,
AggFuncTypeBase,
AggFuncTypeFrame,
AxisType,
GroupByFuncStrs,
Level,
ListLike,
P,
Scalar,
)

Expand Down Expand Up @@ -61,7 +67,17 @@ class SeriesGroupBy(GroupBy, Generic[S1]):
def agg(self, func: list[AggFuncTypeBase], *args, **kwargs) -> DataFrame: ...
@overload
def agg(self, func: AggFuncTypeBase, *args, **kwargs) -> Series: ...
def transform(self, func: Callable | str, *args, **kwargs) -> Series: ...
@overload
def transform(
self,
func: Callable[Concatenate[Series[S1], P], Series[S2]],
*args: P.args,
engine: Literal["cython", "numba", None] = ...,
engine_kwargs: dict[str, Any] | None = ...,
**kwargs: P.kwargs,
) -> Series[S2]: ...
@overload
def transform(self, func: GroupByFuncStrs, *args, **kwargs) -> Series: ...
def filter(self, func, dropna: bool = ..., *args, **kwargs): ...
def nunique(self, dropna: bool = ...) -> Series: ...
def describe(self, **kwargs) -> DataFrame: ...
Expand Down
24 changes: 21 additions & 3 deletions tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,24 @@ def test_types_groupby_methods() -> None:
check(assert_type(s.groupby(level=0).unique(), pd.Series), pd.Series)


def test_types_groupby_transform() -> None:
    """Verify that SeriesGroupBy.transform propagates the callable's
    declared return type (Series[int] in, Series[float] out), including
    extra positional and keyword arguments forwarded via ParamSpec."""
    s: pd.Series[int] = pd.Series([4, 2, 1, 8], index=["a", "b", "a", "b"])

    def halve_when_flagged(
        x: pd.Series[int], pos_arg: bool, kw_arg: str
    ) -> pd.Series[float]:
        # Same arithmetic as before: divide by 2.0 when the positional
        # flag is set, otherwise divide by 1.0 (identity).
        divisor = 2.0 if pos_arg else 1.0
        return x / divisor

    result = s.groupby(lambda x: x).transform(halve_when_flagged, True, kw_arg="foo")
    check(
        assert_type(result, "pd.Series[float]"),
        pd.Series,
        float,
    )


def test_types_groupby_agg() -> None:
s = pd.Series([4, 2, 1, 8], index=["a", "b", "a", "b"])
check(assert_type(s.groupby(level=0).agg("sum"), pd.Series), pd.Series)
Expand Down Expand Up @@ -641,9 +659,9 @@ def test_types_aggregate() -> None:


def test_types_transform() -> None:
s = pd.Series([1, 2, 3], index=["col1", "col2", "col3"])
check(assert_type(s.transform("abs"), pd.Series), pd.Series)
check(assert_type(s.transform(abs), pd.Series), pd.Series)
s: pd.Series[int] = pd.Series([1, 2, 3], index=["col1", "col2", "col3"])
check(assert_type(s.transform("abs"), "pd.Series[int]"), pd.Series, int)
check(assert_type(s.transform(abs), "pd.Series[int]"), pd.Series, int)
check(assert_type(s.transform(["abs", "sqrt"]), pd.DataFrame), pd.DataFrame)
check(assert_type(s.transform([abs, np.sqrt]), pd.DataFrame), pd.DataFrame)
check(
Expand Down