Skip to content

BUG: Fix metadata propagation in reductions #53542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 1, 2023
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -770,8 +770,10 @@ Styler

Metadata
^^^^^^^^
- Fixed metadata propagation in :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`Series.mode`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` (:issue:`28283`)
- Fixed metadata propagation in :meth:`DataFrame.squeeze` and :meth:`DataFrame.describe` (:issue:`28283`)
- Fixed metadata propagation in :meth:`DataFrame.std` (:issue:`28283`)
-

Other
^^^^^
Expand Down
67 changes: 48 additions & 19 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11137,12 +11137,13 @@ def any( # type: ignore[override]
bool_only: bool = False,
skipna: bool = True,
**kwargs,
) -> Series:
# error: Incompatible return value type (got "Union[Series, bool]",
# expected "Series")
return self._logical_func( # type: ignore[return-value]
) -> Series | bool:
result = self._logical_func(
"any", nanops.nanany, axis, bool_only, skipna, **kwargs
)
if isinstance(result, Series):
result = result.__finalize__(self, method="any")
Comment on lines +11144 to +11145
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the result only not a Series when axis=None? Or are there other cases where a DataFrame can be reduced to a scalar?

I'd slightly prefer `if axis is not None` to this `isinstance` check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, but mypy won't be able to figure out that `result` is a Series without an `isinstance` check or a `cast`.

return result

@doc(make_doc("all", ndim=2))
def all(
Expand All @@ -11151,12 +11152,13 @@ def all(
bool_only: bool = False,
skipna: bool = True,
**kwargs,
) -> Series:
# error: Incompatible return value type (got "Union[Series, bool]",
# expected "Series")
return self._logical_func( # type: ignore[return-value]
) -> Series | bool:
result = self._logical_func(
"all", nanops.nanall, axis, bool_only, skipna, **kwargs
)
if isinstance(result, Series):
result = result.__finalize__(self, method="all")
return result

@doc(make_doc("min", ndim=2))
def min(
Expand All @@ -11166,7 +11168,10 @@ def min(
numeric_only: bool = False,
**kwargs,
):
return super().min(axis, skipna, numeric_only, **kwargs)
result = super().min(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="min")
return result

@doc(make_doc("max", ndim=2))
def max(
Expand All @@ -11176,7 +11181,10 @@ def max(
numeric_only: bool = False,
**kwargs,
):
return super().max(axis, skipna, numeric_only, **kwargs)
result = super().max(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="max")
return result

@doc(make_doc("sum", ndim=2))
def sum(
Expand All @@ -11199,7 +11207,8 @@ def prod(
min_count: int = 0,
**kwargs,
):
return super().prod(axis, skipna, numeric_only, min_count, **kwargs)
result = super().prod(axis, skipna, numeric_only, min_count, **kwargs)
return result.__finalize__(self, method="prod")

@doc(make_doc("mean", ndim=2))
def mean(
Expand All @@ -11209,7 +11218,10 @@ def mean(
numeric_only: bool = False,
**kwargs,
):
return super().mean(axis, skipna, numeric_only, **kwargs)
result = super().mean(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="mean")
return result

@doc(make_doc("median", ndim=2))
def median(
Expand All @@ -11219,7 +11231,10 @@ def median(
numeric_only: bool = False,
**kwargs,
):
return super().median(axis, skipna, numeric_only, **kwargs)
result = super().median(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="median")
return result

@doc(make_doc("sem", ndim=2))
def sem(
Expand All @@ -11230,7 +11245,10 @@ def sem(
numeric_only: bool = False,
**kwargs,
):
return super().sem(axis, skipna, ddof, numeric_only, **kwargs)
result = super().sem(axis, skipna, ddof, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="sem")
return result

@doc(make_doc("var", ndim=2))
def var(
Expand All @@ -11241,7 +11259,10 @@ def var(
numeric_only: bool = False,
**kwargs,
):
return super().var(axis, skipna, ddof, numeric_only, **kwargs)
result = super().var(axis, skipna, ddof, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="var")
return result

@doc(make_doc("std", ndim=2))
def std(
Expand All @@ -11252,8 +11273,10 @@ def std(
numeric_only: bool = False,
**kwargs,
):
result = cast(Series, super().std(axis, skipna, ddof, numeric_only, **kwargs))
return result.__finalize__(self, method="std")
result = super().std(axis, skipna, ddof, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="std")
return result

@doc(make_doc("skew", ndim=2))
def skew(
Expand All @@ -11263,7 +11286,10 @@ def skew(
numeric_only: bool = False,
**kwargs,
):
return super().skew(axis, skipna, numeric_only, **kwargs)
result = super().skew(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="skew")
return result

@doc(make_doc("kurt", ndim=2))
def kurt(
Expand All @@ -11273,7 +11299,10 @@ def kurt(
numeric_only: bool = False,
**kwargs,
):
return super().kurt(axis, skipna, numeric_only, **kwargs)
result = super().kurt(axis, skipna, numeric_only, **kwargs)
if isinstance(result, Series):
result = result.__finalize__(self, method="kurt")
return result

kurtosis = kurt
product = prod
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import (
Hashable,
Iterable,
cast,
)
import itertools
from typing import TYPE_CHECKING
Expand Down Expand Up @@ -455,10 +456,12 @@ def from_dummies(
f"Received 'data' of type: {type(data).__name__}"
)

if data.isna().any().any():
col_isna_mask = cast(Series, data.isna().any())

if col_isna_mask.any():
raise ValueError(
"Dummy DataFrame contains NA value in column: "
f"'{data.isna().any().idxmax()}'"
f"'{col_isna_mask.idxmax()}'"
)

# index data with a list of all columns that are dummies
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,7 +2202,7 @@ def mode(self, dropna: bool = True) -> Series:
# Ensure index is type stable (should always use int index)
return self._constructor(
res_values, index=range(len(res_values)), name=self.name, copy=False
)
).__finalize__(self, method="mode")

def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation
"""
Expand Down
29 changes: 17 additions & 12 deletions pandas/tests/generic/test_finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,8 @@
(pd.DataFrame, frame_data, operator.methodcaller("idxmin")),
(pd.DataFrame, frame_data, operator.methodcaller("idxmax")),
(pd.DataFrame, frame_data, operator.methodcaller("mode")),
pytest.param(
(pd.Series, [0], operator.methodcaller("mode")),
marks=not_implemented_mark,
),
(pd.Series, [0], operator.methodcaller("mode")),
(pd.DataFrame, frame_data, operator.methodcaller("median")),
(
pd.DataFrame,
frame_data,
Expand Down Expand Up @@ -363,17 +361,24 @@
# Cumulative reductions
(pd.Series, ([1],), operator.methodcaller("cumsum")),
(pd.DataFrame, frame_data, operator.methodcaller("cumsum")),
(pd.Series, ([1],), operator.methodcaller("cummin")),
(pd.DataFrame, frame_data, operator.methodcaller("cummin")),
(pd.Series, ([1],), operator.methodcaller("cummax")),
(pd.DataFrame, frame_data, operator.methodcaller("cummax")),
(pd.Series, ([1],), operator.methodcaller("cumprod")),
(pd.DataFrame, frame_data, operator.methodcaller("cumprod")),
# Reductions
pytest.param(
(pd.DataFrame, frame_data, operator.methodcaller("any")),
marks=not_implemented_mark,
),
(pd.DataFrame, frame_data, operator.methodcaller("any")),
(pd.DataFrame, frame_data, operator.methodcaller("all")),
(pd.DataFrame, frame_data, operator.methodcaller("min")),
(pd.DataFrame, frame_data, operator.methodcaller("max")),
(pd.DataFrame, frame_data, operator.methodcaller("sum")),
(pd.DataFrame, frame_data, operator.methodcaller("std")),
pytest.param(
(pd.DataFrame, frame_data, operator.methodcaller("mean")),
marks=not_implemented_mark,
),
(pd.DataFrame, frame_data, operator.methodcaller("mean")),
(pd.DataFrame, frame_data, operator.methodcaller("prod")),
(pd.DataFrame, frame_data, operator.methodcaller("sem")),
(pd.DataFrame, frame_data, operator.methodcaller("skew")),
(pd.DataFrame, frame_data, operator.methodcaller("kurt")),
]


Expand Down