Skip to content

DEPR: Enforce certain DataFrame reductions w/ axis=None to return scalars #50593

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,7 @@ Removal of prior version deprecations/changes
- Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`)
- Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`)
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
- Changed behavior of :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` with ``axis=None`` to return a scalar applying the aggregation across both axes (:issue:`45072`)
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
- Changed behavior of comparison of ``NaT`` with a ``datetime.date`` object; these now raise on inequality comparisons (:issue:`39196`)
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :meth:`Series.transform` and :meth:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10357,9 +10357,8 @@ def _reduce(
assert filter_type is None or filter_type == "bool", filter_type
out_dtype = "bool" if filter_type == "bool" else None

# TODO: Make other agg func handle axis=None properly GH#21597
axis = self._get_axis_number(axis)
assert axis in [0, 1]
if axis is not None:
axis = self._get_axis_number(axis)

def func(values: np.ndarray):
# We only use this in the case that operates on self.values
Expand Down Expand Up @@ -10410,7 +10409,7 @@ def _get_data() -> DataFrame:

return out

assert not numeric_only and axis == 1
assert not numeric_only and axis in (1, None)

data = self
values = data.values
Expand All @@ -10426,6 +10425,9 @@ def _get_data() -> DataFrame:
# try to coerce to the original dtypes item by item if we can
pass

if axis is None:
return result

labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels)
return result
Expand Down
51 changes: 20 additions & 31 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10944,7 +10944,7 @@ def _stat_function(
self,
name: str,
func,
axis: Axis | None | lib.NoDefault = None,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -10956,30 +10956,13 @@ def _stat_function(

validate_bool_kwarg(skipna, "skipna", none_allowed=False)

if axis is None and self.ndim > 1:
# user must have explicitly passed axis=None
# GH#21597
warnings.warn(
f"In a future version, DataFrame.{name}(axis=None) will return a "
f"scalar {name} over the entire DataFrame. To retain the old "
f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'",
FutureWarning,
stacklevel=find_stack_level(),
)

if axis is lib.no_default:
axis = None

if axis is None:
axis = self._stat_axis_number

return self._reduce(
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
)

def min(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -10995,7 +10978,7 @@ def min(

def max(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11011,7 +10994,7 @@ def max(

def mean(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11022,7 +11005,7 @@ def mean(

def median(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11033,7 +11016,7 @@ def median(

def skew(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11044,7 +11027,7 @@ def skew(

def kurt(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand Down Expand Up @@ -11366,7 +11349,7 @@ def prod(
)
def mean(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11387,7 +11370,7 @@ def mean(
)
def skew(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11411,7 +11394,7 @@ def skew(
)
def kurt(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
axis: Axis | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11433,7 +11416,7 @@ def kurt(
)
def median(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11456,7 +11439,7 @@ def median(
)
def max(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand All @@ -11479,7 +11462,7 @@ def max(
)
def min(
self,
axis: AxisInt | None | lib.NoDefault = lib.no_default,
axis: AxisInt | None = 0,
skipna: bool_t = True,
numeric_only: bool_t = False,
**kwargs,
Expand Down Expand Up @@ -11708,6 +11691,12 @@ def _doc_params(cls):
axis : {axis_descr}
Axis for the function to be applied on.
For `Series` this parameter is unused and defaults to 0.

For DataFrames, specifying ``axis=None`` will apply the aggregation
across both axes.

.. versionadded:: 2.0.0

skipna : bool, default True
Exclude NA/null values when computing the result.
numeric_only : bool, default False
Expand All @@ -11719,7 +11708,7 @@ def _doc_params(cls):

Returns
-------
{name1} or {name2} (if level specified)\
{name1} or scalar\
{see_also}\
{examples}
"""
Expand Down
31 changes: 16 additions & 15 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,6 @@ def test_median_categorical_dtype_nuisance_column(self):
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
# of expected.values

@pytest.mark.filterwarnings("ignore:.*will return a scalar.*:FutureWarning")
@pytest.mark.parametrize("method", ["min", "max"])
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
# GH#28949 DataFrame.min should behave like Series.min
Expand All @@ -1510,15 +1509,15 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
getattr(df, method)()

with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(np, method)(df)
getattr(np, method)(df, axis=0)

# same thing, but with an additional non-categorical column
df["B"] = df["A"].astype(object)
with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(df, method)()

with pytest.raises(TypeError, match="is not ordered for operation"):
getattr(np, method)(df)
getattr(np, method)(df, axis=0)


def test_sum_timedelta64_skipna_false(using_array_manager, request):
Expand Down Expand Up @@ -1600,20 +1599,22 @@ def test_prod_sum_min_count_mixed_object():


@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
def test_reduction_axis_none_deprecation(method):
# GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it
# to reducing over all axes.
def test_reduction_axis_none_returns_scalar(method):
# GH#21597 As of 2.0, axis=None reduces over all axes.

df = DataFrame(np.random.randn(4, 4))
meth = getattr(df, method)

msg = f"scalar {method} over the entire DataFrame"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = meth(axis=None)
with tm.assert_produces_warning(None):
expected = meth()
tm.assert_series_equal(res, expected)
tm.assert_series_equal(res, meth(axis=0))

result = getattr(df, method)(axis=None)
np_arr = df.to_numpy()
if method in {"skew", "kurt"}:
comp_mod = pytest.importorskip("scipy.stats")
if method == "kurt":
method = "kurtosis"
expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
tm.assert_almost_equal(result, expected)
else:
expected = getattr(np, method)(np_arr, axis=None)
assert result == expected


@pytest.mark.parametrize(
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,7 @@ def f(x):
tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])

gbc = df.groupby(c, observed=False)
with tm.assert_produces_warning(
FutureWarning, match="scalar max", check_stacklevel=False
):
# stacklevel is thrown off (i think) bc the stack goes through numpy C code
result = gbc.transform(lambda xs: np.max(xs))
result = gbc.transform(lambda xs: np.max(xs, axis=0))
tm.assert_frame_equal(result, df[["a"]])

with tm.assert_produces_warning(None):
Expand Down Expand Up @@ -295,7 +291,7 @@ def test_apply(ordered):
idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])

result = grouped.apply(lambda x: np.mean(x))
result = grouped.apply(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result, expected)

result = grouped.mean()
Expand Down
16 changes: 4 additions & 12 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,28 +80,20 @@ def test_builtins_apply(keys, f):
assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
assert result.shape == (ngroups, 3), assert_msg

npfunc = getattr(np, fname) # numpy's equivalent function
if f in [max, min]:
warn = FutureWarning
else:
warn = None
msg = "scalar (max|min) over the entire DataFrame"
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
# stacklevel can be thrown off because (i think) the stack
# goes through some of numpy's C code.
expected = gb.apply(npfunc)
npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
expected = gb.apply(npfunc)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
expected2 = gb.apply(lambda x: npfunc(x, axis=0))
expected2 = gb.apply(lambda x: npfunc(x))
tm.assert_frame_equal(result, expected2)

if f != sum:
expected = gb.agg(fname).reset_index()
expected.set_index(keys, inplace=True, drop=False)
tm.assert_frame_equal(result, expected, check_dtype=False)

tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))


class TestNumericOnly:
Expand Down
17 changes: 5 additions & 12 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_transform():
tm.assert_frame_equal(result, expected)

def demean(arr):
return arr - arr.mean()
return arr - arr.mean(axis=0)

people = DataFrame(
np.random.randn(5, 5),
Expand Down Expand Up @@ -144,7 +144,7 @@ def test_transform_broadcast(tsframe, ts):
result = grouped.transform(np.mean)
tm.assert_index_equal(result.index, tsframe.index)
for _, gp in grouped:
agged = gp.mean()
agged = gp.mean(axis=0)
res = result.reindex(gp.index)
for col in tsframe:
assert_fp_equal(res[col], agged[col])
Expand Down Expand Up @@ -214,7 +214,7 @@ def test_transform_axis_ts(tsframe):
ts = tso
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
result = ts - grouped.transform("mean")
expected = grouped.apply(lambda x: x - x.mean())
expected = grouped.apply(lambda x: x - x.mean(axis=0))
tm.assert_frame_equal(result, expected)

ts = ts.T
Expand All @@ -227,7 +227,7 @@ def test_transform_axis_ts(tsframe):
ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
result = ts - grouped.transform("mean")
expected = grouped.apply(lambda x: x - x.mean())
expected = grouped.apply(lambda x: x - x.mean(axis=0))
tm.assert_frame_equal(result, expected)

ts = ts.T
Expand Down Expand Up @@ -477,16 +477,9 @@ def test_transform_coercion():

expected = g.transform(np.mean)

# in 2.0 np.mean on a DataFrame is equivalent to frame.mean(axis=None)
# which now gives a scalar instead of Series
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = g.transform(lambda x: np.mean(x))
result = g.transform(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
result2 = g.transform(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result2, expected)


def test_groupby_transform_with_int():

Expand Down