Skip to content

Commit 47c9ee7

Browse files
authored
DEPR: Enforce certain DataFrame reductions w/ axis=None to return scalars (#50593)
1 parent 4520f84 commit 47c9ee7

File tree

7 files changed

+54
-80
lines changed

7 files changed

+54
-80
lines changed

Diff for: doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,7 @@ Removal of prior version deprecations/changes
715715
- Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`)
716716
- Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`)
717717
- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`)
718+
- Changed behavior of :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` with ``axis=None`` to return a scalar applying the aggregation across both axes (:issue:`45072`)
718719
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
719720
- Changed behavior of comparison of ``NaT`` with a ``datetime.date`` object; these now raise on inequality comparisons (:issue:`39196`)
720721
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :meth:`Series.transform` and :meth:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)

Diff for: pandas/core/frame.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -10357,9 +10357,8 @@ def _reduce(
1035710357
assert filter_type is None or filter_type == "bool", filter_type
1035810358
out_dtype = "bool" if filter_type == "bool" else None
1035910359

10360-
# TODO: Make other agg func handle axis=None properly GH#21597
10361-
axis = self._get_axis_number(axis)
10362-
assert axis in [0, 1]
10360+
if axis is not None:
10361+
axis = self._get_axis_number(axis)
1036310362

1036410363
def func(values: np.ndarray):
1036510364
# We only use this in the case that operates on self.values
@@ -10410,7 +10409,7 @@ def _get_data() -> DataFrame:
1041010409

1041110410
return out
1041210411

10413-
assert not numeric_only and axis == 1
10412+
assert not numeric_only and axis in (1, None)
1041410413

1041510414
data = self
1041610415
values = data.values
@@ -10426,6 +10425,9 @@ def _get_data() -> DataFrame:
1042610425
# try to coerce to the original dtypes item by item if we can
1042710426
pass
1042810427

10428+
if axis is None:
10429+
return result
10430+
1042910431
labels = self._get_agg_axis(axis)
1043010432
result = self._constructor_sliced(result, index=labels)
1043110433
return result

Diff for: pandas/core/generic.py

+20-31
Original file line numberDiff line numberDiff line change
@@ -10944,7 +10944,7 @@ def _stat_function(
1094410944
self,
1094510945
name: str,
1094610946
func,
10947-
axis: Axis | None | lib.NoDefault = None,
10947+
axis: Axis | None = 0,
1094810948
skipna: bool_t = True,
1094910949
numeric_only: bool_t = False,
1095010950
**kwargs,
@@ -10956,30 +10956,13 @@ def _stat_function(
1095610956

1095710957
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
1095810958

10959-
if axis is None and self.ndim > 1:
10960-
# user must have explicitly passed axis=None
10961-
# GH#21597
10962-
warnings.warn(
10963-
f"In a future version, DataFrame.{name}(axis=None) will return a "
10964-
f"scalar {name} over the entire DataFrame. To retain the old "
10965-
f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'",
10966-
FutureWarning,
10967-
stacklevel=find_stack_level(),
10968-
)
10969-
10970-
if axis is lib.no_default:
10971-
axis = None
10972-
10973-
if axis is None:
10974-
axis = self._stat_axis_number
10975-
1097610959
return self._reduce(
1097710960
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
1097810961
)
1097910962

1098010963
def min(
1098110964
self,
10982-
axis: Axis | None | lib.NoDefault = lib.no_default,
10965+
axis: Axis | None = 0,
1098310966
skipna: bool_t = True,
1098410967
numeric_only: bool_t = False,
1098510968
**kwargs,
@@ -10995,7 +10978,7 @@ def min(
1099510978

1099610979
def max(
1099710980
self,
10998-
axis: Axis | None | lib.NoDefault = lib.no_default,
10981+
axis: Axis | None = 0,
1099910982
skipna: bool_t = True,
1100010983
numeric_only: bool_t = False,
1100110984
**kwargs,
@@ -11011,7 +10994,7 @@ def max(
1101110994

1101210995
def mean(
1101310996
self,
11014-
axis: Axis | None | lib.NoDefault = lib.no_default,
10997+
axis: Axis | None = 0,
1101510998
skipna: bool_t = True,
1101610999
numeric_only: bool_t = False,
1101711000
**kwargs,
@@ -11022,7 +11005,7 @@ def mean(
1102211005

1102311006
def median(
1102411007
self,
11025-
axis: Axis | None | lib.NoDefault = lib.no_default,
11008+
axis: Axis | None = 0,
1102611009
skipna: bool_t = True,
1102711010
numeric_only: bool_t = False,
1102811011
**kwargs,
@@ -11033,7 +11016,7 @@ def median(
1103311016

1103411017
def skew(
1103511018
self,
11036-
axis: Axis | None | lib.NoDefault = lib.no_default,
11019+
axis: Axis | None = 0,
1103711020
skipna: bool_t = True,
1103811021
numeric_only: bool_t = False,
1103911022
**kwargs,
@@ -11044,7 +11027,7 @@ def skew(
1104411027

1104511028
def kurt(
1104611029
self,
11047-
axis: Axis | None | lib.NoDefault = lib.no_default,
11030+
axis: Axis | None = 0,
1104811031
skipna: bool_t = True,
1104911032
numeric_only: bool_t = False,
1105011033
**kwargs,
@@ -11366,7 +11349,7 @@ def prod(
1136611349
)
1136711350
def mean(
1136811351
self,
11369-
axis: AxisInt | None | lib.NoDefault = lib.no_default,
11352+
axis: AxisInt | None = 0,
1137011353
skipna: bool_t = True,
1137111354
numeric_only: bool_t = False,
1137211355
**kwargs,
@@ -11387,7 +11370,7 @@ def mean(
1138711370
)
1138811371
def skew(
1138911372
self,
11390-
axis: AxisInt | None | lib.NoDefault = lib.no_default,
11373+
axis: AxisInt | None = 0,
1139111374
skipna: bool_t = True,
1139211375
numeric_only: bool_t = False,
1139311376
**kwargs,
@@ -11411,7 +11394,7 @@ def skew(
1141111394
)
1141211395
def kurt(
1141311396
self,
11414-
axis: Axis | None | lib.NoDefault = lib.no_default,
11397+
axis: Axis | None = 0,
1141511398
skipna: bool_t = True,
1141611399
numeric_only: bool_t = False,
1141711400
**kwargs,
@@ -11433,7 +11416,7 @@ def kurt(
1143311416
)
1143411417
def median(
1143511418
self,
11436-
axis: AxisInt | None | lib.NoDefault = lib.no_default,
11419+
axis: AxisInt | None = 0,
1143711420
skipna: bool_t = True,
1143811421
numeric_only: bool_t = False,
1143911422
**kwargs,
@@ -11456,7 +11439,7 @@ def median(
1145611439
)
1145711440
def max(
1145811441
self,
11459-
axis: AxisInt | None | lib.NoDefault = lib.no_default,
11442+
axis: AxisInt | None = 0,
1146011443
skipna: bool_t = True,
1146111444
numeric_only: bool_t = False,
1146211445
**kwargs,
@@ -11479,7 +11462,7 @@ def max(
1147911462
)
1148011463
def min(
1148111464
self,
11482-
axis: AxisInt | None | lib.NoDefault = lib.no_default,
11465+
axis: AxisInt | None = 0,
1148311466
skipna: bool_t = True,
1148411467
numeric_only: bool_t = False,
1148511468
**kwargs,
@@ -11708,6 +11691,12 @@ def _doc_params(cls):
1170811691
axis : {axis_descr}
1170911692
Axis for the function to be applied on.
1171011693
For `Series` this parameter is unused and defaults to 0.
11694+
11695+
For DataFrames, specifying ``axis=None`` will apply the aggregation
11696+
across both axes.
11697+
11698+
.. versionadded:: 2.0.0
11699+
1171111700
skipna : bool, default True
1171211701
Exclude NA/null values when computing the result.
1171311702
numeric_only : bool, default False
@@ -11719,7 +11708,7 @@ def _doc_params(cls):
1171911708
1172011709
Returns
1172111710
-------
11722-
{name1} or {name2} (if level specified)\
11711+
{name1} or scalar\
1172311712
{see_also}\
1172411713
{examples}
1172511714
"""

Diff for: pandas/tests/frame/test_reductions.py

+16-15
Original file line numberDiff line numberDiff line change
@@ -1488,7 +1488,6 @@ def test_median_categorical_dtype_nuisance_column(self):
14881488
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
14891489
# of expected.values
14901490

1491-
@pytest.mark.filterwarnings("ignore:.*will return a scalar.*:FutureWarning")
14921491
@pytest.mark.parametrize("method", ["min", "max"])
14931492
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
14941493
# GH#28949 DataFrame.min should behave like Series.min
@@ -1510,15 +1509,15 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
15101509
getattr(df, method)()
15111510

15121511
with pytest.raises(TypeError, match="is not ordered for operation"):
1513-
getattr(np, method)(df)
1512+
getattr(np, method)(df, axis=0)
15141513

15151514
# same thing, but with an additional non-categorical column
15161515
df["B"] = df["A"].astype(object)
15171516
with pytest.raises(TypeError, match="is not ordered for operation"):
15181517
getattr(df, method)()
15191518

15201519
with pytest.raises(TypeError, match="is not ordered for operation"):
1521-
getattr(np, method)(df)
1520+
getattr(np, method)(df, axis=0)
15221521

15231522

15241523
def test_sum_timedelta64_skipna_false(using_array_manager, request):
@@ -1600,20 +1599,22 @@ def test_prod_sum_min_count_mixed_object():
16001599

16011600

16021601
@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
1603-
def test_reduction_axis_none_deprecation(method):
1604-
# GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it
1605-
# to reducing over all axes.
1602+
def test_reduction_axis_none_returns_scalar(method):
1603+
# GH#21597 As of 2.0, axis=None reduces over all axes.
16061604

16071605
df = DataFrame(np.random.randn(4, 4))
1608-
meth = getattr(df, method)
1609-
1610-
msg = f"scalar {method} over the entire DataFrame"
1611-
with tm.assert_produces_warning(FutureWarning, match=msg):
1612-
res = meth(axis=None)
1613-
with tm.assert_produces_warning(None):
1614-
expected = meth()
1615-
tm.assert_series_equal(res, expected)
1616-
tm.assert_series_equal(res, meth(axis=0))
1606+
1607+
result = getattr(df, method)(axis=None)
1608+
np_arr = df.to_numpy()
1609+
if method in {"skew", "kurt"}:
1610+
comp_mod = pytest.importorskip("scipy.stats")
1611+
if method == "kurt":
1612+
method = "kurtosis"
1613+
expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
1614+
tm.assert_almost_equal(result, expected)
1615+
else:
1616+
expected = getattr(np, method)(np_arr, axis=None)
1617+
assert result == expected
16171618

16181619

16191620
@pytest.mark.parametrize(

Diff for: pandas/tests/groupby/test_categorical.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,7 @@ def f(x):
147147
tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])
148148

149149
gbc = df.groupby(c, observed=False)
150-
with tm.assert_produces_warning(
151-
FutureWarning, match="scalar max", check_stacklevel=False
152-
):
153-
# stacklevel is thrown off (i think) bc the stack goes through numpy C code
154-
result = gbc.transform(lambda xs: np.max(xs))
150+
result = gbc.transform(lambda xs: np.max(xs, axis=0))
155151
tm.assert_frame_equal(result, df[["a"]])
156152

157153
with tm.assert_produces_warning(None):
@@ -295,7 +291,7 @@ def test_apply(ordered):
295291
idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
296292
expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])
297293

298-
result = grouped.apply(lambda x: np.mean(x))
294+
result = grouped.apply(lambda x: np.mean(x, axis=0))
299295
tm.assert_frame_equal(result, expected)
300296

301297
result = grouped.mean()

Diff for: pandas/tests/groupby/test_function.py

+4-12
Original file line numberDiff line numberDiff line change
@@ -80,28 +80,20 @@ def test_builtins_apply(keys, f):
8080
assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
8181
assert result.shape == (ngroups, 3), assert_msg
8282

83-
npfunc = getattr(np, fname) # numpy's equivalent function
84-
if f in [max, min]:
85-
warn = FutureWarning
86-
else:
87-
warn = None
88-
msg = "scalar (max|min) over the entire DataFrame"
89-
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
90-
# stacklevel can be thrown off because (i think) the stack
91-
# goes through some of numpy's C code.
92-
expected = gb.apply(npfunc)
83+
npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
84+
expected = gb.apply(npfunc)
9385
tm.assert_frame_equal(result, expected)
9486

9587
with tm.assert_produces_warning(None):
96-
expected2 = gb.apply(lambda x: npfunc(x, axis=0))
88+
expected2 = gb.apply(lambda x: npfunc(x))
9789
tm.assert_frame_equal(result, expected2)
9890

9991
if f != sum:
10092
expected = gb.agg(fname).reset_index()
10193
expected.set_index(keys, inplace=True, drop=False)
10294
tm.assert_frame_equal(result, expected, check_dtype=False)
10395

104-
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
96+
tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
10597

10698

10799
class TestNumericOnly:

Diff for: pandas/tests/groupby/transform/test_transform.py

+5-12
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def test_transform():
5757
tm.assert_frame_equal(result, expected)
5858

5959
def demean(arr):
60-
return arr - arr.mean()
60+
return arr - arr.mean(axis=0)
6161

6262
people = DataFrame(
6363
np.random.randn(5, 5),
@@ -144,7 +144,7 @@ def test_transform_broadcast(tsframe, ts):
144144
result = grouped.transform(np.mean)
145145
tm.assert_index_equal(result.index, tsframe.index)
146146
for _, gp in grouped:
147-
agged = gp.mean()
147+
agged = gp.mean(axis=0)
148148
res = result.reindex(gp.index)
149149
for col in tsframe:
150150
assert_fp_equal(res[col], agged[col])
@@ -214,7 +214,7 @@ def test_transform_axis_ts(tsframe):
214214
ts = tso
215215
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
216216
result = ts - grouped.transform("mean")
217-
expected = grouped.apply(lambda x: x - x.mean())
217+
expected = grouped.apply(lambda x: x - x.mean(axis=0))
218218
tm.assert_frame_equal(result, expected)
219219

220220
ts = ts.T
@@ -227,7 +227,7 @@ def test_transform_axis_ts(tsframe):
227227
ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
228228
grouped = ts.groupby(lambda x: x.weekday(), group_keys=False)
229229
result = ts - grouped.transform("mean")
230-
expected = grouped.apply(lambda x: x - x.mean())
230+
expected = grouped.apply(lambda x: x - x.mean(axis=0))
231231
tm.assert_frame_equal(result, expected)
232232

233233
ts = ts.T
@@ -477,16 +477,9 @@ def test_transform_coercion():
477477

478478
expected = g.transform(np.mean)
479479

480-
# in 2.0 np.mean on a DataFrame is equivalent to frame.mean(axis=None)
481-
# which now gives a scalar instead of Series
482-
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
483-
result = g.transform(lambda x: np.mean(x))
480+
result = g.transform(lambda x: np.mean(x, axis=0))
484481
tm.assert_frame_equal(result, expected)
485482

486-
with tm.assert_produces_warning(None):
487-
result2 = g.transform(lambda x: np.mean(x, axis=0))
488-
tm.assert_frame_equal(result2, expected)
489-
490483

491484
def test_groupby_transform_with_int():
492485

0 commit comments

Comments
 (0)