Skip to content

Commit 50ae0bf

Browse files
authored
BUG: DataFrame reductions inconsistent with Series counterparts (#37827)
1 parent 840c142 commit 50ae0bf

File tree

4 files changed

+195
-36
lines changed

4 files changed

+195
-36
lines changed

doc/source/whatsnew/v1.2.0.rst

+57
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`)
284284
In [6]: df[["B", "C"]].all(bool_only=True)
285285
286286
287+
Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid
288+
this pathological behavior (:issue:`37827`):
289+
290+
.. ipython:: python
291+
292+
df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
293+
294+
295+
*Previous behavior*:
296+
297+
.. code-block:: ipython
298+
299+
In [3]: df.mean()
300+
Out[3]: Series([], dtype: float64)
301+
302+
In [4]: df[["A"]].mean()
303+
Out[4]:
304+
A 1.0
305+
dtype: float64
306+
307+
*New behavior*:
308+
309+
.. ipython:: python
310+
311+
df.mean()
312+
313+
df[["A"]].mean()
314+
315+
Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be
316+
consistent with their :class:`Series` counterparts. In particular, for
317+
reductions where the :class:`Series` method raises ``TypeError``, the
318+
:class:`DataFrame` reduction will now consider that column non-numeric
319+
instead of casting to NumPy, which may have different semantics (:issue:`36076`,
320+
:issue:`28949`, :issue:`21020`).
321+
322+
.. ipython:: python
323+
324+
ser = pd.Series([0, 1], dtype="category", name="A")
325+
df = ser.to_frame()
326+
327+
328+
*Previous behavior*:
329+
330+
.. code-block:: ipython
331+
332+
In [5]: df.any()
333+
Out[5]:
334+
A True
335+
dtype: bool
336+
337+
*New behavior*:
338+
339+
.. ipython:: python
340+
341+
df.any()
342+
343+
287344
.. _whatsnew_120.api_breaking.python:
288345

289346
Increased minimum version for Python

pandas/core/frame.py

+5-27
Original file line numberDiff line numberDiff line change
@@ -8765,7 +8765,7 @@ def _get_data() -> DataFrame:
87658765
data = self._get_bool_data()
87668766
return data
87678767

8768-
if numeric_only is not None:
8768+
if numeric_only is not None or axis == 0:
87698769
# For numeric_only non-None and axis non-None, we know
87708770
# which blocks to use and no try/except is needed.
87718771
# For numeric_only=None only the case with axis==0 and no object
@@ -8790,36 +8790,14 @@ def _get_data() -> DataFrame:
87908790
# GH#35865 careful to cast explicitly to object
87918791
nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)])
87928792
out[:] = np.array(nvs, dtype=object)
8793+
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
8794+
# Even if we are object dtype, follow numpy and return
8795+
# float64, see test_apply_funcs_over_empty
8796+
out = out.astype(np.float64)
87938797
return out
87948798

87958799
assert numeric_only is None
87968800

8797-
if not self._is_homogeneous_type or self._mgr.any_extension_types:
8798-
# try to avoid self.values call
8799-
8800-
if filter_type is None and axis == 0:
8801-
# operate column-wise
8802-
8803-
# numeric_only must be None here, as other cases caught above
8804-
8805-
# this can end up with a non-reduction
8806-
# but not always. if the types are mixed
8807-
# with datelike then need to make sure a series
8808-
8809-
# we only end up here if we have not specified
8810-
# numeric_only and yet we have tried a
8811-
# column-by-column reduction, where we have mixed type.
8812-
# So let's just do what we can
8813-
from pandas.core.apply import frame_apply
8814-
8815-
opa = frame_apply(
8816-
self, func=func, result_type="expand", ignore_failures=True
8817-
)
8818-
result = opa.get_result()
8819-
if result.ndim == self.ndim:
8820-
result = result.iloc[0].rename(None)
8821-
return result
8822-
88238801
data = self
88248802
values = data.values
88258803

pandas/core/internals/blocks.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,9 @@ def _split(self) -> List["Block"]:
464464
new_blocks.append(nb)
465465
return new_blocks
466466

467-
def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
467+
def split_and_operate(
468+
self, mask, f, inplace: bool, ignore_failures: bool = False
469+
) -> List["Block"]:
468470
"""
469471
split the block per-column, and apply the callable f
470472
per-column, return a new block for each. Handle
@@ -474,7 +476,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
474476
----------
475477
mask : 2-d boolean mask
476478
f : callable accepting (1d-mask, 1d values, indexer)
477-
inplace : boolean
479+
inplace : bool
480+
ignore_failures : bool, default False
478481
479482
Returns
480483
-------
@@ -513,8 +516,16 @@ def make_a_block(nv, ref_loc):
513516
v = new_values[i]
514517

515518
# need a new block
516-
if m.any():
517-
nv = f(m, v, i)
519+
if m.any() or m.size == 0:
520+
# Apply our function; we may ignore_failures if this is a
521+
# reduction that is dropping nuisance columns GH#37827
522+
try:
523+
nv = f(m, v, i)
524+
except TypeError:
525+
if ignore_failures:
526+
continue
527+
else:
528+
raise
518529
else:
519530
nv = v if inplace else v.copy()
520531

@@ -2459,7 +2470,9 @@ def mask_func(mask, values, inplace):
24592470
values = values.reshape(1, -1)
24602471
return func(values)
24612472

2462-
return self.split_and_operate(None, mask_func, False)
2473+
return self.split_and_operate(
2474+
None, mask_func, False, ignore_failures=ignore_failures
2475+
)
24632476

24642477
try:
24652478
res = func(values)

pandas/tests/frame/test_reductions.py

+115-4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pandas import (
1313
Categorical,
1414
DataFrame,
15+
Index,
1516
MultiIndex,
1617
Series,
1718
Timestamp,
@@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self):
10831084
pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
10841085
pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
10851086
pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
1086-
(np.all, {"A": Series([0, 1], dtype="category")}, False),
1087-
(np.any, {"A": Series([0, 1], dtype="category")}, True),
1087+
# np.all on a Categorical raises, so the reduction drops the
1088+
# column and operates on an empty Series: all is True, any is False
1089+
(np.all, {"A": Series([0, 1], dtype="category")}, True),
1090+
(np.any, {"A": Series([0, 1], dtype="category")}, False),
10881091
(np.all, {"A": Series([1, 2], dtype="category")}, True),
1089-
(np.any, {"A": Series([1, 2], dtype="category")}, True),
1092+
(np.any, {"A": Series([1, 2], dtype="category")}, False),
10901093
# Mix GH#21484
10911094
pytest.param(
10921095
np.all,
@@ -1308,6 +1311,114 @@ def test_frame_any_with_timedelta(self):
13081311
tm.assert_series_equal(result, expected)
13091312

13101313

1314+
class TestNuisanceColumns:
1315+
@pytest.mark.parametrize("method", ["any", "all"])
1316+
def test_any_all_categorical_dtype_nuisance_column(self, method):
1317+
# GH#36076 DataFrame should match Series behavior
1318+
ser = Series([0, 1], dtype="category", name="A")
1319+
df = ser.to_frame()
1320+
1321+
# Double-check the Series behavior is to raise
1322+
with pytest.raises(TypeError, match="does not implement reduction"):
1323+
getattr(ser, method)()
1324+
1325+
with pytest.raises(TypeError, match="does not implement reduction"):
1326+
getattr(np, method)(ser)
1327+
1328+
with pytest.raises(TypeError, match="does not implement reduction"):
1329+
getattr(df, method)(bool_only=False)
1330+
1331+
# With bool_only=None, operating on this column raises and is ignored,
1332+
# so we expect an empty result.
1333+
result = getattr(df, method)(bool_only=None)
1334+
expected = Series([], index=Index([]), dtype=bool)
1335+
tm.assert_series_equal(result, expected)
1336+
1337+
result = getattr(np, method)(df, axis=0)
1338+
tm.assert_series_equal(result, expected)
1339+
1340+
def test_median_categorical_dtype_nuisance_column(self):
1341+
# GH#21020 DataFrame.median should match Series.median
1342+
df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
1343+
ser = df["A"]
1344+
1345+
# Double-check the Series behavior is to raise
1346+
with pytest.raises(TypeError, match="does not implement reduction"):
1347+
ser.median()
1348+
1349+
with pytest.raises(TypeError, match="does not implement reduction"):
1350+
df.median(numeric_only=False)
1351+
1352+
result = df.median()
1353+
expected = Series([], index=Index([]), dtype=np.float64)
1354+
tm.assert_series_equal(result, expected)
1355+
1356+
# same thing, but with an additional non-categorical column
1357+
df["B"] = df["A"].astype(int)
1358+
1359+
with pytest.raises(TypeError, match="does not implement reduction"):
1360+
df.median(numeric_only=False)
1361+
1362+
result = df.median()
1363+
expected = Series([2.0], index=["B"])
1364+
tm.assert_series_equal(result, expected)
1365+
1366+
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
1367+
# of expected.values
1368+
1369+
@pytest.mark.parametrize("method", ["min", "max"])
1370+
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
1371+
# GH#28949 DataFrame.min should behave like Series.min
1372+
cat = Categorical(["a", "b", "c", "b"], ordered=False)
1373+
ser = Series(cat)
1374+
df = ser.to_frame("A")
1375+
1376+
# Double-check the Series behavior
1377+
with pytest.raises(TypeError, match="is not ordered for operation"):
1378+
getattr(ser, method)()
1379+
1380+
with pytest.raises(TypeError, match="is not ordered for operation"):
1381+
getattr(np, method)(ser)
1382+
1383+
with pytest.raises(TypeError, match="is not ordered for operation"):
1384+
getattr(df, method)(numeric_only=False)
1385+
1386+
result = getattr(df, method)()
1387+
expected = Series([], index=Index([]), dtype=np.float64)
1388+
tm.assert_series_equal(result, expected)
1389+
1390+
result = getattr(np, method)(df)
1391+
tm.assert_series_equal(result, expected)
1392+
1393+
# same thing, but with an additional non-categorical column
1394+
df["B"] = df["A"].astype(object)
1395+
result = getattr(df, method)()
1396+
if method == "min":
1397+
expected = Series(["a"], index=["B"])
1398+
else:
1399+
expected = Series(["c"], index=["B"])
1400+
tm.assert_series_equal(result, expected)
1401+
1402+
result = getattr(np, method)(df)
1403+
tm.assert_series_equal(result, expected)
1404+
1405+
def test_reduction_object_block_splits_nuisance_columns(self):
1406+
# GH#37827
1407+
df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
1408+
1409+
# We should only exclude "B", not "A"
1410+
result = df.mean()
1411+
expected = Series([1.0], index=["A"])
1412+
tm.assert_series_equal(result, expected)
1413+
1414+
# Same behavior but heterogeneous dtype
1415+
df["C"] = df["A"].astype(int) + 4
1416+
1417+
result = df.mean()
1418+
expected = Series([1.0, 5.0], index=["A", "C"])
1419+
tm.assert_series_equal(result, expected)
1420+
1421+
13111422
def test_sum_timedelta64_skipna_false():
13121423
# GH#17235
13131424
arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
@@ -1352,6 +1463,6 @@ def test_minmax_extensionarray(method, numeric_only):
13521463
df = DataFrame({"Int64": ser})
13531464
result = getattr(df, method)(numeric_only=numeric_only)
13541465
expected = Series(
1355-
[getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
1466+
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
13561467
)
13571468
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)