Skip to content

Commit ba85a64

Browse files
authored
BUG: DataFrame.all(bool_only=True) inconsistency with object dtype (#37799)
1 parent 5bf688a commit ba85a64

File tree

5 files changed

+130
-3
lines changed

5 files changed

+130
-3
lines changed

doc/source/whatsnew/v1.2.0.rst

+48
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,54 @@ Other enhancements
236236
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
237237
- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
238238

239+
.. ---------------------------------------------------------------------------
240+
241+
.. _whatsnew_120.notable_bug_fixes:
242+
243+
Notable bug fixes
244+
~~~~~~~~~~~~~~~~~
245+
246+
These are bug fixes that might have notable behavior changes.
247+
248+
Consistency of DataFrame Reductions
249+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
250+
:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now
251+
determine whether to exclude object-dtype columns on a column-by-column basis,
252+
instead of checking if *all* object-dtype columns can be considered boolean.
253+
254+
This prevents pathological behavior where applying the reduction on a subset
255+
of columns could produce a larger :class:`Series` (:issue:`37799`).
256+
257+
.. ipython:: python
258+
259+
df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object)
260+
df["C"] = pd.Series([True, True])
261+
262+
263+
*Previous behavior*:
264+
265+
.. code-block:: ipython
266+
267+
In [5]: df.all(bool_only=True)
268+
Out[5]:
269+
C True
270+
dtype: bool
271+
272+
In [6]: df[["B", "C"]].all(bool_only=True)
273+
Out[6]:
274+
B False
275+
C True
276+
dtype: bool
277+
278+
*New behavior*:
279+
280+
.. ipython:: python
281+
282+
In [5]: df.all(bool_only=True)
283+
284+
In [6]: df[["B", "C"]].all(bool_only=True)
285+
286+
239287
.. _whatsnew_120.api_breaking.python:
240288

241289
Increased minimum version for Python

pandas/core/internals/blocks.py

+14
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,20 @@ def f(mask, val, idx):
450450

451451
return self.split_and_operate(None, f, inplace)
452452

453+
def _split(self) -> List["Block"]:
454+
"""
455+
Split a block into a list of single-column blocks.
456+
"""
457+
assert self.ndim == 2
458+
459+
new_blocks = []
460+
for i, ref_loc in enumerate(self.mgr_locs):
461+
vals = self.values[slice(i, i + 1)]
462+
463+
nb = self.make_block(vals, [ref_loc])
464+
new_blocks.append(nb)
465+
return new_blocks
466+
453467
def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
454468
"""
455469
split the block per-column, and apply the callable f

pandas/core/internals/managers.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -713,13 +713,28 @@ def is_view(self) -> bool:
713713

714714
def get_bool_data(self, copy: bool = False) -> "BlockManager":
715715
"""
716+
Select blocks that are bool-dtype and columns from object-dtype blocks
717+
that are all-bool.
718+
716719
Parameters
717720
----------
718721
copy : bool, default False
719722
Whether to copy the blocks
720723
"""
721-
self._consolidate_inplace()
722-
return self._combine([b for b in self.blocks if b.is_bool], copy)
724+
725+
new_blocks = []
726+
727+
for blk in self.blocks:
728+
if blk.dtype == bool:
729+
new_blocks.append(blk)
730+
731+
elif blk.is_object:
732+
nbs = blk._split()
733+
for nb in nbs:
734+
if nb.is_bool:
735+
new_blocks.append(nb)
736+
737+
return self._combine(new_blocks, copy)
723738

724739
def get_numeric_data(self, copy: bool = False) -> "BlockManager":
725740
"""

pandas/tests/frame/test_reductions.py

+31
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,37 @@ def test_any_all_object(self):
11181118
result = np.any(DataFrame(columns=["a", "b"])).item()
11191119
assert result is False
11201120

1121+
def test_any_all_object_bool_only(self):
1122+
df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
1123+
df._consolidate_inplace()
1124+
df["C"] = Series([True, True])
1125+
1126+
# The underlying bug is in DataFrame._get_bool_data, so we check
1127+
# that while we're here
1128+
res = df._get_bool_data()
1129+
expected = df[["B", "C"]]
1130+
tm.assert_frame_equal(res, expected)
1131+
1132+
res = df.all(bool_only=True, axis=0)
1133+
expected = Series([False, True], index=["B", "C"])
1134+
tm.assert_series_equal(res, expected)
1135+
1136+
# operating on a subset of columns should not produce a _larger_ Series
1137+
res = df[["B", "C"]].all(bool_only=True, axis=0)
1138+
tm.assert_series_equal(res, expected)
1139+
1140+
assert not df.all(bool_only=True, axis=None)
1141+
1142+
res = df.any(bool_only=True, axis=0)
1143+
expected = Series([True, True], index=["B", "C"])
1144+
tm.assert_series_equal(res, expected)
1145+
1146+
# operating on a subset of columns should not produce a _larger_ Series
1147+
res = df[["B", "C"]].any(bool_only=True, axis=0)
1148+
tm.assert_series_equal(res, expected)
1149+
1150+
assert df.any(bool_only=True, axis=None)
1151+
11211152
@pytest.mark.parametrize("method", ["any", "all"])
11221153
def test_any_all_level_axis_none_raises(self, method):
11231154
df = DataFrame(

pandas/tests/internals/test_internals.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,25 @@ def test_delete(self):
264264
with pytest.raises(IndexError, match=None):
265265
newb.delete(3)
266266

267+
def test_split(self):
268+
# GH#37799
269+
values = np.random.randn(3, 4)
270+
blk = make_block(values, placement=[3, 1, 6])
271+
result = blk._split()
272+
273+
# check that we get views, not copies
274+
values[:] = -9999
275+
assert (blk.values == -9999).all()
276+
277+
assert len(result) == 3
278+
expected = [
279+
make_block(values[[0]], placement=[3]),
280+
make_block(values[[1]], placement=[1]),
281+
make_block(values[[2]], placement=[6]),
282+
]
283+
for res, exp in zip(result, expected):
284+
assert_block_equal(res, exp)
285+
267286

268287
class TestBlockManager:
269288
def test_attrs(self):
@@ -667,7 +686,7 @@ def test_get_bool_data(self):
667686
mgr.iset(6, np.array([True, False, True], dtype=np.object_))
668687

669688
bools = mgr.get_bool_data()
670-
tm.assert_index_equal(bools.items, Index(["bool"]))
689+
tm.assert_index_equal(bools.items, Index(["bool", "dt"]))
671690
tm.assert_almost_equal(
672691
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
673692
bools.iget(bools.items.get_loc("bool")).internal_values(),

0 commit comments

Comments
 (0)