pandas-dev · jreback · Nov 14, 2020 · Nov 12, 2020 · Nov 13, 2020 · Nov 13, 2020
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -236,6 +236,54 @@ Other enhancements
 - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
 - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
 
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_120.notable_bug_fixes:
+
+Notable bug fixes
+~~~~~~~~~~~~~~~~~
+
+These are bug fixes that might have notable behavior changes.
+
+Consistency of DataFrame Reductions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now
+determine whether to exclude object-dtype columns on a column-by-column basis,
+instead of checking if *all* object-dtype columns can be considered boolean.
+
+This prevents pathological behavior where applying the reduction on a subset
+of columns could produce a larger :class:`Series`. See :issue:`37799`.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object)
+    df["C"] = pd.Series([True, True])
+
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [5]: df.all(bool_only=True)
+    Out[5]:
+    C    True
+    dtype: bool
+
+    In [6]: df[["B", "C"]].all(bool_only=True)
+    Out[6]:
+    B    False
+    C    True
+    dtype: bool
+
+*New behavior*:
+
+.. ipython:: python
+
+    In [5]: df.all(bool_only=True)
+
+    In [6]: df[["B", "C"]].all(bool_only=True)
+
+
 .. _whatsnew_120.api_breaking.python:
 
 Increased minimum version for Python

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -450,6 +450,20 @@ def f(mask, val, idx):
 
         return self.split_and_operate(None, f, inplace)
 
+    def _split(self) -> List["Block"]:
+        """
+        Break this 2D block up into one block per column.
+
+        Each returned block is built with ``make_block`` from a length-1
+        slice of ``self.values`` and is placed at the matching entry of
+        ``self.mgr_locs``.
+        """
+        assert self.ndim == 2
+        return [
+            self.make_block(self.values[pos : pos + 1], [loc])
+            for pos, loc in enumerate(self.mgr_locs)
+        ]
+
     def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
         """
         split the block per-column, and apply the callable f

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -713,13 +713,28 @@ def is_view(self) -> bool:
 
     def get_bool_data(self, copy: bool = False) -> "BlockManager":
         """
+        Select bool-dtype blocks, plus the individual columns of object-dtype
+        blocks that are themselves all-bool.
+
         Parameters
         ----------
         copy : bool, default False
             Whether to copy the blocks
         """
-        self._consolidate_inplace()
-        return self._combine([b for b in self.blocks if b.is_bool], copy)
+
+        new_blocks = []
+
+        for blk in self.blocks:
+            if blk.dtype == bool:
+                new_blocks.append(blk)
+            # An object block is tested column by column, not as a whole.
+            elif blk.is_object:
+                nbs = blk._split()
+                for nb in nbs:
+                    if nb.is_bool:
+                        new_blocks.append(nb)
+
+        return self._combine(new_blocks, copy)
 
     def get_numeric_data(self, copy: bool = False) -> "BlockManager":
         """

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1118,6 +1118,37 @@ def test_any_all_object(self):
         result = np.any(DataFrame(columns=["a", "b"])).item()
         assert result is False
 
+    def test_any_all_object_bool_only(self):
+        # GH#37799 bool_only must be decided per column for object dtype
+        frame = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
+        frame._consolidate_inplace()
+        frame["C"] = Series([True, True])
+        bool_cols = ["B", "C"]
+
+        # DataFrame._get_bool_data is where the bug lived, so pin it directly
+        bool_frame = frame._get_bool_data()
+        tm.assert_frame_equal(bool_frame, frame[bool_cols])
+
+        expected_all = Series([False, True], index=bool_cols)
+        result = frame.all(bool_only=True, axis=0)
+        tm.assert_series_equal(result, expected_all)
+
+        # reducing a subset of the columns must not give a _larger_ Series
+        result = frame[bool_cols].all(bool_only=True, axis=0)
+        tm.assert_series_equal(result, expected_all)
+
+        assert not frame.all(bool_only=True, axis=None)
+
+        expected_any = Series([True, True], index=bool_cols)
+        result = frame.any(bool_only=True, axis=0)
+        tm.assert_series_equal(result, expected_any)
+
+        # reducing a subset of the columns must not give a _larger_ Series
+        result = frame[bool_cols].any(bool_only=True, axis=0)
+        tm.assert_series_equal(result, expected_any)
+
+        assert frame.any(bool_only=True, axis=None)
+
     @pytest.mark.parametrize("method", ["any", "all"])
     def test_any_all_level_axis_none_raises(self, method):
         df = DataFrame(

diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
@@ -264,6 +264,25 @@ def test_delete(self):
         with pytest.raises(IndexError, match=None):
             newb.delete(3)
 
+    def test_split(self):
+        # GH#37799
+        arr = np.random.randn(3, 4)
+        placement = [3, 1, 6]
+        blk = make_block(arr, placement=placement)
+        pieces = blk._split()
+
+        # overwrite the source array after splitting: views see it, copies don't
+        arr[:] = -9999
+        assert (blk.values == -9999).all()
+
+        assert len(pieces) == 3
+        expected = [
+            make_block(arr[[row]], placement=[loc])
+            for row, loc in enumerate(placement)
+        ]
+        for piece, exp in zip(pieces, expected):
+            assert_block_equal(piece, exp)
+
 
 class TestBlockManager:
     def test_attrs(self):
@@ -667,7 +686,7 @@ def test_get_bool_data(self):
         mgr.iset(6, np.array([True, False, True], dtype=np.object_))
 
         bools = mgr.get_bool_data()
-        tm.assert_index_equal(bools.items, Index(["bool"]))
+        tm.assert_index_equal(bools.items, Index(["bool", "dt"]))
         tm.assert_almost_equal(
             mgr.iget(mgr.items.get_loc("bool")).internal_values(),
             bools.iget(bools.items.get_loc("bool")).internal_values(),