From 77ee0fec5d10af68c6bed19cd5d668a6d605eb2b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Nov 2020 15:47:32 -0800 Subject: [PATCH 1/3] BUG: DataFrame.all(bool_only=True) inconsistency with object dtype --- doc/source/whatsnew/v1.2.0.rst | 56 ++++++++++++++++++++++++ pandas/core/internals/blocks.py | 14 ++++++ pandas/core/internals/managers.py | 19 +++++++- pandas/tests/frame/test_reductions.py | 31 +++++++++++++ pandas/tests/internals/test_internals.py | 2 +- 5 files changed, 119 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f751a91cecf19..619bbaf926c99 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,62 @@ Other enhancements - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +.. --------------------------------------------------------------------------- + +.. _whatsnew_120.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +Consistency of DataFrame Reductions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now +determines whether to exclude object-dtype columns on a column-by-column basis, +instead of checking if *all* object-dtype columns can be considered boolean. + +This prevents pathological behavior where applying the reduction on a subset +of columns could result in a larger :class:`Series` result. + +.. ipython:: python + + df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object) + df["C"] = pd.Series([True, True]) + + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: df.all(bool_only=True) + Out[5]: + C True + dtype: bool + + In [6]: df[["B", "C"]].all(bool_only=True) + Out[6]: + B False + C True + dtype: bool + +*New behavior*: + +.. code-block:: ipython + + In [5]: df.all(bool_only=True) + Out[5]: + B False + C True + dtype: bool + + In [6]: df[["B", "C"]].all(bool_only=True) + Out[6]: + B False + C True + dtype: bool + + .. _whatsnew_120.api_breaking.python: Increased minimum version for Python diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ed77a210b6913..d3f629f985b98 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -448,6 +448,20 @@ def f(mask, val, idx): return self.split_and_operate(None, f, inplace) + def _split(self) -> List["Block"]: + """ + Split a block into a list of single-column blocks. + """ + assert self.ndim == 2 + + new_blocks = [] + for i, ref_loc in enumerate(self.mgr_locs): + vals = self.values[[i]] + + nb = self.make_block(vals, [ref_loc]) + new_blocks.append(nb) + return new_blocks + def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: """ split the block per-column, and apply the callable f diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 767c653f8a404..155d88d6ec2d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -713,13 +713,28 @@ def is_view(self) -> bool: def get_bool_data(self, copy: bool = False) -> "BlockManager": """ + Select blocks that are bool-dtype and columns from object-dtype blocks + that are all-bool. + Parameters ---------- copy : bool, default False Whether to copy the blocks """ - self._consolidate_inplace() - return self._combine([b for b in self.blocks if b.is_bool], copy) + + new_blocks = [] + + for blk in self.blocks: + if blk.dtype == bool: + new_blocks.append(blk) + + elif blk.is_object: + nbs = blk._split() + for nb in nbs: + if nb.is_bool: + new_blocks.append(nb) + + return self._combine(new_blocks, copy) def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 374d185f45844..95cb770a11408 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1118,6 +1118,37 @@ def test_any_all_object(self): result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False + def test_any_all_object_bool_only(self): + df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) + df._consolidate_inplace() + df["C"] = Series([True, True]) + + # The underlying bug is in DataFrame._get_bool_data, so we check + # that while we're here + res = df._get_bool_data() + expected = df[["B", "C"]] + tm.assert_frame_equal(res, expected) + + res = df.all(bool_only=True, axis=0) + expected = Series([False, True], index=["B", "C"]) + tm.assert_series_equal(res, expected) + + # operating on a subset of columns should not produce a _larger_ Series + res = df[["B", "C"]].all(bool_only=True, axis=0) + tm.assert_series_equal(res, expected) + + assert not df.all(bool_only=True, axis=None) + + res = df.any(bool_only=True, axis=0) + expected = Series([True, True], index=["B", "C"]) + tm.assert_series_equal(res, expected) + + # operating on a subset of columns should not produce a _larger_ Series + res = df[["B", "C"]].any(bool_only=True, axis=0) + tm.assert_series_equal(res, expected) + + assert df.any(bool_only=True, axis=None) + @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): df = DataFrame( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 88b91ecc79060..842298b3cdc7f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -667,7 +667,7 @@ def test_get_bool_data(self): mgr.iset(6, np.array([True, False, True], dtype=np.object_)) bools = mgr.get_bool_data() - tm.assert_index_equal(bools.items, Index(["bool"])) + tm.assert_index_equal(bools.items, Index(["bool", "dt"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), bools.iget(bools.items.get_loc("bool")).internal_values(), From 359abd987884163b0f8bf4201001c7a24c0f6f8a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Nov 2020 21:11:05 -0800 Subject: [PATCH 2/3] GH ref in whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1b32be873aaaf..ef3e936fa4ae9 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -251,7 +251,7 @@ determines whether to exclude object-dtype columns on a column-by-column basis, instead of checking if *all* object-dtype columns can be considered boolean. This prevents pathological behavior where applying the reduction on a subset -of columns could result in a larger :class:`Series` result. +of columns could result in a larger :class:`Series` result. See (:issue:`37799`). .. ipython:: python From 76a97eb8d408801d586f3f22c48c3baf51e32f78 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 13 Nov 2020 08:34:56 -0800 Subject: [PATCH 3/3] whatsnew edit, test --- doc/source/whatsnew/v1.2.0.rst | 10 +--------- pandas/core/internals/blocks.py | 2 +- pandas/tests/internals/test_internals.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3c6d8b55c0001..4dedf9771955d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -277,19 +277,11 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) *New behavior*: -.. code-block:: ipython +.. ipython:: python In [5]: df.all(bool_only=True) - Out[5]: - B False - C True - dtype: bool In [6]: df[["B", "C"]].all(bool_only=True) - Out[6]: - B False - C True - dtype: bool .. _whatsnew_120.api_breaking.python: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 581f7f273f0f4..92f6bb6f1cbdd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -458,7 +458,7 @@ def _split(self) -> List["Block"]: new_blocks = [] for i, ref_loc in enumerate(self.mgr_locs): - vals = self.values[[i]] + vals = self.values[slice(i, i + 1)] nb = self.make_block(vals, [ref_loc]) new_blocks.append(nb) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 842298b3cdc7f..d069b5aa08e22 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -264,6 +264,25 @@ def test_delete(self): with pytest.raises(IndexError, match=None): newb.delete(3) + def test_split(self): + # GH#37799 + values = np.random.randn(3, 4) + blk = make_block(values, placement=[3, 1, 6]) + result = blk._split() + + # check that we get views, not copies + values[:] = -9999 + assert (blk.values == -9999).all() + + assert len(result) == 3 + expected = [ + make_block(values[[0]], placement=[3]), + make_block(values[[1]], placement=[1]), + make_block(values[[2]], placement=[6]), + ] + for res, exp in zip(result, expected): + assert_block_equal(res, exp) + class TestBlockManager: def test_attrs(self):