From 02dcdc3a739a959aeea21853bf3409192f0ac453 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 22 Apr 2021 12:19:14 -0400 Subject: [PATCH 1/4] BUG: any/all not returning booleans for object type --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/nanops.py | 12 ++++++++++ pandas/tests/apply/test_series_apply.py | 4 ++-- pandas/tests/frame/test_reductions.py | 27 +++++++++++++++++++--- pandas/tests/reductions/test_reductions.py | 26 +++++++++++++++++++-- pandas/tests/test_nanops.py | 1 + 6 files changed, 64 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0e567972e7823..6120e6acbc558 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -696,6 +696,7 @@ Numeric - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) - Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) +- Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 54588eafc3fa0..0283711f6667c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -486,6 +486,12 @@ def nanany( False """ values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask) + + # For object type, any won't necessarily return + # boolean values (numpy/numpy#4352) + if is_object_dtype(values): + values = values.astype(bool) + # error: Incompatible return value type (got "Union[bool_, ndarray]", expected # "bool") return values.any(axis) # type: 
ignore[return-value] @@ -526,6 +532,12 @@ def nanall( False """ values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask) + + # For object type, all won't necessarily return + # boolean values (numpy/numpy#4352) + if is_object_dtype(values): + values = values.astype(bool) + # error: Incompatible return value type (got "Union[bool_, ndarray]", expected # "bool") return values.all(axis) # type: ignore[return-value] diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 6722fc43aa75e..bfa88f54e4f10 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -441,8 +441,8 @@ def test_non_callable_aggregates(how): ("sum", "abc"), ("max", "c"), ("min", "a"), - ("all", "c"), # see GH12863 - ("any", "a"), + ("all", True), + ("any", True), ], ), ), diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index c6d10ae5682cd..5012e88d8ec19 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1068,13 +1068,17 @@ def test_idxmax_dt64_multicolumn_axis1(self): @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all(self, opname, bool_frame_with_na, float_string_frame): - assert_bool_op_calc( - opname, getattr(np, opname), bool_frame_with_na, has_skipna=True - ) assert_bool_op_api( opname, bool_frame_with_na, float_string_frame, has_bool_only=True ) + @pytest.mark.xfail(reason="GH12863: numpy result won't match for object type") + @pytest.mark.parametrize("opname", ["any", "all"]) + def test_any_all_matches_numpy(self, opname, bool_frame_with_na): + assert_bool_op_calc( + opname, getattr(np, opname), bool_frame_with_na, has_skipna=True + ) + def test_any_all_extra(self): df = DataFrame( { @@ -1108,6 +1112,23 @@ def test_any_all_extra(self): result = df[["C"]].all(axis=None).item() assert result is True + @pytest.mark.parametrize("axis", [0, 1]) + 
@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) + def test_any_all_object_dtype(self, axis, bool_agg_func): + # GH#35450 + df = DataFrame( + data=[ + [1, np.nan, np.nan, True], + [np.nan, 2, np.nan, True], + [np.nan, np.nan, np.nan, True], + [np.nan, np.nan, np.nan, np.nan], + ] + ) + + result = getattr(df, bool_agg_func)(axis=axis, skipna=False) + expected = Series([True, True, True, True]) + tm.assert_series_equal(result, expected) + def test_any_datetime(self): # GH 23070 diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 906f05fe0f348..9469e8dc1db14 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -896,7 +896,7 @@ def test_all_any(self): # Alternative types, with implicit 'object' dtype. s = Series(["abc", True]) - assert "abc" == s.any() # 'abc' || True => 'abc' + assert s.any() @pytest.mark.parametrize("klass", [Index, Series]) def test_numpy_all_any(self, klass): @@ -913,7 +913,7 @@ def test_all_any_params(self): s2 = Series([np.nan, False]) assert s1.all(skipna=False) # nan && True => True assert s1.all(skipna=True) - assert np.isnan(s2.any(skipna=False)) # nan || False => nan + assert s2.any(skipna=False) assert not s2.any(skipna=True) # Check level. 
@@ -941,6 +941,28 @@ def test_all_any_params(self): with pytest.raises(NotImplementedError, match=msg): s.all(bool_only=True) + @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_any_all_object_dtype(self, bool_agg_func, skipna): + ser = Series(["a", "b", "c", "d", "e"], dtype=object) + result = getattr(ser, bool_agg_func)(skipna=skipna) + expected = True + + assert result == expected + + @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) + @pytest.mark.parametrize( + "data", [[False, None], [None, False], [False, np.nan], [np.nan, False]] + ) + def test_any_all_object_dtype_missing(self, data, bool_agg_func): + # GH#27709 + ser = Series(data) + result = getattr(ser, bool_agg_func)(skipna=False) + + # None is treated as False, but np.nan is treated as True + expected = bool_agg_func == "any" and None not in data + assert result == expected + @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 7f8b941a9f115..c2da9bdbf8e90 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -270,6 +270,7 @@ def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): value = value.astype("f8") return func(value, **kwargs) + @pytest.mark.xfail(reason="GH12863: numpy result won't match for object type") @pytest.mark.parametrize( "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)] ) From 1c24df9fd3ab15bf64517796c1128dd471f4a378 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 22 Apr 2021 12:30:50 -0400 Subject: [PATCH 2/4] Add gh ref --- pandas/tests/reductions/test_reductions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 9469e8dc1db14..a6d7d78ff3af2 100644 --- 
a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -944,6 +944,7 @@ def test_all_any_params(self): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) def test_any_all_object_dtype(self, bool_agg_func, skipna): + # GH#12863 ser = Series(["a", "b", "c", "d", "e"], dtype=object) result = getattr(ser, bool_agg_func)(skipna=skipna) expected = True From d58eb97b7de4f49174eaec2f1afb8c17d6fc93aa Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 22 Apr 2021 18:36:21 -0400 Subject: [PATCH 3/4] Unxfail test --- pandas/tests/frame/test_reductions.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 5012e88d8ec19..1f1a12a4e4ec9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1072,12 +1072,12 @@ def test_any_all(self, opname, bool_frame_with_na, float_string_frame): opname, bool_frame_with_na, float_string_frame, has_bool_only=True ) - @pytest.mark.xfail(reason="GH12863: numpy result won't match for object type") @pytest.mark.parametrize("opname", ["any", "all"]) - def test_any_all_matches_numpy(self, opname, bool_frame_with_na): - assert_bool_op_calc( - opname, getattr(np, opname), bool_frame_with_na, has_skipna=True - ) + def test_any_all_bool_frame(self, opname, bool_frame_with_na): + # GH#12863: numpy gives back NaN object data so fill NaNs + # to compare with pandas behavior + df = bool_frame_with_na.fillna(True) + assert_bool_op_calc(opname, getattr(np, opname), df, has_skipna=True) def test_any_all_extra(self): df = DataFrame( @@ -1114,18 +1114,19 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) - def test_any_all_object_dtype(self, axis, bool_agg_func): + @pytest.mark.parametrize("skipna", [True, False]) + def 
test_any_all_object_dtype(self, axis, bool_agg_func, skipna): # GH#35450 df = DataFrame( data=[ [1, np.nan, np.nan, True], [np.nan, 2, np.nan, True], [np.nan, np.nan, np.nan, True], - [np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, "5", np.nan], ] ) - result = getattr(df, bool_agg_func)(axis=axis, skipna=False) + result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) From 7ce87dc9930bef94587b46a2a5613041369367b6 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 22 Apr 2021 18:47:21 -0400 Subject: [PATCH 4/4] Fix comment typo --- pandas/tests/frame/test_reductions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1f1a12a4e4ec9..2df59923221ec 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1074,8 +1074,8 @@ def test_any_all(self, opname, bool_frame_with_na, float_string_frame): @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all_bool_frame(self, opname, bool_frame_with_na): - # GH#12863: numpy gives back NaN object data so fill NaNs - # to compare with pandas behavior + # GH#12863: numpy gives back non-boolean data for object type + # so fill NaNs to compare with pandas behavior df = bool_frame_with_na.fillna(True) assert_bool_op_calc(opname, getattr(np, opname), df, has_skipna=True)