From 4cacf445fb909fc85a861890df5ccf00c3ef2f8f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Aug 2023 15:59:51 -0700 Subject: [PATCH 1/3] REF: share BooleanArray tests with other Masked cases --- pandas/tests/extension/test_boolean.py | 275 ------------------ pandas/tests/extension/test_masked_numeric.py | 115 +++++++- pandas/tests/groupby/test_function.py | 24 ++ 3 files changed, 136 insertions(+), 278 deletions(-) delete mode 100644 pandas/tests/extension/test_boolean.py diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py deleted file mode 100644 index 508e2da214336..0000000000000 --- a/pandas/tests/extension/test_boolean.py +++ /dev/null @@ -1,275 +0,0 @@ -""" -This file contains a minimal set of tests for compliance with the extension -array interface test suite, and should contain no other tests. -The test suite for the full functionality of the array is located in -`pandas/tests/arrays/`. - -The tests in this file are inherited from the BaseExtensionTests, and only -minimal tweaks should be applied to get the tests passing (by overwriting a -parent method). - -Additional tests should either be added to one of the BaseExtensionTests -classes (if they are relevant for the extension interface for all dtypes), or -be added to the array-specific tests in `pandas/tests/arrays/`. - -""" -import operator - -import numpy as np -import pytest - -from pandas.compat import ( - IS64, - is_platform_windows, -) - -import pandas as pd -import pandas._testing as tm -from pandas.core import roperator -from pandas.core.arrays.boolean import BooleanDtype -from pandas.tests.extension import base - -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:invalid value encountered in divide:RuntimeWarning" - ), - pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning"), -] - - -def make_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] - - -@pytest.fixture -def dtype(): - return BooleanDtype() - - -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) - - -@pytest.fixture -def data_for_twos(dtype): - return pd.array(np.ones(100), dtype=dtype) - - -@pytest.fixture -def data_missing(dtype): - return pd.array([np.nan, True], dtype=dtype) - - -@pytest.fixture -def data_for_sorting(dtype): - return pd.array([True, True, False], dtype=dtype) - - -@pytest.fixture -def data_missing_for_sorting(dtype): - return pd.array([True, np.nan, False], dtype=dtype) - - -@pytest.fixture -def na_cmp(): - # we are pd.NA - return lambda x, y: x is pd.NA and y is pd.NA - - -@pytest.fixture -def na_value(): - return pd.NA - - -@pytest.fixture -def data_for_grouping(dtype): - b = True - a = False - c = b - na = np.nan - return pd.array([b, b, na, na, a, a, b, c], dtype=dtype) - - -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): - pass - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestSetitem(base.BaseSetitemTests): - pass - - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestArithmeticOps(base.BaseArithmeticOpsTests): - implements = {"__sub__", "__rsub__"} - - def _get_expected_exception(self, op_name, obj, other): - if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]: - # match behavior with non-masked bool dtype - return NotImplementedError - elif op_name in self.implements: - # exception message would include "numpy boolean subtract"" - return TypeError - return None - - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): - if op_name in ( - "__floordiv__", - "__rfloordiv__", - "__pow__", - "__rpow__", - "__mod__", - "__rmod__", - ): - # combine keeps boolean type - pointwise_result = pointwise_result.astype("Int8") - - elif op_name in ("__truediv__", "__rtruediv__"): - # combine with bools does not generate the correct result - # (numpy behaviour for div is to regard the bools as numeric) - if op_name == "__truediv__": - op = operator.truediv - else: - op = roperator.rtruediv - pointwise_result = self._combine(obj.astype(float), other, op) - pointwise_result = pointwise_result.astype("Float64") - - if op_name == "__rpow__": - # for rpow, combine does not propagate NaN - result = getattr(obj, op_name)(other) - pointwise_result[result.isna()] = np.nan - - return pointwise_result - - @pytest.mark.xfail( - reason="Inconsistency between floordiv and divmod; we raise for floordiv " - "but not for divmod. This matches what we do for non-masked bool dtype." - ) - def test_divmod_series_array(self, data, data_for_twos): - super().test_divmod_series_array(data, data_for_twos) - - -class TestComparisonOps(base.BaseComparisonOpsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestMethods(base.BaseMethodsTests): - _combine_le_expected_dtype = "boolean" - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestGroupby(base.BaseGroupbyTests): - """ - Groupby-specific tests are overridden because boolean only has 2 - unique values, base tests uses 3 groups. - """ - - @pytest.mark.parametrize("min_count", [0, 10]) - def test_groupby_sum_mincount(self, data_for_grouping, min_count): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping[:-1]}) - result = df.groupby("A").sum(min_count=min_count) - if min_count == 0: - expected = pd.DataFrame( - {"B": pd.array([3, 0, 0], dtype="Int64")}, - index=pd.Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - else: - expected = pd.DataFrame( - {"B": pd.array([pd.NA] * 3, dtype="Int64")}, - index=pd.Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -class TestNumericReduce(base.BaseNumericReduceTests): - def check_reduce(self, s, op_name, skipna): - if op_name == "count": - result = getattr(s, op_name)() - expected = getattr(s.astype("float64"), op_name)() - else: - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - # override parent function to cast to bool for min/max - if np.isnan(expected): - expected = pd.NA - elif op_name in ("min", "max"): - expected = bool(expected) - tm.assert_almost_equal(result, expected) - - def _get_expected_reduction_dtype(self, arr, op_name: str): - if op_name in ["mean", "median", "var", "std", "skew"]: - cmp_dtype = "Float64" - elif op_name in ["min", "max"]: - cmp_dtype = "boolean" - elif op_name in ["sum", "prod"]: - is_windows_or_32bit = is_platform_windows() or not IS64 - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" - else: - raise TypeError("not supposed to reach this") - return cmp_dtype - - -class TestBooleanReduce(base.BaseBooleanReduceTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestUnaryOps(base.BaseUnaryOpsTests): - pass - - -class TestAccumulation(base.BaseAccumulateTests): - def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: - return True - - def check_accumulate(self, s, op_name, skipna): - length = 64 - if not IS64 or is_platform_windows(): - if not s.dtype.itemsize == 8: - length = 32 - - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) - if op_name not in ("cummin", "cummax"): - expected = expected.astype(f"Int{length}") - else: - expected = expected.astype("boolean") - tm.assert_series_equal(result, expected) - - -class TestParsing(base.BaseParsingTests): - pass - - -class Test2DCompat(base.Dim2CompatTests): - pass diff --git a/pandas/tests/extension/test_masked_numeric.py b/pandas/tests/extension/test_masked_numeric.py index ce41c08cafbd6..52649156a599a 100644 --- a/pandas/tests/extension/test_masked_numeric.py +++ b/pandas/tests/extension/test_masked_numeric.py @@ -23,6 +23,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.floating import ( Float32Dtype, Float64Dtype, @@ -65,6 +66,10 @@ def make_float_data(): ) +def make_bool_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + @pytest.fixture( params=[ Int8Dtype, @@ -77,6 +82,7 @@ def make_float_data(): UInt64Dtype, Float32Dtype, Float64Dtype, + BooleanDtype, ] ) def dtype(request): @@ -87,6 +93,8 @@ def dtype(request): def data(dtype): if dtype.kind == "f": data = make_float_data() + elif dtype.kind == "b": + data = make_bool_data() else: data = make_data() return pd.array(data, dtype=dtype) @@ -94,6 +102,8 @@ def data(dtype): @pytest.fixture def data_for_twos(dtype): + if dtype.kind == "b": + return pd.array(np.ones(100), dtype=dtype) return pd.array(np.ones(100) * 2, dtype=dtype) @@ -101,6 +111,8 @@ def data_for_twos(dtype): def data_missing(dtype): if dtype.kind == "f": return pd.array([pd.NA, 0.1], dtype=dtype) + elif dtype.kind == "b": + return pd.array([np.nan, True], dtype=dtype) return pd.array([pd.NA, 1], dtype=dtype) @@ -108,6 +120,8 @@ def data_missing(dtype): def data_for_sorting(dtype): if dtype.kind == "f": return pd.array([0.1, 0.2, 0.0], dtype=dtype) + elif dtype.kind == "b": + return pd.array([True, True, False], dtype=dtype) return pd.array([1, 2, 0], dtype=dtype) @@ -115,6 +129,8 @@ def data_for_sorting(dtype): def data_missing_for_sorting(dtype): if dtype.kind == "f": return pd.array([0.1, pd.NA, 0.0], dtype=dtype) + elif dtype.kind == "b": + return pd.array([True, np.nan, False], dtype=dtype) return pd.array([1, pd.NA, 0], dtype=dtype) @@ -135,10 +151,15 @@ def data_for_grouping(dtype): b = 0.1 a = 0.0 c = 0.2 + elif dtype.kind == "b": + b = True + a = False + c = b else: b = 1 a = 0 c = 2 + na = pd.NA return pd.array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -148,6 +169,23 @@ class TestDtype(base.BaseDtypeTests): class TestArithmeticOps(base.BaseArithmeticOpsTests): + def _get_expected_exception(self, op_name, obj, other): + try: + dtype = tm.get_dtype(obj) + except AttributeError: + # passed arguments reversed + dtype = tm.get_dtype(other) + + if dtype.kind == "b": + if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]: + # match behavior with non-masked bool dtype + return NotImplementedError + elif op_name in ["__sub__", "__rsub__"]: + # exception message would include "numpy boolean subtract"" + return TypeError + return None + return super()._get_expected_exception(op_name, obj, other) + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): sdtype = tm.get_dtype(obj) expected = pointwise_result @@ -158,6 +196,29 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) + elif sdtype.kind == "b": + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + expected = expected.astype("Int8") + + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for div is to regard the bools as numeric) + op = self.get_op_from_name(op_name) + expected = self._combine(obj.astype(float), other, op) + expected = expected.astype("Float64") + + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + result = getattr(obj, op_name)(other) + expected[result.isna()] = np.nan else: # combine method result in 'biggest' (float64) dtype expected = expected.astype(sdtype) @@ -168,6 +229,16 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): frame_scalar_exc = None divmod_exc = None + def test_divmod_series_array(self, data, data_for_twos, request): + if data.dtype.kind == "b": + mark = pytest.mark.xfail( + reason="Inconsistency between floordiv and divmod; we raise for " + "floordiv but not for divmod. This matches what we do for " + "non-masked bool dtype." + ) + request.node.add_marker(mark) + super().test_divmod_series_array(data, data_for_twos) + class TestComparisonOps(base.BaseComparisonOpsTests): series_scalar_exc = None @@ -215,7 +286,15 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - _combine_le_expected_dtype = object # TODO: can we make this boolean? + def test_combine_le(self, data_repeated): + # TODO: patching self is a bad pattern here + orig_data1, orig_data2 = data_repeated(2) + if orig_data1.dtype.kind == "b": + self._combine_le_expected_dtype = "boolean" + else: + # TODO: can we make this boolean? + self._combine_le_expected_dtype = object + super().test_combine_le(data_repeated) class TestCasting(base.BaseCastingTests): @@ -236,6 +315,9 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "numpy_dtype" cmp_dtype = ser.dtype.numpy_dtype # type: ignore[union-attr] + elif ser.dtype.kind == "b": + if op_name in ["min", "max"]: + cmp_dtype = "bool" if op_name == "count": result = getattr(ser, op_name)() @@ -260,14 +342,25 @@ def _get_expected_reduction_dtype(self, arr, op_name: str): cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" elif tm.is_unsigned_integer_dtype(arr.dtype): cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + elif arr.dtype.kind == "b": + if op_name in ["mean", "median", "var", "std", "skew"]: + cmp_dtype = "Float64" + elif op_name in ["min", "max"]: + cmp_dtype = "boolean" + elif op_name in ["sum", "prod"]: + cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + else: + raise TypeError("not supposed to reach this") else: raise TypeError("not supposed to reach this") return cmp_dtype -@pytest.mark.skip(reason="Tested in tests/reductions/test_reductions.py") class TestBooleanReduce(base.BaseBooleanReduceTests): - pass + @pytest.fixture(autouse=True) + def maybe_skip(self, dtype): + if dtype.kind != "b": + pytest.skip(reason="Tested in tests/reductions/test_reductions.py") class TestAccumulation(base.BaseAccumulateTests): @@ -292,6 +385,11 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): # Incompatible types in assignment (expression has type # "Union[dtype[Any], ExtensionDtype]", variable has type "str") expected_dtype = ser.dtype # type: ignore[assignment] + elif ser.dtype.kind == "b": + if op_name in ("cummin", "cummax"): + expected_dtype = "boolean" + else: + expected_dtype = f"Int{length}" if op_name == "cumsum": result = getattr(ser, op_name)(skipna=skipna) @@ -325,6 +423,17 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): raise NotImplementedError(f"{op_name} not supported") +class TestUnaryOps(base.BaseUnaryOpsTests): + def test_invert(self, data, request): + if data.dtype.kind == "f": + mark = pytest.mark.xfail( + reason="Looks like the base class test implicitly assumes " + "boolean/integer dtypes" + ) + request.node.add_marker(mark) + super().test_invert(data) + + class TestPrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 78e9f6111a230..0599f44e1d7d7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1358,6 +1358,30 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 10]) +def test_groupby_sum_mincount_boolean(min_count): + b = True + a = False + c = b + na = np.nan + dfg = pd.array([b, b, na, na, a, a, b, c], dtype="boolean") + + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg[:-1]}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_sum_below_mincount_nullable_integer(): # https://github.com/pandas-dev/pandas/issues/32861 df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") From caa7d0d02a5f724eabe519dc325890e883d5afb1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Aug 2023 16:00:26 -0700 Subject: [PATCH 2/3] REF: test_masked_numeric->test_masked --- pandas/tests/extension/{test_masked_numeric.py => test_masked.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/extension/{test_masked_numeric.py => test_masked.py} (100%) diff --git a/pandas/tests/extension/test_masked_numeric.py b/pandas/tests/extension/test_masked.py similarity index 100% rename from pandas/tests/extension/test_masked_numeric.py rename to pandas/tests/extension/test_masked.py From 383e6cf1e8c6cd4ccdd629efb224fcd2facb604a Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Aug 2023 10:22:24 -0700 Subject: [PATCH 3/3] remove unnecessary slicing --- pandas/tests/groupby/test_function.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0599f44e1d7d7..0abf6428730ff 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1362,11 +1362,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): def test_groupby_sum_mincount_boolean(min_count): b = True a = False - c = b na = np.nan - dfg = pd.array([b, b, na, na, a, a, b, c], dtype="boolean") + dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") - df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg[:-1]}) + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) result = df.groupby("A").sum(min_count=min_count) if min_count == 0: expected = DataFrame(