BUG: preserve nullable dtype for float result in IntegerArray/BooleanArray arithmetic ops (#38178)

jorisvandenbossche · web-flow · commit 5b2e162c281f · 2020-12-01T10:32:47.000+01:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -196,6 +196,9 @@ Alternatively, you can also use the dtype object:
 
    pd.Series([1.5, None], dtype=pd.Float32Dtype())
 
+Operations with the existing nullable integer or boolean data types that give float results
+now return the nullable floating data types instead of NumPy floats (:issue:`38178`).
+
 .. warning::
 
    Experimental: the new floating data types are currently experimental, and their
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -706,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         if (is_float_dtype(other) or is_float(other)) or (
             op_name in ["rtruediv", "truediv"]
         ):
-            result[mask] = np.nan
-            return result
+            from pandas.core.arrays import FloatingArray
+
+            return FloatingArray(result, mask, copy=False)
 
-        if is_bool_dtype(result):
+        elif is_bool_dtype(result):
             return BooleanArray(result, mask, copy=False)
 
         elif is_integer_dtype(result):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -539,13 +539,15 @@ def _cmp_method(self, other, op):
         return BooleanArray(result, mask)
 
     def _arith_method(self, other, op):
+        from pandas.core.arrays import FloatingArray
+
         op_name = op.__name__
         omask = None
 
         if getattr(other, "ndim", 0) > 1:
             raise NotImplementedError("can only perform ops with 1-d structures")
 
-        if isinstance(other, IntegerArray):
+        if isinstance(other, (IntegerArray, FloatingArray)):
             other, omask = other._data, other._mask
 
         elif is_list_like(other):
@@ -636,8 +638,9 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         if (is_float_dtype(other) or is_float(other)) or (
             op_name in ["rtruediv", "truediv"]
         ):
-            result[mask] = np.nan
-            return result
+            from pandas.core.arrays import FloatingArray
+
+            return FloatingArray(result, mask, copy=False)
 
         if result.dtype == "timedelta64[ns]":
             from pandas.core.arrays import TimedeltaArray
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.arrays import FloatingArray
 
 
 @pytest.fixture
@@ -51,13 +52,15 @@ def test_sub(left_array, right_array):
 
 
 def test_div(left_array, right_array):
-    # for now division gives a float numpy array
     result = left_array / right_array
-    expected = np.array(
-        [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
-        dtype="float64",
+    expected = FloatingArray(
+        np.array(
+            [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
+            dtype="float64",
+        ),
+        np.array([False, False, True, False, False, True, True, True, True]),
     )
-    tm.assert_numpy_array_equal(result, expected)
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py
@@ -85,6 +85,13 @@ def test_value_counts_na():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    s = pd.Series([True, False, pd.NA], dtype="boolean")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
+    tm.assert_series_equal(result, expected)
+
+
 def test_diff():
     a = pd.array(
         [True, True, False, False, True, None, True, None, False], dtype="boolean"
diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py
@@ -113,6 +113,13 @@ def test_value_counts_empty():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("min_count", [0, 4])
 def test_floating_array_sum(skipna, min_count, dtype):
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -7,7 +7,7 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.arrays import integer_array
+from pandas.core.arrays import FloatingArray, integer_array
 import pandas.core.ops as ops
 
 # Basic test for the arithmetic array ops
@@ -45,24 +45,26 @@ def test_sub(dtype):
 
 
 def test_div(dtype):
-    # for now division gives a float numpy array
     a = pd.array([1, 2, 3, None, 5], dtype=dtype)
     b = pd.array([0, 1, None, 3, 4], dtype=dtype)
 
     result = a / b
-    expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
 def test_divide_by_zero(zero, negative):
     # https://github.com/pandas-dev/pandas/issues/27398
     a = pd.array([0, 1, -1, None], dtype="Int64")
     result = a / zero
-    expected = np.array([np.nan, np.inf, -np.inf, np.nan])
+    expected = FloatingArray(
+        np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
+        np.array([False, False, False, True]),
+    )
     if negative:
         expected *= -1
-    tm.assert_numpy_array_equal(result, expected)
+    tm.assert_extension_array_equal(result, expected)
 
 
 def test_floordiv(dtype):
@@ -99,8 +101,11 @@ def test_pow_scalar():
     tm.assert_extension_array_equal(result, expected)
 
     result = a ** np.nan
-    expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = FloatingArray(
+        np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
+        np.array([False, False, False, True, False]),
+    )
+    tm.assert_extension_array_equal(result, expected)
 
     # reversed
     a = a[1:]  # Can't raise integers to negative powers.
@@ -118,8 +123,11 @@ def test_pow_scalar():
     tm.assert_extension_array_equal(result, expected)
 
     result = np.nan ** a
-    expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
-    tm.assert_numpy_array_equal(result, expected)
+    expected = FloatingArray(
+        np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
+        np.array([False, False, True, False]),
+    )
+    tm.assert_extension_array_equal(result, expected)
 
 
 def test_pow_array():
@@ -133,10 +141,10 @@ def test_pow_array():
 def test_rpow_one_to_na():
     # https://github.com/pandas-dev/pandas/issues/22022
     # https://github.com/pandas-dev/pandas/issues/29997
-    arr = integer_array([np.nan, np.nan])
+    arr = pd.array([np.nan, np.nan], dtype="Int64")
     result = np.array([1.0, 2.0]) ** arr
-    expected = np.array([1.0, np.nan])
-    tm.assert_numpy_array_equal(result, expected)
+    expected = pd.array([1.0, np.nan], dtype="Float64")
+    tm.assert_extension_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("other", [0, 0.5])
@@ -198,11 +206,19 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
 
     result = op(s, other)
     expected = op(s.astype(float), other)
+    expected = expected.astype("Float64")
     # rfloordiv results in nan instead of inf
     if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20:
         # for numpy 1.20 https://github.com/numpy/numpy/pull/16161
         #  updated floordiv, now matches our behavior defined in core.ops
-        expected[(expected == np.inf) | (expected == -np.inf)] = np.nan
+        mask = (
+            ((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
+        )
+        expected.array._data[mask] = np.nan
+    # rmod results in NaN that wasn't NA in original nullable Series -> unmask it
+    elif all_arithmetic_operators == "__rmod__":
+        mask = (s == 0).fillna(False).to_numpy(bool)
+        expected.array._mask[mask] = False
 
     tm.assert_series_equal(result, expected)
 
@@ -215,7 +231,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other):
 
     s = pd.Series([1, 2, 3], dtype="Int64")
     result = op(s, other)
-    assert result.dtype is np.dtype("float")
+    assert result.dtype == "Float64"
 
 
 def test_cross_type_arithmetic():
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
@@ -127,6 +127,14 @@ def test_value_counts_empty():
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize():
+    # GH 33172
+    s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("min_count", [0, 4])
 def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype):
diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py
@@ -43,11 +43,7 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
     for scalar in [scalar, data.dtype.type(scalar)]:
         result = op(data, scalar)
         expected = op(data, scalar_array)
-        if isinstance(expected, ExtensionArray):
-            tm.assert_extension_array_equal(result, expected)
-        else:
-            # TODO div still gives float ndarray -> remove this once we have Float EA
-            tm.assert_numpy_array_equal(result, expected)
+        tm.assert_extension_array_equal(result, expected)
 
 
 def test_array_NA(data, all_arithmetic_operators):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -495,6 +495,18 @@ def test_value_counts_na(dtype, request):
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_with_normalize(dtype, request):
+    if dtype == "arrow_string":
+        reason = "TypeError: boolean value of NA is ambiguous"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
+    s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
+    result = s.value_counts(normalize=True)
+    expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "values, expected",
     [
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -130,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
             elif op_name in ("__truediv__", "__rtruediv__"):
                 # combine with bools does not generate the correct result
                 #  (numpy behaviour for div is to regard the bools as numeric)
-                expected = s.astype(float).combine(other, op)
+                expected = s.astype(float).combine(other, op).astype("Float64")
             if op_name == "__rpow__":
                 # for rpow, combine does not propagate NaN
                 expected[result.isna()] = np.nan
@@ -235,6 +235,10 @@ def test_searchsorted(self, data_for_sorting, as_series):
     def test_value_counts(self, all_data, dropna):
         return super().test_value_counts(all_data, dropna)
 
+    @pytest.mark.skip(reason="uses nullable integer")
+    def test_value_counts_with_normalize(self, data):
+        pass
+
     def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
         # override because there are only 2 unique values
 
diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py
@@ -184,6 +184,10 @@ def test_value_counts(self, all_data, dropna):
 
         self.assert_series_equal(result, expected)
 
+    @pytest.mark.skip(reason="uses nullable integer")
+    def test_value_counts_with_normalize(self, data):
+        pass
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
@@ -130,10 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
             expected = s.combine(other, op)
 
             if op_name in ("__rtruediv__", "__truediv__", "__div__"):
-                expected = expected.fillna(np.nan).astype(float)
-                if op_name == "__rtruediv__":
-                    # TODO reverse operators result in object dtype
-                    result = result.astype(float)
+                expected = expected.fillna(np.nan).astype("Float64")
             elif op_name.startswith("__r"):
                 # TODO reverse operators result in object dtype
                 # see https://github.com/pandas-dev/pandas/issues/22024
@@ -224,6 +221,10 @@ def test_value_counts(self, all_data, dropna):
 
         self.assert_series_equal(result, expected)
 
+    @pytest.mark.skip(reason="uses nullable integer")
+    def test_value_counts_with_normalize(self, data):
+        pass
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -118,6 +118,10 @@ class TestMethods(base.BaseMethodsTests):
     def test_value_counts(self, all_data, dropna):
         return super().test_value_counts(all_data, dropna)
 
+    @pytest.mark.skip(reason="returns nullable")
+    def test_value_counts_with_normalize(self, data):
+        pass
+
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
@@ -838,14 +838,8 @@ class TestInplaceOperations:
         (
             ("Int64", "Int64", "Int64", "Int64"),
             ("float", "float", "float", "float"),
-            ("Int64", "float", "float", "float"),
-            pytest.param(
-                "Int64",
-                "Float64",
-                "Float64",
-                "Float64",
-                marks=pytest.mark.xfail(reason="Not implemented yet"),
-            ),
+            ("Int64", "float", "Float64", "Float64"),
+            ("Int64", "Float64", "Float64", "Float64"),
         ),
     )
     def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul):