Skip to content

BUG: preserve nullable dtype for float result in IntegerArray/BooleanArray arithmetic ops #38178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ Alternatively, you can also use the dtype object:

pd.Series([1.5, None], dtype=pd.Float32Dtype())

Operations with the existing integer or boolean nullable data types that
give float results will now also use the nullable floating data types (:issue:`38178`).

.. warning::

Experimental: the new floating data types are currently experimental, and their
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
if (is_float_dtype(other) or is_float(other)) or (
op_name in ["rtruediv", "truediv"]
):
result[mask] = np.nan
return result
from pandas.core.arrays import FloatingArray

return FloatingArray(result, mask, copy=False)

if is_bool_dtype(result):
elif is_bool_dtype(result):
return BooleanArray(result, mask, copy=False)

elif is_integer_dtype(result):
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,13 +539,15 @@ def _cmp_method(self, other, op):
return BooleanArray(result, mask)

def _arith_method(self, other, op):
from pandas.core.arrays import FloatingArray

op_name = op.__name__
omask = None

if getattr(other, "ndim", 0) > 1:
raise NotImplementedError("can only perform ops with 1-d structures")

if isinstance(other, IntegerArray):
if isinstance(other, (IntegerArray, FloatingArray)):
other, omask = other._data, other._mask

elif is_list_like(other):
Expand Down Expand Up @@ -636,8 +638,9 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
if (is_float_dtype(other) or is_float(other)) or (
op_name in ["rtruediv", "truediv"]
):
result[mask] = np.nan
return result
from pandas.core.arrays import FloatingArray

return FloatingArray(result, mask, copy=False)

if result.dtype == "timedelta64[ns]":
from pandas.core.arrays import TimedeltaArray
Expand Down
13 changes: 8 additions & 5 deletions pandas/tests/arrays/boolean/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.arrays import FloatingArray


@pytest.fixture
Expand Down Expand Up @@ -51,13 +52,15 @@ def test_sub(left_array, right_array):


def test_div(left_array, right_array):
# for now division gives a float numpy array
result = left_array / right_array
expected = np.array(
[1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
dtype="float64",
expected = FloatingArray(
np.array(
[1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan],
dtype="float64",
),
np.array([False, False, True, False, False, True, True, True, True]),
)
tm.assert_numpy_array_equal(result, expected)
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize(
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/arrays/boolean/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ def test_value_counts_na():
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
s = pd.Series([True, False, pd.NA], dtype="boolean")
result = s.value_counts(normalize=True)
expected = pd.Series([1, 1], index=[False, True], dtype="Float64") / 2
tm.assert_series_equal(result, expected)


def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/arrays/floating/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ def test_value_counts_empty():
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
s = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = s.value_counts(normalize=True)
expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Float64") / 3
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
Expand Down
46 changes: 31 additions & 15 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import integer_array
from pandas.core.arrays import FloatingArray, integer_array
import pandas.core.ops as ops

# Basic test for the arithmetic array ops
Expand Down Expand Up @@ -45,24 +45,26 @@ def test_sub(dtype):


def test_div(dtype):
# for now division gives a float numpy array
a = pd.array([1, 2, 3, None, 5], dtype=dtype)
b = pd.array([0, 1, None, 3, 4], dtype=dtype)

result = a / b
expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64")
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(zero, negative):
# https://github.com/pandas-dev/pandas/issues/27398
a = pd.array([0, 1, -1, None], dtype="Int64")
result = a / zero
expected = np.array([np.nan, np.inf, -np.inf, np.nan])
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
np.array([False, False, False, True]),
)
if negative:
expected *= -1
tm.assert_numpy_array_equal(result, expected)
tm.assert_extension_array_equal(result, expected)


def test_floordiv(dtype):
Expand Down Expand Up @@ -99,8 +101,11 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)

result = a ** np.nan
expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
np.array([False, False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)

# reversed
a = a[1:] # Can't raise integers to negative powers.
Expand All @@ -118,8 +123,11 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)

result = np.nan ** a
expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
np.array([False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)


def test_pow_array():
Expand All @@ -133,10 +141,10 @@ def test_pow_array():
def test_rpow_one_to_na():
# https://github.com/pandas-dev/pandas/issues/22022
# https://github.com/pandas-dev/pandas/issues/29997
arr = integer_array([np.nan, np.nan])
arr = pd.array([np.nan, np.nan], dtype="Int64")
result = np.array([1.0, 2.0]) ** arr
expected = np.array([1.0, np.nan])
tm.assert_numpy_array_equal(result, expected)
expected = pd.array([1.0, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("other", [0, 0.5])
Expand Down Expand Up @@ -198,11 +206,19 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):

result = op(s, other)
expected = op(s.astype(float), other)
expected = expected.astype("Float64")
# rfloordiv results in nan instead of inf
if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20:
# for numpy 1.20 https://github.com/numpy/numpy/pull/16161
# updated floordiv, now matches our behavior defined in core.ops
expected[(expected == np.inf) | (expected == -np.inf)] = np.nan
mask = (
((expected == np.inf) | (expected == -np.inf)).fillna(False).to_numpy(bool)
)
expected.array._data[mask] = np.nan
# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
elif all_arithmetic_operators == "__rmod__":
mask = (s == 0).fillna(False).to_numpy(bool)
expected.array._mask[mask] = False

tm.assert_series_equal(result, expected)

Expand All @@ -215,7 +231,7 @@ def test_arithmetic_conversion(all_arithmetic_operators, other):

s = pd.Series([1, 2, 3], dtype="Int64")
result = op(s, other)
assert result.dtype is np.dtype("float")
assert result.dtype == "Float64"


def test_cross_type_arithmetic():
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ def test_value_counts_empty():
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize():
# GH 33172
s = pd.Series([1, 2, 1, pd.NA], dtype="Int64")
result = s.value_counts(normalize=True)
expected = pd.Series([2, 1], index=[1, 2], dtype="Float64") / 3
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype):
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/arrays/masked/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,7 @@ def test_array_scalar_like_equivalence(data, all_arithmetic_operators):
for scalar in [scalar, data.dtype.type(scalar)]:
result = op(data, scalar)
expected = op(data, scalar_array)
if isinstance(expected, ExtensionArray):
tm.assert_extension_array_equal(result, expected)
else:
# TODO div still gives float ndarray -> remove this once we have Float EA
tm.assert_numpy_array_equal(result, expected)
tm.assert_extension_array_equal(result, expected)


def test_array_NA(data, all_arithmetic_operators):
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,18 @@ def test_value_counts_na(dtype, request):
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype, request):
if dtype == "arrow_string":
reason = "TypeError: boolean value of NA is ambiguous"
mark = pytest.mark.xfail(reason=reason)
request.node.add_marker(mark)

s = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = s.value_counts(normalize=True)
expected = pd.Series([2, 1], index=["a", "b"], dtype="Float64") / 3
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"values, expected",
[
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
elif op_name in ("__truediv__", "__rtruediv__"):
# combine with bools does not generate the correct result
# (numpy behaviour for div is to regard the bools as numeric)
expected = s.astype(float).combine(other, op)
expected = s.astype(float).combine(other, op).astype("Float64")
if op_name == "__rpow__":
# for rpow, combine does not propagate NaN
expected[result.isna()] = np.nan
Expand Down Expand Up @@ -235,6 +235,10 @@ def test_searchsorted(self, data_for_sorting, as_series):
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
    """Intentionally empty: always skipped, for the reason in the decorator."""

def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
# override because there are only 2 unique values

Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ def test_value_counts(self, all_data, dropna):

self.assert_series_equal(result, expected)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
    """Intentionally empty: always skipped, for the reason in the decorator."""


class TestCasting(base.BaseCastingTests):
pass
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
expected = s.combine(other, op)

if op_name in ("__rtruediv__", "__truediv__", "__div__"):
expected = expected.fillna(np.nan).astype(float)
if op_name == "__rtruediv__":
# TODO reverse operators result in object dtype
result = result.astype(float)
expected = expected.fillna(np.nan).astype("Float64")
elif op_name.startswith("__r"):
# TODO reverse operators result in object dtype
# see https://github.com/pandas-dev/pandas/issues/22024
Expand Down Expand Up @@ -224,6 +221,10 @@ def test_value_counts(self, all_data, dropna):

self.assert_series_equal(result, expected)

@pytest.mark.skip(reason="uses nullable integer")
def test_value_counts_with_normalize(self, data):
    """Intentionally empty: always skipped, for the reason in the decorator."""


class TestCasting(base.BaseCastingTests):
pass
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ class TestMethods(base.BaseMethodsTests):
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)

@pytest.mark.skip(reason="returns nullable")
def test_value_counts_with_normalize(self, data):
    """Intentionally empty: always skipped, for the reason in the decorator."""


class TestCasting(base.BaseCastingTests):
pass
Expand Down
10 changes: 2 additions & 8 deletions pandas/tests/series/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,14 +838,8 @@ class TestInplaceOperations:
(
("Int64", "Int64", "Int64", "Int64"),
("float", "float", "float", "float"),
("Int64", "float", "float", "float"),
pytest.param(
"Int64",
"Float64",
"Float64",
"Float64",
marks=pytest.mark.xfail(reason="Not implemented yet"),
),
("Int64", "float", "Float64", "Float64"),
("Int64", "Float64", "Float64", "Float64"),
),
)
def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul):
Expand Down