Skip to content

ENH: recognize Decimal("NaN") in pd.isna #39409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ Missing
^^^^^^^

- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`)
-
- Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`)
-

MultiIndex
Expand Down
18 changes: 17 additions & 1 deletion pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from decimal import Decimal
import numbers

import cython
Expand Down Expand Up @@ -36,6 +37,8 @@ cdef:

bint is_32bit = not IS64

type cDecimal = Decimal # for faster isinstance checks


cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
"""
Expand Down Expand Up @@ -86,6 +89,8 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False
and util.is_timedelta64_object(right)
and get_timedelta64_value(right) == NPY_NAT
)
elif is_decimal_na(left):
return is_decimal_na(right)
return False


Expand Down Expand Up @@ -113,7 +118,18 @@ cpdef bint checknull(object val):
The difference between `checknull` and `checknull_old` is that `checknull`
does *not* consider INF or NEGINF to be NA.
"""
return val is C_NA or is_null_datetimelike(val, inat_is_null=False)
return (
val is C_NA
or is_null_datetimelike(val, inat_is_null=False)
or is_decimal_na(val)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the list of what is considered null in docstring maybe could be updated.

also for consistency when using pandas.options.mode.use_inf_as_na, what about checknull_old?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the list of what is considered null in docstring maybe could be updated.

just added this to my next "collected misc" branch

also for consistency when using pandas.options.mode.use_inf_as_na, what about checknull_old?

i guess you're referring to Decimal("inf")? my inclination is to let that sleeping dog lie

)


cdef inline bint is_decimal_na(object val):
    """
    Check whether `val` is a decimal.Decimal NaN, e.g. Decimal("NaN").

    Uses the NaN property that a NaN never compares equal to itself
    (val != val).  `cDecimal` is a module-level alias of decimal.Decimal
    kept for faster isinstance checks.
    """
    return isinstance(val, cDecimal) and val != val


cpdef bint checknull_old(object val):
Expand Down
3 changes: 2 additions & 1 deletion pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import collections
from datetime import datetime
from decimal import Decimal
from functools import wraps
import operator
import os
Expand Down Expand Up @@ -146,7 +147,7 @@
+ BYTES_DTYPES
)

NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA]
NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]

EMPTY_STRING_PATTERN = re.compile("^$")

Expand Down
19 changes: 3 additions & 16 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np

from pandas._libs.lib import no_default
from pandas._libs.missing import is_matching_na
import pandas._libs.testing as _testing

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -457,22 +458,8 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):

if left_attr is right_attr:
return True
elif (
is_number(left_attr)
and np.isnan(left_attr)
and is_number(right_attr)
and np.isnan(right_attr)
):
# np.nan
return True
elif (
isinstance(left_attr, (np.datetime64, np.timedelta64))
and isinstance(right_attr, (np.datetime64, np.timedelta64))
and type(left_attr) is type(right_attr)
and np.isnat(left_attr)
and np.isnat(right_attr)
):
# np.datetime64("nat") or np.timedelta64("nat")
elif is_matching_na(left_attr, right_attr):
# e.g. both np.nan, both NaT, both pd.NA, ...
return True

try:
Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def nselect_method(request):
# ----------------------------------------------------------------
# Missing values & co.
# ----------------------------------------------------------------
@pytest.fixture(params=tm.NULL_OBJECTS, ids=str)
@pytest.fixture(params=tm.NULL_OBJECTS, ids=lambda x: type(x).__name__)
def nulls_fixture(request):
"""
Fixture for each null type in pandas.
Expand Down
19 changes: 12 additions & 7 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
missing types & inference
"""
from decimal import Decimal
from functools import partial

import numpy as np
Expand Down Expand Up @@ -610,20 +611,24 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
"""
if not lib.is_scalar(obj) or not isna(obj):
return False
if dtype.kind == "M":
elif dtype.kind == "M":
if isinstance(dtype, np.dtype):
# i.e. not tzaware
return not isinstance(obj, np.timedelta64)
return not isinstance(obj, (np.timedelta64, Decimal))
# we have to rule out tznaive dt64("NaT")
return not isinstance(obj, (np.timedelta64, np.datetime64))
if dtype.kind == "m":
return not isinstance(obj, np.datetime64)
if dtype.kind in ["i", "u", "f", "c"]:
return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
elif dtype.kind == "m":
return not isinstance(obj, (np.datetime64, Decimal))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

elif dtype.kind in ["i", "u", "f", "c"]:
# Numeric
return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64))

elif dtype == np.dtype(object):
# This is needed for Categorical, but is kind of weird
return True

# must be PeriodDType
return not isinstance(obj, (np.datetime64, np.timedelta64))
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))


def isna_all(arr: ArrayLike) -> bool:
Expand Down
16 changes: 15 additions & 1 deletion pandas/tests/dtypes/cast/test_promote.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import datetime
from decimal import Decimal

import numpy as np
import pytest
Expand Down Expand Up @@ -538,7 +539,20 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, nulls_fi
fill_value = nulls_fixture
dtype = np.dtype(any_numpy_dtype_reduced)

if is_integer_dtype(dtype) and fill_value is not NaT:
if isinstance(fill_value, Decimal):
# Subject to change, but ATM (When Decimal(NAN) is being added to nulls_fixture)
# this is the existing behavior in maybe_promote,
# hinges on is_valid_na_for_dtype
if dtype.kind in ["i", "u", "f", "c"]:
if dtype.kind in ["i", "u"]:
expected_dtype = np.dtype(np.float64)
else:
expected_dtype = dtype
exp_val_for_scalar = np.nan
else:
expected_dtype = np.dtype(object)
exp_val_for_scalar = fill_value
elif is_integer_dtype(dtype) and fill_value is not NaT:
# integer + other missing value (np.nan / None) casts to float
expected_dtype = np.float64
exp_val_for_scalar = np.nan
Expand Down
61 changes: 48 additions & 13 deletions pandas/tests/dtypes/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,43 @@ def test_period(self):
tm.assert_series_equal(isna(s), exp)
tm.assert_series_equal(notna(s), ~exp)

def test_decimal(self):
# scalars GH#23530
a = Decimal(1.0)
assert pd.isna(a) is False
assert pd.notna(a) is True

b = Decimal("NaN")
assert pd.isna(b) is True
assert pd.notna(b) is False

# array
arr = np.array([a, b])
expected = np.array([False, True])
result = pd.isna(arr)
tm.assert_numpy_array_equal(result, expected)

result = pd.notna(arr)
tm.assert_numpy_array_equal(result, ~expected)

# series
ser = Series(arr)
expected = Series(expected)
result = pd.isna(ser)
tm.assert_series_equal(result, expected)

result = pd.notna(ser)
tm.assert_series_equal(result, ~expected)

# index
idx = pd.Index(arr)
expected = np.array([False, True])
result = pd.isna(idx)
tm.assert_numpy_array_equal(result, expected)

result = pd.notna(idx)
tm.assert_numpy_array_equal(result, ~expected)


@pytest.mark.parametrize("dtype_equal", [True, False])
def test_array_equivalent(dtype_equal):
Expand Down Expand Up @@ -619,24 +656,22 @@ def test_empty_like(self):


class TestLibMissing:
def test_checknull(self):
for value in na_vals:
assert libmissing.checknull(value)
@pytest.mark.parametrize("func", [libmissing.checknull, isna])
def test_checknull(self, func):
for value in na_vals + sometimes_na_vals:
assert func(value)

for value in inf_vals:
assert not libmissing.checknull(value)
assert not func(value)

for value in int_na_vals:
assert not libmissing.checknull(value)

for value in sometimes_na_vals:
assert not libmissing.checknull(value)
assert not func(value)

for value in never_na_vals:
assert not libmissing.checknull(value)
assert not func(value)

def test_checknull_old(self):
for value in na_vals:
for value in na_vals + sometimes_na_vals:
assert libmissing.checknull_old(value)

for value in inf_vals:
Expand All @@ -645,9 +680,6 @@ def test_checknull_old(self):
for value in int_na_vals:
assert not libmissing.checknull_old(value)

for value in sometimes_na_vals:
assert not libmissing.checknull_old(value)

for value in never_na_vals:
assert not libmissing.checknull_old(value)

Expand Down Expand Up @@ -682,6 +714,9 @@ def test_is_matching_na(self, nulls_fixture, nulls_fixture2):
elif is_float(left) and is_float(right):
# np.nan vs float("NaN") we consider as matching
assert libmissing.is_matching_na(left, right)
elif type(left) is type(right):
# e.g. both Decimal("NaN")
assert libmissing.is_matching_na(left, right)
else:
assert not libmissing.is_matching_na(left, right)

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/extension/base/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def test_contains(self, data, data_missing):

# the data can never contain other nan-likes than na_value
for na_value_obj in tm.NULL_OBJECTS:
if na_value_obj is na_value:
if na_value_obj is na_value or type(na_value_obj) == type(na_value):
# type check for e.g. two instances of Decimal("NAN")
continue
assert na_value_obj not in data
assert na_value_obj not in data_missing
Expand Down
13 changes: 0 additions & 13 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,6 @@ class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests):
class TestMethods(BaseDecimal, base.BaseMethodsTests):
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
if any(x != x for x in all_data):
mark = pytest.mark.xfail(
reason="tm.assert_series_equal incorrectly raises",
raises=AssertionError,
)
request.node.add_marker(mark)

all_data = all_data[:10]
if dropna:
other = np.array(all_data[~all_data.isna()])
Expand Down Expand Up @@ -212,12 +205,6 @@ class TestCasting(BaseDecimal, base.BaseCastingTests):


class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
def test_groupby_apply_identity(self, data_for_grouping, request):
if any(x != x for x in data_for_grouping):
mark = pytest.mark.xfail(reason="tm.assert_series_equal raises incorrectly")
request.node.add_marker(mark)
super().test_groupby_apply_identity(data_for_grouping)

def test_groupby_agg_extension(self, data_for_grouping):
super().test_groupby_agg_extension(data_for_grouping)

Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
Tests for the Index constructor conducting inference.
"""
from decimal import Decimal

import numpy as np
import pytest

Expand Down Expand Up @@ -89,6 +91,10 @@ def test_constructor_infer_periodindex(self):
def test_constructor_infer_nat_dt_like(
self, pos, klass, dtype, ctor, nulls_fixture, request
):
if isinstance(nulls_fixture, Decimal):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe should have a nulls_fixture_compatible_datetimelike ?

# We dont cast these to datetime64/timedelta64
return

expected = klass([NaT, NaT])
assert expected.dtype == dtype
data = [ctor]
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/indexes/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,14 @@ def test_numeric_compat(self):
def test_insert_na(self, nulls_fixture):
# GH 18295 (test missing)
index = self.create_index()
na_val = nulls_fixture

if nulls_fixture is pd.NaT:
if na_val is pd.NaT:
expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object)
else:
expected = Float64Index([index[0], np.nan] + list(index[1:]))
result = index.insert(1, nulls_fixture)

result = index.insert(1, na_val)
tm.assert_index_equal(result, expected)


Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
from datetime import timedelta
from decimal import Decimal
from io import StringIO
import json
import os
Expand Down Expand Up @@ -1742,8 +1743,12 @@ def test_json_pandas_na(self):
result = DataFrame([[pd.NA]]).to_json()
assert result == '{"0":{"0":null}}'

def test_json_pandas_nulls(self, nulls_fixture):
def test_json_pandas_nulls(self, nulls_fixture, request):
# GH 31615
if isinstance(nulls_fixture, Decimal):
mark = pytest.mark.xfail(reason="not implemented")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xref #28609 cc @WillAyd

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you wanted to fix this here it would just require adding the same condition in the ujson code that we have for checking floats.

if (npy_isnan(val) || npy_isinf(val)) {

The decimal check is only a few branches below that

request.node.add_marker(mark)

result = DataFrame([[nulls_fixture]]).to_json()
assert result == '{"0":{"0":null}}'

Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
datetime,
timedelta,
)
from decimal import Decimal
import locale

from dateutil.parser import parse
Expand Down Expand Up @@ -2446,9 +2447,15 @@ def test_nullable_integer_to_datetime():

@pytest.mark.parametrize("klass", [np.array, list])
def test_na_to_datetime(nulls_fixture, klass):
    na_val = nulls_fixture
    data = klass([na_val])

    if isinstance(na_val, Decimal):
        # Decimal("NaN") is not a datetime-like missing value, so
        # to_datetime refuses to coerce it
        with pytest.raises(TypeError, match="not convertible to datetime"):
            pd.to_datetime(data)
    else:
        # every other null coerces to NaT
        result = pd.to_datetime(data)
        assert result[0] is pd.NaT


def test_empty_string_datetime_coerce__format():
Expand Down
Loading