Skip to content

BUG: get_indexer_non_unique with np.datetime64("NaT") and np.timedelta64("NaT") #43870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
09183e8
CLN: removing x and replacing with starget
alexhlim Oct 2, 2021
42070ec
Merge branch 'master' into nui-fixes
alexhlim Oct 2, 2021
c6bbdc1
Merge branch 'master' into nui-fixes
alexhlim Oct 3, 2021
62d5335
TST: adding NaT non unique tests
alexhlim Oct 3, 2021
8271d58
BUG: check np.datetime64('NaT') and np.timedelta64('NaT') in get_inde…
alexhlim Oct 4, 2021
da2dc44
DOC: adding whatsnew
alexhlim Oct 4, 2021
801d8da
CLN: putting parens around each condition
alexhlim Oct 7, 2021
00e0f68
CLN: refactor with is_dt64nat and istd64nat
alexhlim Oct 7, 2021
da54caa
TST: separate tests + use np nat fixtures
alexhlim Oct 8, 2021
566e096
CLN: short circuit np nat check for object dtype
alexhlim Oct 8, 2021
d20027d
CLN: reverting starget change for another PR
alexhlim Oct 8, 2021
299a45a
CLN: fixing docstring
alexhlim Oct 10, 2021
8878edf
CLN: fixing np_nat_fixture2 comment
alexhlim Oct 10, 2021
c4acbda
CLN: forgot to undo this line for stargets
alexhlim Oct 10, 2021
5c94e8b
DOC: TODO for np nats
alexhlim Oct 11, 2021
af33f5f
DOC: adding comments
alexhlim Oct 11, 2021
1490968
TST: ensure numpy doesn't downcast nats
alexhlim Oct 14, 2021
a5db551
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 14, 2021
77216f7
CLN: reverting back to original is_matching_na check + adding time un…
alexhlim Oct 18, 2021
6b4179e
TST: updating np_nat_objects fixtures with date units
alexhlim Oct 18, 2021
94e7add
CLN: consolidating all object na checks into one
alexhlim Oct 18, 2021
da4300c
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 18, 2021
0d1e260
CLN: fixing typo in comment + check match target
alexhlim Oct 18, 2021
802c261
CLN: condensing np_nat_objects fixture
alexhlim Oct 18, 2021
14e0868
CLN: fixing comment typos
alexhlim Oct 18, 2021
14e5c0d
TST: added matching-but-not-identical for Decimal(NaN)
alexhlim Oct 18, 2021
54aa23d
TST: edge case with np.nan before float(NaN) while searching for np.nan
alexhlim Oct 18, 2021
551c0b1
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 18, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,7 @@ Indexing
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)
-

Missing
Expand Down
63 changes: 44 additions & 19 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -315,14 +315,14 @@ cdef class IndexEngine:
missing : np.ndarray[np.intp]
"""
cdef:
ndarray values, x
ndarray values
ndarray[intp_t] result, missing
set stargets, remaining_stargets
set stargets, remaining_stargets, found_nas
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end
bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True
bint check_na_values = False

values = self.values
stargets = set(targets)
Expand Down Expand Up @@ -357,33 +357,58 @@ cdef class IndexEngine:
if stargets:
# otherwise, map by iterating through all items in the index

# short-circuit na check
if values.dtype == object:
check_na_values = True
# keep track of nas in values
found_nas = set()

for i in range(n):
val = values[i]

# GH#43870
# handle lookup for nas
# (i.e. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
if check_na_values and checknull(val):
match = [na for na in found_nas if is_matching_na(val, na)]

# matching na not found
if not len(match):
found_nas.add(val)

# add na to stargets to utilize `in` for stargets/d lookup
match_stargets = [
x for x in stargets if is_matching_na(val, x)
]

if len(match_stargets):
# add our 'standardized' na
stargets.add(val)

# matching na found
else:
assert len(match) == 1
val = match[0]

if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)

elif util.is_nan(val):
# GH#35392
if need_nan_check:
# Do this check only once
stargets_has_nan = any(util.is_nan(val) for x in stargets)
need_nan_check = False

if stargets_has_nan:
if not d_has_nan:
# use a canonical nan object
d[np.nan] = []
d_has_nan = True
d[np.nan].append(i)

for i in range(n_t):
val = targets[i]

# ensure there are nas in values before looking for a matching na
if check_na_values and checknull(val):
match = [na for na in found_nas if is_matching_na(val, na)]
if len(match):
assert len(match) == 1
val = match[0]

# found
if val in d or (d_has_nan and util.is_nan(val)):
key = val if not util.is_nan(val) else np.nan
if val in d:
key = val

for j in d[key]:

# realloc if needed
Expand Down
13 changes: 7 additions & 6 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ from pandas._libs cimport util
from pandas._libs.tslibs.nattype cimport (
c_NaT as NaT,
checknull_with_nat,
is_dt64nat,
is_null_datetimelike,
is_td64nat,
)
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_unit,
get_datetime64_value,
get_timedelta64_value,
)
Expand Down Expand Up @@ -82,12 +85,14 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False
get_datetime64_value(left) == NPY_NAT
and util.is_datetime64_object(right)
and get_datetime64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif util.is_timedelta64_object(left):
return (
get_timedelta64_value(left) == NPY_NAT
and util.is_timedelta64_object(right)
and get_timedelta64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif is_decimal_na(left):
return is_decimal_na(right)
Expand Down Expand Up @@ -345,20 +350,16 @@ def isneginf_scalar(val: object) -> bool:
cdef inline bint is_null_datetime64(v):
# determine if we have a null for a datetime (or integer versions),
# excluding np.timedelta64('nat')
if checknull_with_nat(v):
if checknull_with_nat(v) or is_dt64nat(v):
return True
elif util.is_datetime64_object(v):
return get_datetime64_value(v) == NPY_NAT
return False


cdef inline bint is_null_timedelta64(v):
# determine if we have a null for a timedelta (or integer versions),
# excluding np.datetime64('nat')
if checknull_with_nat(v):
if checknull_with_nat(v) or is_td64nat(v):
return True
elif util.is_timedelta64_object(v):
return get_timedelta64_value(v) == NPY_NAT
return False


Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/nattype.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ cdef _NaT c_NaT


cdef bint checknull_with_nat(object val)
cdef bint is_dt64nat(object val)
cdef bint is_td64nat(object val)
cpdef bint is_null_datetimelike(object val, bint inat_is_null=*)
16 changes: 16 additions & 0 deletions pandas/_libs/tslibs/nattype.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,22 @@ cdef inline bint checknull_with_nat(object val):
"""
return val is None or util.is_nan(val) or val is c_NaT

cdef inline bint is_dt64nat(object val):
    """
    Check whether ``val`` is a ``np.datetime64`` scalar holding NaT.

    Anything that is not a ``np.datetime64`` object yields False.
    """
    # short-circuit: only unbox the value once val is known to be datetime64
    return util.is_datetime64_object(val) and get_datetime64_value(val) == NPY_NAT

cdef inline bint is_td64nat(object val):
    """
    Check whether ``val`` is a ``np.timedelta64`` scalar holding NaT.

    Anything that is not a ``np.timedelta64`` object yields False.
    """
    # short-circuit: only unbox the value once val is known to be timedelta64
    return util.is_timedelta64_object(val) and get_timedelta64_value(val) == NPY_NAT


cpdef bint is_null_datetimelike(object val, bint inat_is_null=True):
"""
Expand Down
19 changes: 19 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,25 @@
)

NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]
# One NaT scalar per (numpy type, time unit) pair: all datetime64 units first,
# followed by all timedelta64 units, each running from years to attoseconds.
NP_NAT_OBJECTS = [
    nat_type("NaT", unit)
    for nat_type in (np.datetime64, np.timedelta64)
    for unit in (
        "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "ns", "ps", "fs", "as"
    )
]

EMPTY_STRING_PATTERN = re.compile("^$")

Expand Down
13 changes: 13 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,19 @@ def unique_nulls_fixture(request):
# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture


@pytest.fixture(params=tm.NP_NAT_OBJECTS, ids=lambda x: type(x).__name__)
def np_nat_fixture(request):
    """
    Fixture for each NaT type in numpy.

    Parametrized over every entry of ``tm.NP_NAT_OBJECTS``; the test id of each
    parameter is the class name of that entry.
    """
    return request.param


# Generate cartesian product of np_nat_fixture:
# the alias lets a single test request two independently parametrized NaTs.
np_nat_fixture2 = np_nat_fixture


# ----------------------------------------------------------------
# Classes
# ----------------------------------------------------------------
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from decimal import Decimal

import numpy as np
import pytest

Expand Down Expand Up @@ -90,12 +92,63 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture):
# matching-but-not-identical nans
if is_matching_na(nulls_fixture, float("NaN")):
index = Index(["a", float("NaN"), "b", float("NaN")])
match_but_not_identical = True
elif is_matching_na(nulls_fixture, Decimal("NaN")):
index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")])
match_but_not_identical = True
else:
match_but_not_identical = False

if match_but_not_identical:
indexer, missing = index.get_indexer_non_unique([nulls_fixture])

expected_indexer = np.array([1, 3], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

@pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning")
def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):
    """
    Look up a numpy NaT in a non-unique object Index holding two numpy NaTs.

    When the two fixtures match, both positions must be returned even though
    the index stores distinct objects; otherwise only the positions holding
    ``np_nat_fixture`` itself must be returned. Nothing is ever missing.
    """
    if is_matching_na(np_nat_fixture, np_nat_fixture2):
        # matching-but-not-identical nats: copy to ensure they are
        # different objects inside the index
        values = ["2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy()]
        expected_indexer = np.array([1, 2], dtype=np.intp)
    else:
        # non-matching nats (e.g. dt64nat vs td64nat): only the entries of
        # np_nat_fixture should be found
        values = [
            "2021-10-02",
            np_nat_fixture,
            np_nat_fixture2,
            np_nat_fixture,
            np_nat_fixture2,
        ]
        expected_indexer = np.array([1, 3], dtype=np.intp)

    index = Index(np.array(values, dtype=object), dtype=object)

    # pass the target as an object Index to prevent it from being cast
    # to DatetimeIndex
    target = Index([np_nat_fixture], dtype=object)
    indexer, missing = index.get_indexer_non_unique(target)

    tm.assert_numpy_array_equal(indexer, expected_indexer)
    tm.assert_numpy_array_equal(missing, np.array([], dtype=np.intp))


class TestSliceLocs:
@pytest.mark.parametrize(
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/indexes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,11 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
np.array([1, 2], dtype=np.intp),
),
(["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)),
(
np.array(["b", np.nan, float("NaN"), "b"], dtype=object),
Index([np.nan], dtype=object),
np.array([1, 2], dtype=np.intp),
),
],
)
def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
Expand Down