Skip to content

BUG: get_indexer_non_unique with np.datetime64("NaT") and np.timedelta64("NaT") #43870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
09183e8
CLN: removing x and replacing with starget
alexhlim Oct 2, 2021
42070ec
Merge branch 'master' into nui-fixes
alexhlim Oct 2, 2021
c6bbdc1
Merge branch 'master' into nui-fixes
alexhlim Oct 3, 2021
62d5335
TST: adding NaT non unique tests
alexhlim Oct 3, 2021
8271d58
BUG: check np.datetime64('NaT') and np.timedelta64('NaT') in get_inde…
alexhlim Oct 4, 2021
da2dc44
DOC: adding whatsnew
alexhlim Oct 4, 2021
801d8da
CLN: putting parens around each condition
alexhlim Oct 7, 2021
00e0f68
CLN: refactor with is_dt64nat and istd64nat
alexhlim Oct 7, 2021
da54caa
TST: separate tests + use np nat fixtures
alexhlim Oct 8, 2021
566e096
CLN: short circuit np nat check for object dtype
alexhlim Oct 8, 2021
d20027d
CLN: reverting starget change for another PR
alexhlim Oct 8, 2021
299a45a
CLN: fixing docstring
alexhlim Oct 10, 2021
8878edf
CLN: fixing np_nat_fixture2 comment
alexhlim Oct 10, 2021
c4acbda
CLN: forgot to undo this line for stargets
alexhlim Oct 10, 2021
5c94e8b
DOC: TODO for np nats
alexhlim Oct 11, 2021
af33f5f
DOC: adding comments
alexhlim Oct 11, 2021
1490968
TST: ensure numpy doesn't downcast nats
alexhlim Oct 14, 2021
a5db551
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 14, 2021
77216f7
CLN: reverting back to original is_matching_na check + adding time un…
alexhlim Oct 18, 2021
6b4179e
TST: updating np_nat_objects fixtures with date units
alexhlim Oct 18, 2021
94e7add
CLN: consolidating all object na checks into one
alexhlim Oct 18, 2021
da4300c
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 18, 2021
0d1e260
CLN: fixing typo in comment + check match target
alexhlim Oct 18, 2021
802c261
CLN: condensing np_nat_objects fixture
alexhlim Oct 18, 2021
14e0868
CLN: fixing comment typos
alexhlim Oct 18, 2021
14e5c0d
TST: added matching-but-not-identical for Decimal(NaN)
alexhlim Oct 18, 2021
54aa23d
TST: edge case with np.nan before float(NaN) while searching for np.nan
alexhlim Oct 18, 2021
551c0b1
Merge branch 'master' into nui-dt64nat-td64nat
alexhlim Oct 18, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,7 @@ Indexing
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)
-

Missing
Expand Down
70 changes: 65 additions & 5 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ cnp.import_array()

from pandas._libs cimport util
from pandas._libs.hashtable cimport HashTable
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
from pandas._libs.tslibs.nattype cimport (
c_NaT as NaT,
is_dt64nat,
is_td64nat,
)
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp
Expand Down Expand Up @@ -319,10 +323,14 @@ cdef class IndexEngine:
ndarray[intp_t] result, missing
set stargets, remaining_stargets
dict d = {}
object val
object val, dt64nat, td64nat
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end
bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True
d_has_dt64nat = False, stargets_has_dt64nat = False,
need_dt64nat_check = True, d_has_td64nat = False,
stargets_has_td64nat = False, need_td64nat_check = True,
need_np_nat_check = False

values = self.values
stargets = set(targets)
Expand Down Expand Up @@ -357,6 +365,11 @@ cdef class IndexEngine:
if stargets:
# otherwise, map by iterating through all items in the index

# determine if we need to check for numpy nats,
# i.e. np.datetime64("NaT") and np.timedelta64("NaT")
if values.dtype == object:
need_np_nat_check = True

for i in range(n):
val = values[i]
if val in stargets:
Expand All @@ -378,12 +391,59 @@ cdef class IndexEngine:
d_has_nan = True
d[np.nan].append(i)

elif need_np_nat_check:
if is_dt64nat(val):
if need_dt64nat_check:
# Do this check only once
stargets_has_dt64nat = any(
is_dt64nat(starget) for starget in stargets
)
need_dt64nat_check = False

if stargets_has_dt64nat:
if not d_has_dt64nat:
# store to ensure future access to `d` uses same key
dt64nat = np.datetime64("NaT")
d[dt64nat] = []
d_has_dt64nat = True
d[dt64nat].append(i)

elif is_td64nat(val):
if need_td64nat_check:
# Do this check only once
stargets_has_td64nat = any(
is_td64nat(starget) for starget in stargets
)
need_td64nat_check = False

if stargets_has_td64nat:
if not d_has_td64nat:
# store to ensure future access to `d` uses same key
td64nat = np.timedelta64("NaT")
d[td64nat] = []
d_has_td64nat = True
d[td64nat].append(i)

for i in range(n_t):
val = targets[i]

# found
if val in d or (d_has_nan and util.is_nan(val)):
key = val if not util.is_nan(val) else np.nan
# cannot search for nan/nat target using `in`,
# need to lookup key using d_has_...
# and confirm na type via util function
if (
val in d
or (d_has_nan and util.is_nan(val))
or (d_has_dt64nat and is_dt64nat(val))
or (d_has_td64nat and is_td64nat(val))
):
key = val
if d_has_nan and util.is_nan(key):
key = np.nan
elif d_has_dt64nat and is_dt64nat(key):
key = dt64nat
elif d_has_td64nat and is_td64nat(key):
key = td64nat

for j in d[key]:

# realloc if needed
Expand Down
26 changes: 8 additions & 18 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ from pandas._libs cimport util
from pandas._libs.tslibs.nattype cimport (
c_NaT as NaT,
checknull_with_nat,
is_dt64nat,
is_null_datetimelike,
is_td64nat,
)
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_value,
Expand Down Expand Up @@ -77,18 +79,10 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False
and util.is_complex_object(right)
and util.is_nan(right)
)
elif util.is_datetime64_object(left):
return (
get_datetime64_value(left) == NPY_NAT
and util.is_datetime64_object(right)
and get_datetime64_value(right) == NPY_NAT
)
elif util.is_timedelta64_object(left):
return (
get_timedelta64_value(left) == NPY_NAT
and util.is_timedelta64_object(right)
and get_timedelta64_value(right) == NPY_NAT
)
elif is_dt64nat(left):
return is_dt64nat(right)
elif is_td64nat(left):
return is_td64nat(right)
elif is_decimal_na(left):
return is_decimal_na(right)
return False
Expand Down Expand Up @@ -345,20 +339,16 @@ def isneginf_scalar(val: object) -> bool:
cdef inline bint is_null_datetime64(v):
# determine if we have a null for a datetime (or integer versions),
# excluding np.timedelta64('nat')
if checknull_with_nat(v):
if checknull_with_nat(v) or is_dt64nat(v):
return True
elif util.is_datetime64_object(v):
return get_datetime64_value(v) == NPY_NAT
return False


cdef inline bint is_null_timedelta64(v):
# determine if we have a null for a timedelta (or integer versions),
# excluding np.datetime64('nat')
if checknull_with_nat(v):
if checknull_with_nat(v) or is_td64nat(v):
return True
elif util.is_timedelta64_object(v):
return get_timedelta64_value(v) == NPY_NAT
return False


Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/nattype.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ cdef _NaT c_NaT


cdef bint checknull_with_nat(object val)
cdef bint is_dt64nat(object val)
cdef bint is_td64nat(object val)
cpdef bint is_null_datetimelike(object val, bint inat_is_null=*)
16 changes: 16 additions & 0 deletions pandas/_libs/tslibs/nattype.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,22 @@ cdef inline bint checknull_with_nat(object val):
"""
return val is None or util.is_nan(val) or val is c_NaT

cdef inline bint is_dt64nat(object val):
    """
    Check whether ``val`` is a ``np.datetime64`` scalar holding NaT.

    Returns True only when ``val`` is a datetime64 object and the value
    obtained via ``get_datetime64_value`` equals ``NPY_NAT``; anything
    else yields False.
    """
    # Guard first: only datetime64 objects may be unpacked below.
    if not util.is_datetime64_object(val):
        return False
    return get_datetime64_value(val) == NPY_NAT

cdef inline bint is_td64nat(object val):
    """
    Check whether ``val`` is a ``np.timedelta64`` scalar holding NaT.

    Returns True only when ``val`` is a timedelta64 object and the value
    obtained via ``get_timedelta64_value`` equals ``NPY_NAT``; anything
    else yields False.
    """
    # Guard first: only timedelta64 objects may be unpacked below.
    if not util.is_timedelta64_object(val):
        return False
    return get_timedelta64_value(val) == NPY_NAT


cpdef bint is_null_datetimelike(object val, bint inat_is_null=True):
"""
Expand Down
1 change: 1 addition & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
)

NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]
NP_NAT_OBJECTS = [np.datetime64("NaT"), np.timedelta64("NaT")]

EMPTY_STRING_PATTERN = re.compile("^$")

Expand Down
13 changes: 13 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,19 @@ def unique_nulls_fixture(request):
# Generate cartesian product of unique_nulls_fixture:
unique_nulls_fixture2 = unique_nulls_fixture


@pytest.fixture(params=tm.NP_NAT_OBJECTS, ids=lambda obj: type(obj).__name__)
def np_nat_fixture(request):
    """
    Parametrized fixture providing each entry of ``tm.NP_NAT_OBJECTS``.

    The test id of each parameter is the name of its type.
    """
    nat_obj = request.param
    return nat_obj


# Generate cartesian product of np_nat_fixture:
np_nat_fixture2 = np_nat_fixture


# ----------------------------------------------------------------
# Classes
# ----------------------------------------------------------------
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,52 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture):
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

# TODO: distinguish between different date/time units
# for datetime64("NaT") and timedelta64("NaT"),
# i.e. np.datetime64("NaT") vs np.datetime64("NaT", "ns"), np.datetime64("NaT", "ms")
@pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning")
def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):
    """
    Look up a numpy NaT in a non-unique object-dtype Index.

    When both fixtures are matching NaTs, every (distinct-object) NaT in
    the index must be located. When they are of different kinds
    (datetime64 vs timedelta64), only the positions holding the target's
    kind may be returned. Nothing is ever reported as missing.
    """
    if is_matching_na(np_nat_fixture, np_nat_fixture2):
        # matching-but-not-identical nats: copy so they are different objects
        values = ["2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy()]
        expected_indexer = np.array([1, 2], dtype=np.intp)
    else:
        # dt64nat vs td64nat: the two kinds alternate in the index
        values = [
            "2021-10-02",
            np_nat_fixture,
            np_nat_fixture2,
            np_nat_fixture,
            np_nat_fixture2,
        ]
        expected_indexer = np.array([1, 3], dtype=np.intp)

    index = Index(np.array(values, dtype=object), dtype=object)
    # wrap the target in an object Index so it is not cast to DatetimeIndex
    target = Index([np_nat_fixture], dtype=object)
    indexer, missing = index.get_indexer_non_unique(target)

    tm.assert_numpy_array_equal(indexer, expected_indexer)
    tm.assert_numpy_array_equal(missing, np.array([], dtype=np.intp))


class TestSliceLocs:
@pytest.mark.parametrize(
Expand Down