pandas-dev · jbrockmendel · Jan 24, 2023 · Jan 24, 2023 · Jan 24, 2023 · Jan 24, 2023
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
@@ -7,7 +7,10 @@
 typedef npy_complex64 khcomplex64_t;
 typedef npy_complex128 khcomplex128_t;
 
+// get pandas_datetime_to_datetimestruct
+#include <../../tslibs/src/datetime/np_datetime.h>
 
+#include "datetime.h"
 
 // khash should report usage to tracemalloc
 #if PY_VERSION_HEX >= 0x03060000
@@ -330,6 +333,129 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
 }
 
 
+// TODO: if nanos is most common/important, might be most performant to
+// make that canonical and others cast to that?
+/*
+ * Hash a datetime64 (value, unit) pair so that values denoting the same
+ * instant at different resolutions produce the same hash.
+ *
+ * np_datetime64_object_hash falls back to this when the value cannot be
+ * converted to a standard-library datetime.  The value is repeatedly
+ * re-expressed in the next-coarser unit for as long as that conversion is
+ * exact; the hash is the integer value at the coarsest unit reached.
+ *
+ * value: integer count of `unit` steps.
+ * unit:  resolution of `value`; NPY_FR_GENERIC (and any unit not listed
+ *        below) is treated as nanoseconds.
+ * dts:   broken-down fields of the original value.  Only read in the
+ *        NPY_FR_W branch, where a plain divisibility test is not enough to
+ *        decide whether the week starts exactly on a month boundary.
+ *
+ * Returns the integer value at the coarsest exactly-representable unit.
+ */
+Py_hash_t PANDAS_INLINE hash_datetime_value_and_reso(npy_datetime value, NPY_DATETIMEUNIT unit, npy_datetimestruct* dts) {
+    // If we cannot cast to pydatetime, then the question is if there are
+    // other-resolution datetime64 objects that we might be equal to whose
+    // hashes we need to match. We let year-reso objects return value, and make
+    // higher-resolution cases responsible for checking if they match.
+    switch (unit) {
+        case NPY_FR_Y:
+            return value;
+
+        case NPY_FR_M:
+            if ((value % 12) == 0) {
+                return hash_datetime_value_and_reso(value / 12, NPY_FR_Y, dts);
+            }
+            return value;
+
+        case NPY_FR_W:
+            if (dts->day == 1) {
+                // Month-resolution values count months since 1970-01 starting
+                // at 0, while dts->month is 1-based, hence the "- 1".
+                value = (dts->year - 1970) * 12 + (dts->month - 1);
+                return hash_datetime_value_and_reso(value, NPY_FR_M, dts);
+            }
+            return value;
+
+        case NPY_FR_D:
+            if ((value % 7) == 0) {
+                return hash_datetime_value_and_reso(value / 7, NPY_FR_W, dts);
+            }
+            return value;
+
+        case NPY_FR_h:
+            if ((value % 24) == 0) {
+                return hash_datetime_value_and_reso(value / 24, NPY_FR_D, dts);
+            }
+            return value;
+
+        case NPY_FR_m:
+            if ((value % 60) == 0) {
+                return hash_datetime_value_and_reso(value / 60, NPY_FR_h, dts);
+            }
+            return value;
+
+        case NPY_FR_s:
+            if ((value % 60) == 0) {
+                return hash_datetime_value_and_reso(value / 60, NPY_FR_m, dts);
+            }
+            return value;
+
+        case NPY_FR_ms:
+            if ((value % 1000) == 0) {
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_s, dts);
+            }
+            return value;
+
+        case NPY_FR_us:
+            if ((value % 1000) == 0) {
+                // The next-coarser unit is milliseconds.  Recursing into
+                // NPY_FR_ns here would both mis-scale the value and bounce
+                // between us and ns forever when value == 0.
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_ms, dts);
+            }
+            return value;
+
+        case NPY_FR_ns:
+            if ((value % 1000) == 0) {
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_us, dts);
+            }
+            return value;
+
+        case NPY_FR_ps:
+            if ((value % 1000) == 0) {
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_ns, dts);
+            }
+            return value;
+
+        case NPY_FR_fs:
+            if ((value % 1000) == 0) {
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_ps, dts);
+            }
+            return value;
+
+        case NPY_FR_as:
+            if ((value % 1000) == 0) {
+                return hash_datetime_value_and_reso(value / 1000, NPY_FR_fs, dts);
+            }
+            return value;
+
+        default:
+            // i.e. NPY_FR_GENERIC
+            // we default to treating these like nanos
+            return hash_datetime_value_and_reso(value, NPY_FR_ns, dts);
+    }
+}
+
+
+// TODO: same thing for timedelta64 objects
+/*
+ * Hash a numpy datetime64 scalar.
+ *
+ * key: the datetime64 scalar; it is accessed as a PyDatetimeScalarObject.
+ *
+ * Returns:
+ *   - NPY_DATETIME_NAT for NaT, regardless of resolution;
+ *   - the hash of the equivalent standard-library datetime when the value
+ *     has a year in 1..9999 and no sub-microsecond component;
+ *   - otherwise the result of hash_datetime_value_and_reso.
+ *   -1 is returned if the datetime C-API cannot be imported, if the
+ *   datetime object cannot be created, or if hashing it fails.
+ */
+Py_hash_t np_datetime64_object_hash(PyObject* key) {
+    // GH#50690 numpy's hash implementation does not preserve comparability
+    // either across resolutions or with standard library objects.
+    // See also Timestamp.__hash__
+
+    NPY_DATETIMEUNIT unit = (NPY_DATETIMEUNIT)((PyDatetimeScalarObject*)key)->obmeta.base;
+    npy_datetime value = ((PyDatetimeScalarObject*)key)->obval;
+    npy_datetimestruct dts;
+    PyObject* dt;
+    Py_hash_t hash;
+
+    if (value == NPY_DATETIME_NAT) {
+        // np.datetime64("NaT") in any reso
+        return NPY_DATETIME_NAT;
+    }
+
+    pandas_datetime_to_datetimestruct(value, unit, &dts);
+
+    if ((dts.year > 0) && (dts.year <= 9999) && (dts.ps == 0) && (dts.as == 0)) {
+        // we CAN cast to pydatetime, so use that hash to ensure we compare
+        // as matching standard library datetimes (and pd.Timestamps)
+        if (PyDateTimeAPI == NULL) {
+            /* delayed import, may be nice to move to import time */
+            PyDateTime_IMPORT;
+            if (PyDateTimeAPI == NULL) {
+                return -1;
+            }
+        }
+        dt = PyDateTime_FromDateAndTime(
+            dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us
+        );
+        if (dt == NULL) {
+            return -1;
+        }
+        hash = PyObject_Hash(dt);
+        // dt is a new reference owned by this function; release it so that
+        // hashing does not leak one datetime object per call.
+        Py_DECREF(dt);
+        return hash;
+    }
+
+    return hash_datetime_value_and_reso(value, unit, &dts);
+}
+
+
 khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
     Py_hash_t hash;
     // For PyObject_Hash holds:
@@ -351,6 +477,10 @@ khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
     else if (PyTuple_CheckExact(key)) {
         hash = tupleobject_hash((PyTupleObject*)key);
     }
+    else if (PyObject_TypeCheck(key, &PyDatetimeArrType_Type)) {
+        // GH#50690
+        hash = np_datetime64_object_hash(key);
+    }
     else {
         hash = PyObject_Hash(key);
     }

diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
@@ -187,3 +187,29 @@ def test_slice_locs_dup(self):
         assert index2.slice_locs(end="a") == (0, 6)
         assert index2.slice_locs("d", "b") == (0, 4)
         assert index2.slice_locs("c", "a") == (2, 6)
+
+
+def test_np_datetime64_objects():
+    # GH#50690
+    ms = np.datetime64(1, "ms")
+    us = np.datetime64(1000, "us")
+
+    left = Index([ms], dtype=object)
+    right = Index([us], dtype=object)
+
+    assert left[0] in right
+    assert right[0] in left
+
+    assert left.get_loc(right[0]) == 0
+    assert right.get_loc(left[0]) == 0
+
+    # non-monotonic cases go through different paths in cython code
+    sec = np.datetime64("9999-01-01", "s")
+    day = np.datetime64("2016-01-01", "D")
+    left2 = Index([ms, sec, day], dtype=object)
+
+    expected = np.array([0], dtype=np.intp)
+    res = left2[:1].get_indexer(right)
+    tm.assert_numpy_array_equal(res, expected)
+    res = left2.get_indexer(right)
+    tm.assert_numpy_array_equal(res, expected)
diff --git a/setup.py b/setup.py
@@ -453,7 +453,9 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
         "depends": (
             ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"]
             + _pxi_dep["hashtable"]
+            + tseries_depends
         ),
+        "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
     },
     "_libs.index": {
         "pyxfile": "_libs/index",
@@ -465,7 +467,8 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
     "_libs.interval": {
         "pyxfile": "_libs/interval",
         "include": klib_include,
-        "depends": _pxi_dep["interval"],
+        "depends": _pxi_dep["interval"] + tseries_depends,
+        "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
     },
     "_libs.join": {"pyxfile": "_libs/join", "include": klib_include},
     "_libs.lib": {