diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index 56afea049c1ec..d70005be918ee 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -7,7 +7,10 @@ typedef npy_complex64 khcomplex64_t;
 typedef npy_complex128 khcomplex128_t;
 
+// get pandas_datetime_to_datetimestruct
+#include <../../tslibs/src/datetime/np_datetime.h>
+#include "datetime.h"
 
 // khash should report usage to tracemalloc
 #if PY_VERSION_HEX >= 0x03060000
@@ -305,6 +308,7 @@ khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
 #define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
 #endif
 
+
 Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
     Py_ssize_t i, len = Py_SIZE(key);
     PyObject **item = key->ob_item;
@@ -315,9 +319,7 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
         if (lane == (Py_uhash_t)-1) {
             return -1;
         }
-        acc += lane * _PandasHASH_XXPRIME_2;
-        acc = _PandasHASH_XXROTATE(acc);
-        acc *= _PandasHASH_XXPRIME_1;
+        acc = tuple_update_uhash(acc, lane);
     }
 
     /* Add input length, mangled to keep the historical value of hash(()). */
@@ -351,6 +353,10 @@ khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
     else if (PyTuple_CheckExact(key)) {
         hash = tupleobject_hash((PyTupleObject*)key);
     }
+    else if (PyObject_TypeCheck(key, &PyDatetimeArrType_Type)) {
+        // GH#50690
+        hash = np_datetime64_object_hash((PyDatetimeScalarObject *)key);
+    }
     else {
         hash = PyObject_Hash(key);
     }
diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd
index 7e19eb084b59e..fbd2d09417472 100644
--- a/pandas/_libs/tslibs/np_datetime.pxd
+++ b/pandas/_libs/tslibs/np_datetime.pxd
@@ -72,6 +72,7 @@ cdef extern from "src/datetime/pd_datetime.h":
         pandas_timedeltastruct *result
     ) nogil
 
+    Py_hash_t hash_datetime_from_struct(npy_datetimestruct* dts) except? -1
     void PandasDateTime_IMPORT()
 
     ctypedef enum FormatRequirement:
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c
index 0b3a973cc9b6c..14fbfccd203fd 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime.c
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c
@@ -25,8 +25,9 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #include <numpy/arrayobject.h>
 #include <numpy/arrayscalars.h>
 #include <numpy/ndarraytypes.h>
-#include "np_datetime.h"
+#include "np_datetime.h"
+#include "datetime.h"
 
 const int days_per_month_table[2][12] = {
     {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
@@ -1033,3 +1034,101 @@ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(
     PyArray_Descr *dtype) {
     return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta);
 }
+
+
+// we could use any hashing algorithm; this is the original CPython's for tuples
+
+#if SIZEOF_PY_UHASH_T > 4
+#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
+#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
+#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
+#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
+#else
+#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
+#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
+#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
+#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
+#endif
+
+
+Py_uhash_t tuple_update_uhash(Py_uhash_t acc, Py_uhash_t lane) {
+    acc += lane * _PandasHASH_XXPRIME_2;
+    acc = _PandasHASH_XXROTATE(acc);
+    acc *= _PandasHASH_XXPRIME_1;
+    return acc;
+}
+
+// https://github.com/pandas-dev/pandas/pull/50960
+Py_hash_t
+hash_datetime_from_struct(npy_datetimestruct* dts) {
+    /*
+     * If we cannot cast to datetime, use the datetime struct values directly
+     * and mix them similar to a tuple.
+     */
+
+    Py_uhash_t acc = _PandasHASH_XXPRIME_5;
+#if 64 <= SIZEOF_PY_UHASH_T
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->year);
+#else
+    /* Mix lower and upper bits of the year if int64 is larger */
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->year);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)(dts->year >> SIZEOF_PY_UHASH_T));
+#endif
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->month);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->day);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->hour);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->min);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->sec);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->us);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->ps);
+    acc = tuple_update_uhash(acc, (Py_uhash_t)dts->as);
+    /* there should be no need to mix in the length, as it is fixed anyway */
+    if (acc == (Py_uhash_t)-1) {
+        acc = (Py_uhash_t)-2;
+    }
+    return acc;
+}
+
+
+// TODO(jbrockmendel): same thing for timedelta64 objects
+Py_hash_t np_datetime64_object_hash(PyDatetimeScalarObject* key) {
+    // GH#50690 numpy's hash implementation does not preserve comparability,
+    //  either across resolutions or with standard library objects.
+    //  See also Timestamp.__hash__
+
+    NPY_DATETIMEUNIT unit = (NPY_DATETIMEUNIT)key->obmeta.base;
+    npy_datetime value = key->obval;
+    npy_datetimestruct dts;
+
+    if (value == NPY_DATETIME_NAT) {
+        // np.datetime64("NaT") in any reso
+        return NPY_DATETIME_NAT;
+    }
+
+    pandas_datetime_to_datetimestruct(value, unit, &dts);
+
+    if ((dts.year > 0) && (dts.year <= 9999) && (dts.ps == 0) && (dts.as == 0)) {
+        // we CAN cast to pydatetime, so use that hash to ensure we compare
+        //  as matching standard library datetimes (and pd.Timestamps)
+        if (PyDateTimeAPI == NULL) {
+            /* delayed import, may be nice to move to import time */
+            PyDateTime_IMPORT;
+            if (PyDateTimeAPI == NULL) {
+                return -1;
+            }
+        }
+
+        PyObject* dt;
+        Py_hash_t hash;
+
+        dt = PyDateTime_FromDateAndTime(
+            dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us);
+        if (dt == NULL) {
+            return -1;
+        }
+        hash = PyObject_Hash(dt);
+        Py_DECREF(dt);
+        return hash;
+    }
+
+    return hash_datetime_from_struct(&dts);
+}
diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h
index 68f72683ab2e4..0b11c813a3a71 100644
--- a/pandas/_libs/tslibs/src/datetime/np_datetime.h
+++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h
@@ -22,6 +22,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #endif  // NPY_NO_DEPRECATED_API
 
 #include <numpy/ndarraytypes.h>
+#include <numpy/arrayscalars.h>
 
 typedef struct {
     npy_int64 days;
@@ -116,4 +117,8 @@ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(
     PyArray_Descr *dtype);
 
+Py_hash_t np_datetime64_object_hash(PyDatetimeScalarObject* key);
+Py_hash_t hash_datetime_from_struct(npy_datetimestruct* dts);
+Py_uhash_t tuple_update_uhash(Py_uhash_t acc, Py_uhash_t lane);
+
 #endif  // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 10a331f302cc4..20b453b8f6dd3 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -88,6 +88,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     get_datetime64_unit,
     get_datetime64_value,
     get_unit_from_dtype,
+    hash_datetime_from_struct,
     import_pandas_datetime,
     npy_datetimestruct,
     npy_datetimestruct_to_datetime,
@@ -311,11 +312,12 @@ cdef class _Timestamp(ABCTimestamp):
     # -----------------------------------------------------------------
 
     def __hash__(_Timestamp self):
-        if self.nanosecond:
-            return hash(self._value)
-        if not (1 <= self.year <= 9999):
+        cdef:
+            npy_datetimestruct dts
+        if not (1 <= self.year <= 9999) or self.nanosecond:
             # out of bounds for pydatetime
-            return hash(self._value)
+            pydatetime_to_dtstruct(self, &dts)
+            return hash_datetime_from_struct(&dts)
         if self.fold:
             return datetime.__hash__(self.replace(fold=0))
         return datetime.__hash__(self)
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index 87d3afc77d556..d48a59d418c30 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -187,3 +187,29 @@ def test_slice_locs_dup(self):
         assert index2.slice_locs(end="a") == (0, 6)
         assert index2.slice_locs("d", "b") == (0, 4)
         assert index2.slice_locs("c", "a") == (2, 6)
+
+
+def test_np_datetime64_objects():
+    # GH#50690
+    ms = np.datetime64(1, "ms")
+    us = np.datetime64(1000, "us")
+
+    left = Index([ms], dtype=object)
+    right = Index([us], dtype=object)
+
+    assert left[0] in right
+    assert right[0] in left
+
+    assert left.get_loc(right[0]) == 0
+    assert right.get_loc(left[0]) == 0
+
+    # non-monotonic cases go through different paths in cython code
+    sec = np.datetime64("9999-01-01", "s")
+    day = np.datetime64("2016-01-01", "D")
+    left2 = Index([ms, sec, day], dtype=object)
+
+    expected = np.array([0], dtype=np.intp)
+    res = left2[:1].get_indexer(right)
+    tm.assert_numpy_array_equal(res, expected)
+    res = left2.get_indexer(right)
+    tm.assert_numpy_array_equal(res, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 07529fcbb49b7..d1991bb0a2349 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1151,6 +1151,15 @@ def test_isin_unsigned_dtype(self):
 
 
 class TestValueCounts:
+    def test_value_counts_datetime64_mismatched_units(self):
+        # GH#50960 np.datetime64 objects with different units that are still equal
+        arr = np.array(
+            [np.datetime64(1, "ms"), np.datetime64(1000, "us")], dtype=object
+        )
+        res = algos.value_counts(arr)
+        expected = Series([2], index=arr[:1], name="count")
+        tm.assert_series_equal(res, expected)
+
     def test_value_counts(self):
         np.random.seed(1234)
         from pandas.core.reshape.tile import cut
@@ -1607,6 +1616,14 @@ def test_unique_complex_numbers(self, array, expected):
         result = pd.unique(array)
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_unique_datetime64_mismatched_units(self):
+        # GH#50960 np.datetime64 objects with different units that are still equal
+        arr = np.array(
+            [np.datetime64(1, "ms"), np.datetime64(1000, "us")], dtype=object
+        )
+        res = pd.unique(arr)
+        tm.assert_numpy_array_equal(res, arr[:1])
+
 
 class TestHashTable:
     @pytest.mark.parametrize(
diff --git a/setup.py b/setup.py
index 857cc4c71b70c..04194336a77c2 100755
--- a/setup.py
+++ b/setup.py
@@ -445,7 +445,8 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
     "_libs.algos": {
         "pyxfile": "_libs/algos",
         "include": klib_include,
-        "depends": _pxi_dep["algos"],
+        "depends": _pxi_dep["algos"] + tseries_depends,
+        "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
     },
     "_libs.arrays": {"pyxfile": "_libs/arrays"},
     "_libs.groupby": {"pyxfile": "_libs/groupby"},
@@ -456,21 +457,27 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
         "depends": (
             ["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"]
             + _pxi_dep["hashtable"]
+            + tseries_depends
         ),
+        "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
     },
     "_libs.index": {
         "pyxfile": "_libs/index",
         "include": klib_include,
-        "depends": _pxi_dep["index"],
+        "depends": _pxi_dep["index"] + tseries_depends,
     },
     "_libs.indexing": {"pyxfile": "_libs/indexing"},
     "_libs.internals": {"pyxfile": "_libs/internals"},
     "_libs.interval": {
         "pyxfile": "_libs/interval",
         "include": klib_include,
-        "depends": _pxi_dep["interval"],
+        "depends": _pxi_dep["interval"] + tseries_depends,
+    },
+    "_libs.join": {
+        "pyxfile": "_libs/join",
+        "include": klib_include,
+        "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
     },
-    "_libs.join": {"pyxfile": "_libs/join", "include": klib_include},
     "_libs.lib": {
         "pyxfile": "_libs/lib",
        "depends": lib_depends + tseries_depends,
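
Not part of the patch: a minimal usage sketch of the invariant the new tests exercise, assuming pandas is built with this change applied. Equal np.datetime64 values with different units now hash consistently inside pandas' object-dtype hashtables, so membership checks, unique, and value_counts treat them as the same key; plain Python hash() of np.datetime64 is unchanged, only pandas' internal khash hashing and Timestamp.__hash__ are touched.

    import numpy as np
    import pandas as pd

    ms = np.datetime64(1, "ms")
    us = np.datetime64(1000, "us")
    assert ms == us                      # equal values, different resolutions

    idx = pd.Index([ms], dtype=object)
    assert us in idx                     # membership now matches across units
    assert idx.get_loc(us) == 0

    arr = np.array([ms, us], dtype=object)
    assert len(pd.unique(arr)) == 1      # deduplicated to a single key
    assert pd.Series(arr).value_counts().iloc[0] == 2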