From 7cd9cffbba385b6c0dd2acb7da4a3b03591fad6d Mon Sep 17 00:00:00 2001 From: avm19 <52547519avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 05:42:41 +0000 Subject: [PATCH 1/9] Modify an existing test to cover the issue with na_pos > 128. --- pandas/tests/libs/test_hashtable.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 50b561aefcf49..6a95cfc7355d8 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -149,18 +149,19 @@ def test_map_locations(self, table_type, dtype, writable): def test_map_locations_mask(self, table_type, dtype, writable): if table_type == ht.PyObjectHashTable: pytest.skip("Mask not supported for object") - N = 3 + N = 129 # must be > 128 to test GH#58924 table = table_type(uses_mask=True) keys = (np.arange(N) + N).astype(dtype) keys.flags.writeable = writable - table.map_locations(keys, np.array([False, False, True])) + mask = np.concatenate([np.repeat(False, N - 1), [True]], axis=0) + table.map_locations(keys, mask) for i in range(N - 1): assert table.get_item(keys[i]) == i with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))): table.get_item(keys[N - 1]) - assert table.get_na() == 2 + assert table.get_na() == N - 1 def test_lookup(self, table_type, dtype, writable): N = 3 From 5ba77579543bd85386da0a1b3ec0768b9a1aa8d2 Mon Sep 17 00:00:00 2001 From: avm19 <52547519avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 14:51:33 +0000 Subject: [PATCH 2/9] Change na_position type from int8_t and int64_t consistently to Py_ssize_t. --- pandas/_libs/hashtable.pxd | 24 +++++++++++----------- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index a5a3edad63403..0480ee54ffb4e 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -41,7 +41,7 @@ cdef class HashTable: cdef class UInt64HashTable(HashTable): cdef kh_uint64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint64_t val) @@ -51,7 +51,7 @@ cdef class UInt64HashTable(HashTable): cdef class Int64HashTable(HashTable): cdef kh_int64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int64_t val) @@ -61,7 +61,7 @@ cdef class Int64HashTable(HashTable): cdef class UInt32HashTable(HashTable): cdef kh_uint32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint32_t val) @@ -71,7 +71,7 @@ cdef class UInt32HashTable(HashTable): cdef class Int32HashTable(HashTable): cdef kh_int32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int32_t val) @@ -81,7 +81,7 @@ cdef class Int32HashTable(HashTable): cdef class UInt16HashTable(HashTable): cdef kh_uint16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint16_t val) @@ -91,7 +91,7 @@ cdef class UInt16HashTable(HashTable): cdef class Int16HashTable(HashTable): cdef kh_int16_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int16_t val) @@ -101,7 +101,7 @@ cdef class Int16HashTable(HashTable): cdef class UInt8HashTable(HashTable): cdef kh_uint8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, uint8_t val) @@ -111,7 +111,7 @@ cdef class UInt8HashTable(HashTable): cdef class Int8HashTable(HashTable): cdef kh_int8_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, int8_t val) @@ -121,7 +121,7 @@ cdef class Int8HashTable(HashTable): cdef class Float64HashTable(HashTable): cdef kh_float64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float64_t val) @@ -131,7 +131,7 @@ cdef class Float64HashTable(HashTable): cdef class Float32HashTable(HashTable): cdef kh_float32_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, float32_t val) @@ -141,7 +141,7 @@ cdef class Float32HashTable(HashTable): cdef class Complex64HashTable(HashTable): cdef kh_complex64_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex64_t val) @@ -151,7 +151,7 @@ cdef class Complex64HashTable(HashTable): cdef class Complex128HashTable(HashTable): cdef kh_complex128_t *table - cdef int64_t na_position + cdef Py_ssize_t na_position cdef bint uses_mask cpdef get_item(self, complex128_t val) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 210df09f07db6..eae393f33bfd3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -535,7 +535,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{c_type}} val khiter_t k - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover @@ -567,7 +567,7 @@ cdef class {{name}}HashTable(HashTable): Int64Vector self_locs = Int64Vector() Int64VectorData *l Int64VectorData *sl - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position l = &locs.data sl = &self_locs.data @@ -609,7 +609,7 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val khiter_t k intp_t[::1] locs = np.empty(n, dtype=np.intp) - int8_t na_position = self.na_position + Py_ssize_t na_position = self.na_position if self.uses_mask and mask is None: raise NotImplementedError # pragma: no cover From 8edbe338deaa8773344b95445dc56ee44e2b4454 Mon Sep 17 00:00:00 2001 From: avm19 <52547519avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 17:32:20 +0000 Subject: [PATCH 3/9] Add What's New entry. --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 09134763977c3..da57726bae841 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -133,7 +133,7 @@ Interval Indexing ^^^^^^^^ - Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) -- +- Bug in :class:`MaskedIndexEngine` affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) Missing ^^^^^^^ From e844fe35e5f51983844b8e20315f5cc595f0212b Mon Sep 17 00:00:00 2001 From: avm19 <52547519avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 17:50:08 +0000 Subject: [PATCH 4/9] Sort whatsnew entries alphabetically --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index da57726bae841..999f7a1a96317 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -132,8 +132,8 @@ Interval Indexing ^^^^^^^^ -- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Bug in :class:`MaskedIndexEngine` affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) Missing ^^^^^^^ From 53a5829b44855c89bdc04d5b194c9b9b8196bbd3 Mon Sep 17 00:00:00 2001 From: avm19 <52547519+avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 20:29:26 +0000 Subject: [PATCH 5/9] Improve the whatsnew entry. --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 999f7a1a96317..4a16f7c78e51a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -132,7 +132,7 @@ Interval Indexing ^^^^^^^^ -- Bug in :class:`MaskedIndexEngine` affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) +- Bug in ``HashTable`` classes affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) Missing From d92cf2ca446c7c3c0a8e7c3ddde051c1c22c62e3 Mon Sep 17 00:00:00 2001 From: avm19 <52547519+avm19@users.noreply.github.com> Date: Wed, 5 Mar 2025 21:57:51 +0000 Subject: [PATCH 6/9] Move whatsnew entry from v2.3.0.rst to v3.0.0.rst. --- doc/source/whatsnew/v2.3.0.rst | 1 - doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 4a16f7c78e51a..d2f5d3a58c6e5 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -132,7 +132,6 @@ Interval Indexing ^^^^^^^^ -- Bug in ``HashTable`` classes affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) Missing diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0df5a70d87655..7127633563d4e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -697,6 +697,7 @@ Indexing - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) +- Bug in ``HashTable`` classes affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) Missing From 56f7e54eb3d543db8fce014bc2a135316ff8f6b3 Mon Sep 17 00:00:00 2001 From: avm19 <52547519+avm19@users.noreply.github.com> Date: Thu, 6 Mar 2025 16:27:33 -0500 Subject: [PATCH 7/9] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7127633563d4e..bcfdb44e4b169 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -697,7 +697,7 @@ Indexing - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) -- Bug in ``HashTable`` classes affecting :meth:`Index.get_indexer` and downstream methods when ``NaN`` is located at or after position 128 (:issue:`58924`) +- Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) Missing From 61525c2d78bc24142cdeb38e11b4dad22015507d Mon Sep 17 00:00:00 2001 From: avm19 <52547519+avm19@users.noreply.github.com> Date: Thu, 6 Mar 2025 21:35:14 +0000 Subject: [PATCH 8/9] Undo remove '-'. --- doc/source/whatsnew/v2.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d2f5d3a58c6e5..09134763977c3 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -133,6 +133,7 @@ Interval Indexing ^^^^^^^^ - Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) +- Missing ^^^^^^^ From 67ced03e51dbc2960ce3202ec7141257abc7a838 Mon Sep 17 00:00:00 2001 From: avm19 <52547519+avm19@users.noreply.github.com> Date: Thu, 6 Mar 2025 21:45:26 +0000 Subject: [PATCH 9/9] Sort whatsnew entries alphabetically. --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bcfdb44e4b169..fb7321139fc1a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -696,8 +696,8 @@ Indexing - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`) -- Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`) +- Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`) Missing