
REF/TST: misplaced hashtable tests, rename HashTable.map to map_keys_to_values #46106


Merged: 2 commits, Feb 26, 2022
Changes from all commits
15 changes: 7 additions & 8 deletions pandas/_libs/hashtable.pyi
@@ -120,12 +120,6 @@ class HashTable:
     # TODO: `item` type is subclass-specific
     def get_item(self, item): ...  # TODO: return type?
     def set_item(self, item) -> None: ...
-    # FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
-    def map(
-        self,
-        keys: np.ndarray,  # np.ndarray[subclass-specific]
-        values: np.ndarray,  # const int64_t[:]
-    ) -> None: ...
     def map_locations(
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
@@ -177,11 +171,16 @@ class Float64HashTable(HashTable): ...
 class Float32HashTable(HashTable): ...

 class Int64HashTable(HashTable):
-    # Only Int64HashTable has get_labels_groupby
+    # Only Int64HashTable has get_labels_groupby, map_keys_to_values
     def get_labels_groupby(
         self,
-        values: np.ndarray,  # const int64_t[:]
+        values: npt.NDArray[np.int64],  # const int64_t[:]
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ...
+    def map_keys_to_values(
+        self,
+        keys: npt.NDArray[np.int64],
+        values: npt.NDArray[np.int64],  # const int64_t[:]
+    ) -> None: ...

 class Int32HashTable(HashTable): ...
 class Int16HashTable(HashTable): ...
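As a quick illustration of the renamed method (a minimal sketch, not part of the diff, assuming a pandas build that includes this change): map_keys_to_values now exists only on Int64HashTable and fills the table from parallel key/value arrays, with later duplicate keys overwriting earlier ones.

import numpy as np

import pandas._libs.hashtable as ht

table = ht.Int64HashTable()
keys = np.array([10, 20, 20], dtype=np.int64)
vals = np.array([0, 1, 2], dtype=np.int64)
table.map_keys_to_values(keys, vals)
assert table.get_item(10) == 0
assert table.get_item(20) == 2  # last write wins for duplicate keys
assert len(table) == 2  # the table holds distinct keys only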
12 changes: 11 additions & 1 deletion pandas/_libs/hashtable_class_helper.pxi.in
@@ -435,6 +435,7 @@ cdef class {{name}}HashTable(HashTable):
         }

     cpdef get_item(self, {{dtype}}_t val):
+        # Used in core.sorting, IndexEngine.get_loc
         cdef:
             khiter_t k
             {{c_type}} cval
@@ -446,6 +447,7 @@ cdef class {{name}}HashTable(HashTable):
             raise KeyError(val)

     cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
+        # Used in libjoin
         cdef:
             khiter_t k
             int ret = 0
@@ -457,8 +459,13 @@ cdef class {{name}}HashTable(HashTable):
         else:
             raise KeyError(key)

+    {{if dtype == "int64" }}
+    # We only use this for int64, can reduce build size and make .pyi
+    # more accurate by only implementing it for int64
     @cython.boundscheck(False)
-    def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None:
+    def map_keys_to_values(
+        self, const {{dtype}}_t[:] keys, const int64_t[:] values
+    ) -> None:
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -470,9 +477,11 @@ cdef class {{name}}HashTable(HashTable):
                 key = {{to_c_type}}(keys[i])
                 k = kh_put_{{dtype}}(self.table, key, &ret)
                 self.table.vals[k] = <Py_ssize_t>values[i]
+    {{endif}}

     @cython.boundscheck(False)
     def map_locations(self, const {{dtype}}_t[:] values) -> None:
+        # Used in libindex, safe_sort
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -488,6 +497,7 @@ cdef class {{name}}HashTable(HashTable):
     @cython.boundscheck(False)
     def lookup(self, const {{dtype}}_t[:] values) -> ndarray:
         # -> np.ndarray[np.intp]
+        # Used in safe_sort, IndexEngine.get_indexer
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
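To make the map_locations/lookup pairing annotated above concrete, here is a small sketch (again not part of the diff, assuming a built pandas): map_locations records each value's position in the input, and lookup returns those positions as intp, with -1 for values not present in the table.

import numpy as np

import pandas._libs.hashtable as ht

values = np.array([3.0, 1.5, 2.5], dtype=np.float64)
table = ht.Float64HashTable()
table.map_locations(values)  # value -> its index in `values`
locs = table.lookup(np.array([2.5, 99.0], dtype=np.float64))
assert locs.tolist() == [2, -1]  # 99.0 is absent, hence -1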
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
@@ -576,7 +576,7 @@ def get_flattened_list(
     arrays: DefaultDict[int, list[int]] = defaultdict(list)
     for labs, level in zip(labels, levels):
         table = hashtable.Int64HashTable(ngroups)
-        table.map(comp_ids, labs.astype(np.int64, copy=False))
+        table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
         for i in range(ngroups):
             arrays[i].append(level[table.get_item(i)])
     return [tuple(array) for array in arrays.values()]
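For intuition, a rough pure-Python sketch of what get_flattened_list computes, with hypothetical data and a plain dict standing in for the Int64HashTable: one tuple of per-level label values for each group id in comp_ids.

from collections import defaultdict

comp_ids = [0, 1, 1, 0]                      # group id of each row
labels = [[2, 3, 3, 2], [0, 1, 1, 0]]        # integer codes, one array per level
levels = [["w", "x", "y", "z"], ["a", "b"]]  # level values the codes point into
ngroups = 2

arrays = defaultdict(list)
for labs, level in zip(labels, levels):
    mapping = dict(zip(comp_ids, labs))  # plays the role of map_keys_to_values
    for i in range(ngroups):
        arrays[i].append(level[mapping[i]])

assert [tuple(a) for a in arrays.values()] == [("y", "a"), ("z", "b")]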
151 changes: 138 additions & 13 deletions pandas/tests/libs/test_hashtable.py
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+import struct
 import tracemalloc

 import numpy as np
@@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
         with pytest.raises(KeyError, match=str(index + 2)):
             table.get_item(index + 2)

-    def test_map(self, table_type, dtype, writable):
-        # PyObjectHashTable has no map-method
-        if table_type != ht.PyObjectHashTable:
+    def test_map_keys_to_values(self, table_type, dtype, writable):
+        # only Int64HashTable has this method
+        if table_type == ht.Int64HashTable:
             N = 77
             table = table_type()
             keys = np.arange(N).astype(dtype)
             vals = np.arange(N).astype(np.int64) + N
             keys.flags.writeable = writable
             vals.flags.writeable = writable
-            table.map(keys, vals)
+            table.map_keys_to_values(keys, vals)
             for i in range(N):
                 assert table.get_item(keys[i]) == i + N

@@ -180,6 +181,124 @@ def test_no_reallocation(self, table_type, dtype, N):
         assert n_buckets_start == clean_table.get_state()["n_buckets"]


+class TestHashTableUnsorted:
+    # TODO: moved from test_algos; may be redundancies with other tests
+    def test_string_hashtable_set_item_signature(self):
+        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
+        tbl = ht.StringHashTable()
+
+        tbl.set_item("key", 1)
+        assert tbl.get_item("key") == 1
+
+        with pytest.raises(TypeError, match="'key' has incorrect type"):
+            # key arg typed as string, not object
+            tbl.set_item(4, 6)
+        with pytest.raises(TypeError, match="'val' has incorrect type"):
+            tbl.get_item(4)
+
+    def test_lookup_nan(self, writable):
+        # GH#21688 ensure we can deal with readonly memory views
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        xs.setflags(write=writable)
+        m = ht.Float64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    def test_add_signed_zeros(self):
+        # GH#21866 inconsistent hash-function for float64
+        # default hash-function would lead to different hash-buckets
+        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
+        # but this would mean 16GB
+        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
+        m = ht.Float64HashTable(N)
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_different_nans(self):
+        # GH#21866 inconsistent hash-function for float64
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
+    def test_lookup_overflow(self, writable):
+        xs = np.array([1, 2, 2**63], dtype=np.uint64)
+        # GH 21688 ensure we can deal with readonly memory views
+        xs.setflags(write=writable)
+        m = ht.UInt64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
+    @pytest.mark.parametrize(
+        "htable, uniques, dtype, safely_resizes",
+        [
+            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
+            (ht.StringHashTable, ht.ObjectVector, "object", True),
+            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
+            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
+            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
+            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
+        ],
+    )
+    def test_vector_resize(
+        self, writable, htable, uniques, dtype, safely_resizes, nvals
+    ):
+        # Test for memory errors after internal vector
+        # reallocations (GH 7157)
+        vals = np.array(np.random.randn(1000), dtype=dtype)
+
+        # GH 21688 ensures we can deal with read-only memory views
+        vals.setflags(write=writable)
+
+        # initialise instances; cannot initialise in parametrization,
+        # as otherwise external views would be held on the array (which is
+        # one of the things this test is checking)
+        htable = htable()
+        uniques = uniques()
+
+        # get_labels may append to uniques
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        # to_array() sets an external_view_exists flag on uniques.
+        tmp = uniques.to_array()
+        oldshape = tmp.shape
+
+        # subsequent get_labels() calls can no longer append to it
+        # (except for StringHashTables + ObjectVector)
+        if safely_resizes:
+            htable.get_labels(vals, uniques, 0, -1)
+        else:
+            with pytest.raises(ValueError, match="external reference.*"):
+                htable.get_labels(vals, uniques, 0, -1)
+
+        uniques.to_array()  # should not raise here
+        assert tmp.shape == oldshape
+
+    @pytest.mark.parametrize(
+        "hashtable",
+        [
+            ht.PyObjectHashTable,
+            ht.StringHashTable,
+            ht.Float64HashTable,
+            ht.Int64HashTable,
+            ht.Int32HashTable,
+            ht.UInt64HashTable,
+        ],
+    )
+    def test_hashtable_large_sizehint(self, hashtable):
+        # GH#22729 smoketest for not raising when passing a large size_hint
+        size_hint = np.iinfo(np.uint32).max + 1
+        hashtable(size_hint=size_hint)
+
+
 class TestPyObjectHashTableWithNans:
     def test_nan_float(self):
         nan1 = float("nan")
@@ -322,15 +441,6 @@ def test_get_set_contains_len(self, table_type, dtype):
         assert index in table
         assert table.get_item(index) == 41

-    def test_map(self, table_type, dtype):
-        N = 332
-        table = table_type()
-        keys = np.full(N, np.nan, dtype=dtype)
-        vals = (np.arange(N) + N).astype(np.int64)
-        table.map(keys, vals)
-        assert len(table) == 1
-        assert table.get_item(np.nan) == 2 * N - 1
-
     def test_map_locations(self, table_type, dtype):
         N = 10
         table = table_type()
@@ -468,6 +578,21 @@ def test_unique_label_indices_intp(writable):
     tm.assert_numpy_array_equal(result, expected)


+def test_unique_label_indices():
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
+
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [
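As a footnote to test_vector_resize above, a minimal sketch of the external-view protection it exercises (assuming the same internals): once to_array() hands out a view of the vector's buffer, a non-resizable vector raises rather than reallocating underneath that view.

import numpy as np

import pandas._libs.hashtable as ht

vals = np.arange(1000, dtype=np.int64)
table = ht.Int64HashTable()
uniques = ht.Int64Vector()
table.get_labels(vals[:10], uniques, 0, -1)  # appends 10 uniques
view = uniques.to_array()                    # marks an external view
try:
    table.get_labels(vals, uniques, 0, -1)   # would have to resize uniques
except ValueError:
    pass  # "external reference but Vector.resize() needed"
assert view.shape == (10,)                   # the earlier view stays intact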
6 changes: 6 additions & 0 deletions pandas/tests/series/methods/test_unique.py
@@ -8,6 +8,12 @@


 class TestUnique:
+    def test_unique_uint64(self):
+        ser = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+        res = ser.unique()
+        exp = np.array([1, 2, 2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(res, exp)
+
     def test_unique_data_ownership(self):
         # it works! GH#1807
         Series(Series(["a", "c", "b"]).unique()).sort_values()