Skip to content

Commit 1ada3b7

Browse files
jbrockmendel authored and yehoshuadimarsky committed
REF/TST: misplaced hashtable tests, rename HashTable.map -> map_keys_to_values (pandas-dev#46106)
1 parent 7fbd963 commit 1ada3b7

File tree

6 files changed

+163
-158
lines changed

6 files changed

+163
-158
lines changed

pandas/_libs/hashtable.pyi

+7 -8
Original file line number | Diff line number | Diff line change
@@ -120,12 +120,6 @@ class HashTable:
120120
# TODO: `item` type is subclass-specific
121121
def get_item(self, item): ... # TODO: return type?
122122
def set_item(self, item) -> None: ...
123-
# FIXME: we don't actually have this for StringHashTable or ObjectHashTable?
124-
def map(
125-
self,
126-
keys: np.ndarray, # np.ndarray[subclass-specific]
127-
values: np.ndarray, # const int64_t[:]
128-
) -> None: ...
129123
def map_locations(
130124
self,
131125
values: np.ndarray, # np.ndarray[subclass-specific]
@@ -177,11 +171,16 @@ class Float64HashTable(HashTable): ...
177171
class Float32HashTable(HashTable): ...
178172

179173
class Int64HashTable(HashTable):
180-
# Only Int64HashTable has get_labels_groupby
174+
# Only Int64HashTable has get_labels_groupby, map_keys_to_values
181175
def get_labels_groupby(
182176
self,
183-
values: np.ndarray, # const int64_t[:]
177+
values: npt.NDArray[np.int64], # const int64_t[:]
184178
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64],]: ...
179+
def map_keys_to_values(
180+
self,
181+
keys: npt.NDArray[np.int64],
182+
values: npt.NDArray[np.int64], # const int64_t[:]
183+
) -> None: ...
185184

186185
class Int32HashTable(HashTable): ...
187186
class Int16HashTable(HashTable): ...

pandas/_libs/hashtable_class_helper.pxi.in

+11 -1
Original file line number | Diff line number | Diff line change
@@ -435,6 +435,7 @@ cdef class {{name}}HashTable(HashTable):
435435
}
436436

437437
cpdef get_item(self, {{dtype}}_t val):
438+
# Used in core.sorting, IndexEngine.get_loc
438439
cdef:
439440
khiter_t k
440441
{{c_type}} cval
@@ -446,6 +447,7 @@ cdef class {{name}}HashTable(HashTable):
446447
raise KeyError(val)
447448

448449
cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
450+
# Used in libjoin
449451
cdef:
450452
khiter_t k
451453
int ret = 0
@@ -457,8 +459,13 @@ cdef class {{name}}HashTable(HashTable):
457459
else:
458460
raise KeyError(key)
459461

462+
{{if dtype == "int64" }}
463+
# We only use this for int64, can reduce build size and make .pyi
464+
# more accurate by only implementing it for int64
460465
@cython.boundscheck(False)
461-
def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values) -> None:
466+
def map_keys_to_values(
467+
self, const {{dtype}}_t[:] keys, const int64_t[:] values
468+
) -> None:
462469
cdef:
463470
Py_ssize_t i, n = len(values)
464471
int ret = 0
@@ -470,9 +477,11 @@ cdef class {{name}}HashTable(HashTable):
470477
key = {{to_c_type}}(keys[i])
471478
k = kh_put_{{dtype}}(self.table, key, &ret)
472479
self.table.vals[k] = <Py_ssize_t>values[i]
480+
{{endif}}
473481

474482
@cython.boundscheck(False)
475483
def map_locations(self, const {{dtype}}_t[:] values) -> None:
484+
# Used in libindex, safe_sort
476485
cdef:
477486
Py_ssize_t i, n = len(values)
478487
int ret = 0
@@ -488,6 +497,7 @@ cdef class {{name}}HashTable(HashTable):
488497
@cython.boundscheck(False)
489498
def lookup(self, const {{dtype}}_t[:] values) -> ndarray:
490499
# -> np.ndarray[np.intp]
500+
# Used in safe_sort, IndexEngine.get_indexer
491501
cdef:
492502
Py_ssize_t i, n = len(values)
493503
int ret = 0

pandas/core/sorting.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -577,7 +577,7 @@ def get_flattened_list(
577577
arrays: DefaultDict[int, list[int]] = defaultdict(list)
578578
for labs, level in zip(labels, levels):
579579
table = hashtable.Int64HashTable(ngroups)
580-
table.map(comp_ids, labs.astype(np.int64, copy=False))
580+
table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
581581
for i in range(ngroups):
582582
arrays[i].append(level[table.get_item(i)])
583583
return [tuple(array) for array in arrays.values()]

pandas/tests/libs/test_hashtable.py

+138 -13
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from contextlib import contextmanager
2+
import struct
23
import tracemalloc
34

45
import numpy as np
@@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
7778
with pytest.raises(KeyError, match=str(index + 2)):
7879
table.get_item(index + 2)
7980

80-
def test_map(self, table_type, dtype, writable):
81-
# PyObjectHashTable has no map-method
82-
if table_type != ht.PyObjectHashTable:
81+
def test_map_keys_to_values(self, table_type, dtype, writable):
82+
# only Int64HashTable has this method
83+
if table_type == ht.Int64HashTable:
8384
N = 77
8485
table = table_type()
8586
keys = np.arange(N).astype(dtype)
8687
vals = np.arange(N).astype(np.int64) + N
8788
keys.flags.writeable = writable
8889
vals.flags.writeable = writable
89-
table.map(keys, vals)
90+
table.map_keys_to_values(keys, vals)
9091
for i in range(N):
9192
assert table.get_item(keys[i]) == i + N
9293

@@ -180,6 +181,124 @@ def test_no_reallocation(self, table_type, dtype, N):
180181
assert n_buckets_start == clean_table.get_state()["n_buckets"]
181182

182183

184+
class TestHashTableUnsorted:
185+
# TODO: moved from test_algos; may be redundancies with other tests
186+
def test_string_hashtable_set_item_signature(self):
187+
# GH#30419 fix typing in StringHashTable.set_item to prevent segfault
188+
tbl = ht.StringHashTable()
189+
190+
tbl.set_item("key", 1)
191+
assert tbl.get_item("key") == 1
192+
193+
with pytest.raises(TypeError, match="'key' has incorrect type"):
194+
# key arg typed as string, not object
195+
tbl.set_item(4, 6)
196+
with pytest.raises(TypeError, match="'val' has incorrect type"):
197+
tbl.get_item(4)
198+
199+
def test_lookup_nan(self, writable):
200+
# GH#21688 ensure we can deal with readonly memory views
201+
xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
202+
xs.setflags(write=writable)
203+
m = ht.Float64HashTable()
204+
m.map_locations(xs)
205+
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
206+
207+
def test_add_signed_zeros(self):
208+
# GH#21866 inconsistent hash-function for float64
209+
# default hash-function would lead to different hash-buckets
210+
# for 0.0 and -0.0 if there are more than 2^30 hash-buckets
211+
# but this would mean 16GB
212+
N = 4 # 12 * 10**8 would trigger the error, if you have enough memory
213+
m = ht.Float64HashTable(N)
214+
m.set_item(0.0, 0)
215+
m.set_item(-0.0, 0)
216+
assert len(m) == 1 # 0.0 and -0.0 are equivalent
217+
218+
def test_add_different_nans(self):
219+
# GH#21866 inconsistent hash-function for float64
220+
# create different nans from bit-patterns:
221+
NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
222+
NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
223+
assert NAN1 != NAN1
224+
assert NAN2 != NAN2
225+
# default hash function would lead to different hash-buckets
226+
# for NAN1 and NAN2 even if there are only 4 buckets:
227+
m = ht.Float64HashTable()
228+
m.set_item(NAN1, 0)
229+
m.set_item(NAN2, 0)
230+
assert len(m) == 1 # NAN1 and NAN2 are equivalent
231+
232+
def test_lookup_overflow(self, writable):
233+
xs = np.array([1, 2, 2**63], dtype=np.uint64)
234+
# GH 21688 ensure we can deal with readonly memory views
235+
xs.setflags(write=writable)
236+
m = ht.UInt64HashTable()
237+
m.map_locations(xs)
238+
tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
239+
240+
@pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case
241+
@pytest.mark.parametrize(
242+
"htable, uniques, dtype, safely_resizes",
243+
[
244+
(ht.PyObjectHashTable, ht.ObjectVector, "object", False),
245+
(ht.StringHashTable, ht.ObjectVector, "object", True),
246+
(ht.Float64HashTable, ht.Float64Vector, "float64", False),
247+
(ht.Int64HashTable, ht.Int64Vector, "int64", False),
248+
(ht.Int32HashTable, ht.Int32Vector, "int32", False),
249+
(ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
250+
],
251+
)
252+
def test_vector_resize(
253+
self, writable, htable, uniques, dtype, safely_resizes, nvals
254+
):
255+
# Test for memory errors after internal vector
256+
# reallocations (GH 7157)
257+
vals = np.array(np.random.randn(1000), dtype=dtype)
258+
259+
# GH 21688 ensures we can deal with read-only memory views
260+
vals.setflags(write=writable)
261+
262+
# initialise instances; cannot initialise in parametrization,
263+
# as otherwise external views would be held on the array (which is
264+
# one of the things this test is checking)
265+
htable = htable()
266+
uniques = uniques()
267+
268+
# get_labels may append to uniques
269+
htable.get_labels(vals[:nvals], uniques, 0, -1)
270+
# to_array() sets an external_view_exists flag on uniques.
271+
tmp = uniques.to_array()
272+
oldshape = tmp.shape
273+
274+
# subsequent get_labels() calls can no longer append to it
275+
# (except for StringHashTables + ObjectVector)
276+
if safely_resizes:
277+
htable.get_labels(vals, uniques, 0, -1)
278+
else:
279+
with pytest.raises(ValueError, match="external reference.*"):
280+
htable.get_labels(vals, uniques, 0, -1)
281+
282+
uniques.to_array() # should not raise here
283+
assert tmp.shape == oldshape
284+
285+
@pytest.mark.parametrize(
286+
"hashtable",
287+
[
288+
ht.PyObjectHashTable,
289+
ht.StringHashTable,
290+
ht.Float64HashTable,
291+
ht.Int64HashTable,
292+
ht.Int32HashTable,
293+
ht.UInt64HashTable,
294+
],
295+
)
296+
def test_hashtable_large_sizehint(self, hashtable):
297+
# GH#22729 smoketest for not raising when passing a large size_hint
298+
size_hint = np.iinfo(np.uint32).max + 1
299+
hashtable(size_hint=size_hint)
300+
301+
183302
class TestPyObjectHashTableWithNans:
184303
def test_nan_float(self):
185304
nan1 = float("nan")
@@ -322,15 +441,6 @@ def test_get_set_contains_len(self, table_type, dtype):
322441
assert index in table
323442
assert table.get_item(index) == 41
324443

325-
def test_map(self, table_type, dtype):
326-
N = 332
327-
table = table_type()
328-
keys = np.full(N, np.nan, dtype=dtype)
329-
vals = (np.arange(N) + N).astype(np.int64)
330-
table.map(keys, vals)
331-
assert len(table) == 1
332-
assert table.get_item(np.nan) == 2 * N - 1
333-
334444
def test_map_locations(self, table_type, dtype):
335445
N = 10
336446
table = table_type()
@@ -468,6 +578,21 @@ def test_unique_label_indices_intp(writable):
468578
tm.assert_numpy_array_equal(result, expected)
469579

470580

581+
def test_unique_label_indices():
582+
583+
a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
584+
585+
left = ht.unique_label_indices(a)
586+
right = np.unique(a, return_index=True)[1]
587+
588+
tm.assert_numpy_array_equal(left, right, check_dtype=False)
589+
590+
a[np.random.choice(len(a), 10)] = -1
591+
left = ht.unique_label_indices(a)
592+
right = np.unique(a, return_index=True)[1][1:]
593+
tm.assert_numpy_array_equal(left, right, check_dtype=False)
594+
595+
471596
@pytest.mark.parametrize(
472597
"dtype",
473598
[

pandas/tests/series/methods/test_unique.py

+6
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,12 @@
88

99

1010
class TestUnique:
11+
def test_unique_uint64(self):
12+
ser = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
13+
res = ser.unique()
14+
exp = np.array([1, 2, 2**63], dtype=np.uint64)
15+
tm.assert_numpy_array_equal(res, exp)
16+
1117
def test_unique_data_ownership(self):
1218
# it works! GH#1807
1319
Series(Series(["a", "c", "b"]).unique()).sort_values()

0 commit comments

Comments
 (0)