|
1 | 1 | from contextlib import contextmanager
|
| 2 | +import struct |
2 | 3 | import tracemalloc
|
3 | 4 |
|
4 | 5 | import numpy as np
|
@@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
|
77 | 78 | with pytest.raises(KeyError, match=str(index + 2)):
|
78 | 79 | table.get_item(index + 2)
|
79 | 80 |
|
80 |
| - def test_map(self, table_type, dtype, writable): |
81 |
| - # PyObjectHashTable has no map-method |
82 |
| - if table_type != ht.PyObjectHashTable: |
| 81 | + def test_map_keys_to_values(self, table_type, dtype, writable): |
| 82 | + # only Int64HashTable has this method |
| 83 | + if table_type == ht.Int64HashTable: |
83 | 84 | N = 77
|
84 | 85 | table = table_type()
|
85 | 86 | keys = np.arange(N).astype(dtype)
|
86 | 87 | vals = np.arange(N).astype(np.int64) + N
|
87 | 88 | keys.flags.writeable = writable
|
88 | 89 | vals.flags.writeable = writable
|
89 |
| - table.map(keys, vals) |
| 90 | + table.map_keys_to_values(keys, vals) |
90 | 91 | for i in range(N):
|
91 | 92 | assert table.get_item(keys[i]) == i + N
|
92 | 93 |
|
@@ -180,6 +181,124 @@ def test_no_reallocation(self, table_type, dtype, N):
|
180 | 181 | assert n_buckets_start == clean_table.get_state()["n_buckets"]
|
181 | 182 |
|
182 | 183 |
|
class TestHashTableUnsorted:
    # TODO: moved from test_algos; may be redundancies with other tests
    def test_string_hashtable_set_item_signature(self):
        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
        tbl = ht.StringHashTable()

        tbl.set_item("key", 1)
        assert tbl.get_item("key") == 1

        with pytest.raises(TypeError, match="'key' has incorrect type"):
            # key arg typed as string, not object
            tbl.set_item(4, 6)
        with pytest.raises(TypeError, match="'val' has incorrect type"):
            tbl.get_item(4)

    def test_lookup_nan(self, writable):
        # GH#21688 ensure we can deal with readonly memory views
        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
        xs.setflags(write=writable)
        m = ht.Float64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    def test_add_signed_zeros(self):
        # GH#21866 inconsistent hash-function for float64
        # default hash-function would lead to different hash-buckets
        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
        # but this would mean 16GB
        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
        m = ht.Float64HashTable(N)
        m.set_item(0.0, 0)
        m.set_item(-0.0, 0)
        assert len(m) == 1  # 0.0 and -0.0 are equivalent

    def test_add_different_nans(self):
        # GH#21866 inconsistent hash-function for float64
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        # NaN never compares equal to itself, even for the same bit pattern
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        # default hash function would lead to different hash-buckets
        # for NAN1 and NAN2 even if there are only 4 buckets:
        m = ht.Float64HashTable()
        m.set_item(NAN1, 0)
        m.set_item(NAN2, 0)
        assert len(m) == 1  # NAN1 and NAN2 are equivalent

    def test_lookup_overflow(self, writable):
        # 2**63 does not fit in int64, so this exercises the uint64 code path
        xs = np.array([1, 2, 2**63], dtype=np.uint64)
        # GH 21688 ensure we can deal with readonly memory views
        xs.setflags(write=writable)
        m = ht.UInt64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
    @pytest.mark.parametrize(
        "htable, uniques, dtype, safely_resizes",
        [
            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
            (ht.StringHashTable, ht.ObjectVector, "object", True),
            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
        ],
    )
    def test_vector_resize(
        self, writable, htable, uniques, dtype, safely_resizes, nvals
    ):
        # Test for memory errors after internal vector
        # reallocations (GH 7157)
        vals = np.array(np.random.randn(1000), dtype=dtype)

        # GH 21688 ensures we can deal with read-only memory views
        vals.setflags(write=writable)

        # initialise instances; cannot initialise in parametrization,
        # as otherwise external views would be held on the array (which is
        # one of the things this test is checking)
        htable = htable()
        uniques = uniques()

        # get_labels may append to uniques
        htable.get_labels(vals[:nvals], uniques, 0, -1)
        # to_array() sets an external_view_exists flag on uniques.
        tmp = uniques.to_array()
        oldshape = tmp.shape

        # subsequent get_labels() calls can no longer append to it
        # (except for StringHashTables + ObjectVector)
        if safely_resizes:
            htable.get_labels(vals, uniques, 0, -1)
        else:
            with pytest.raises(ValueError, match="external reference.*"):
                htable.get_labels(vals, uniques, 0, -1)

        uniques.to_array()  # should not raise here
        assert tmp.shape == oldshape

    @pytest.mark.parametrize(
        "hashtable",
        [
            ht.PyObjectHashTable,
            ht.StringHashTable,
            ht.Float64HashTable,
            ht.Int64HashTable,
            ht.Int32HashTable,
            ht.UInt64HashTable,
        ],
    )
    def test_hashtable_large_sizehint(self, hashtable):
        # GH#22729 smoketest for not raising when passing a large size_hint
        size_hint = np.iinfo(np.uint32).max + 1
        hashtable(size_hint=size_hint)
| 301 | + |
183 | 302 | class TestPyObjectHashTableWithNans:
|
184 | 303 | def test_nan_float(self):
|
185 | 304 | nan1 = float("nan")
|
@@ -322,15 +441,6 @@ def test_get_set_contains_len(self, table_type, dtype):
|
322 | 441 | assert index in table
|
323 | 442 | assert table.get_item(index) == 41
|
324 | 443 |
|
325 |
| - def test_map(self, table_type, dtype): |
326 |
| - N = 332 |
327 |
| - table = table_type() |
328 |
| - keys = np.full(N, np.nan, dtype=dtype) |
329 |
| - vals = (np.arange(N) + N).astype(np.int64) |
330 |
| - table.map(keys, vals) |
331 |
| - assert len(table) == 1 |
332 |
| - assert table.get_item(np.nan) == 2 * N - 1 |
333 |
| - |
334 | 444 | def test_map_locations(self, table_type, dtype):
|
335 | 445 | N = 10
|
336 | 446 | table = table_type()
|
@@ -468,6 +578,21 @@ def test_unique_label_indices_intp(writable):
|
468 | 578 | tm.assert_numpy_array_equal(result, expected)
|
469 | 579 |
|
470 | 580 |
|
def test_unique_label_indices():
    # unique_label_indices must agree with np.unique(..., return_index=True)
    labels = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)

    result = ht.unique_label_indices(labels)
    expected = np.unique(labels, return_index=True)[1]
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)

    # -1 entries are treated as missing labels and must be excluded;
    # np.unique sorts values, so -1 (the minimum) occupies slot 0.
    labels[np.random.choice(len(labels), 10)] = -1
    result = ht.unique_label_indices(labels)
    expected = np.unique(labels, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)
| 595 | + |
471 | 596 | @pytest.mark.parametrize(
|
472 | 597 | "dtype",
|
473 | 598 | [
|
|
0 commit comments