Skip to content

Commit 8423485

Browse files
chris-b1 authored and Pingviinituutti committed
BUG: Hashtable size hint cap (pandas-dev#22805)
1 parent 2b23112 commit 8423485

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -1828,6 +1828,7 @@ Groupby/Resample/Rolling
18281828
- Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`)
18291829
- Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`).
18301830
- Bug in :meth:`DataFrame.groupby` did not respect the ``observed`` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`)
1831+
- Bug preventing hash table creation with a very large number (2^32) of rows (:issue:`22805`)
18311832

18321833
Reshaping
18331834
^^^^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

+8-4
Original file line number | Diff line number | Diff line change
@@ -262,9 +262,10 @@ dtypes = [('Float64', 'float64', True, 'np.nan'),
262262

263263
cdef class {{name}}HashTable(HashTable):
264264

265-
def __cinit__(self, size_hint=1):
265+
def __cinit__(self, int64_t size_hint=1):
266266
self.table = kh_init_{{dtype}}()
267267
if size_hint is not None:
268+
size_hint = min(size_hint, _SIZE_HINT_LIMIT)
268269
kh_resize_{{dtype}}(self.table, size_hint)
269270

270271
def __len__(self):
@@ -573,9 +574,10 @@ cdef class StringHashTable(HashTable):
573574
# or a sentinel np.nan / None missing value
574575
na_string_sentinel = '__nan__'
575576

576-
def __init__(self, int size_hint=1):
577+
def __init__(self, int64_t size_hint=1):
577578
self.table = kh_init_str()
578579
if size_hint is not None:
580+
size_hint = min(size_hint, _SIZE_HINT_LIMIT)
579581
kh_resize_str(self.table, size_hint)
580582

581583
def __dealloc__(self):
@@ -876,9 +878,11 @@ cdef class StringHashTable(HashTable):
876878

877879
cdef class PyObjectHashTable(HashTable):
878880

879-
def __init__(self, size_hint=1):
881+
def __init__(self, int64_t size_hint=1):
880882
self.table = kh_init_pymap()
881-
kh_resize_pymap(self.table, size_hint)
883+
if size_hint is not None:
884+
size_hint = min(size_hint, _SIZE_HINT_LIMIT)
885+
kh_resize_pymap(self.table, size_hint)
882886

883887
def __dealloc__(self):
884888
if self.table is not NULL:

pandas/tests/test_algos.py

+8
Original file line number | Diff line number | Diff line change
@@ -1410,6 +1410,14 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
14101410
expected_reconstruct = s_duplicated.dropna().values
14111411
tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
14121412

1413+
@pytest.mark.parametrize('hashtable', [
1414+
ht.PyObjectHashTable, ht.StringHashTable,
1415+
ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable])
1416+
def test_hashtable_large_sizehint(self, hashtable):
1417+
# GH 22729
1418+
size_hint = np.iinfo(np.uint32).max + 1
1419+
tbl = hashtable(size_hint=size_hint) # noqa
1420+
14131421

14141422
def test_quantile():
14151423
s = Series(np.random.randn(100))

0 commit comments

Comments
 (0)