From 0dc5f4b56294ee7a10c676beaad99a100f496af7 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Fri, 21 Sep 2018 10:30:58 -0500 Subject: [PATCH 1/5] wip --- pandas/_libs/hashtable_class_helper.pxi.in | 2 ++ pandas/_version.py | 3 ++- pandas/tests/test_algos.py | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f294fd141a9f1..a5c7ba6494160 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -266,6 +266,7 @@ cdef class {{name}}HashTable(HashTable): def __cinit__(self, size_hint=1): self.table = kh_init_{{dtype}}() if size_hint is not None: + size_hint = min(size_hint, _SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) def __len__(self): @@ -498,6 +499,7 @@ cdef class StringHashTable(HashTable): def __init__(self, int size_hint=1): self.table = kh_init_str() if size_hint is not None: + size_hint = min(size_hint, _SIZE_HINT_LIMIT) kh_resize_str(self.table, size_hint) def __dealloc__(self): diff --git a/pandas/_version.py b/pandas/_version.py index f4c8938c683da..276a0de3d7131 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -243,7 +243,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): print(fmt.format(full_tag=full_tag, tag_prefix=tag_prefix)) pieces["error"] = ("tag '{full_tag}' doesn't start with " "prefix '{tag_prefix}'".format( - full_tag, tag_prefix)) + full_tag=full_tag, + tag_prefix=tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b2ddbf715b480..46b9ec9bca740 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1302,6 +1302,14 @@ def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) _test_vector_resize(tbl(), vect(), dtype, 10, 
safely_resizes) + @pytest.mark.parametrize('hashtable', [ + ht.PyObjectHashTable, ht.StringHashTable, + ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) + def test_hashtable_large_sizehint(self, hashtable): + # GH 22729 + size_hint = np.iinfo(np.uint32).max + 1 + tbl = hashtable(size_hint=size_hint) + def test_quantile(): s = Series(np.random.randn(100)) From 33f665adfe139180681548c20c1baa8af42cee82 Mon Sep 17 00:00:00 2001 From: chris Date: Sat, 22 Sep 2018 08:43:19 -0500 Subject: [PATCH 2/5] remaining cases --- pandas/_libs/hashtable_class_helper.pxi.in | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a5c7ba6494160..fca232627e57f 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -496,7 +496,7 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, int size_hint=1): + def __init__(self, size_hint=1): self.table = kh_init_str() if size_hint is not None: size_hint = min(size_hint, _SIZE_HINT_LIMIT) @@ -735,7 +735,9 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, size_hint=1): self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) + if size_hint is not None: + size_hint = min(size_hint, _SIZE_HINT_LIMIT) + kh_resize_pymap(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: From 74b3b457872621d7a67bdf31701efe0d677ce629 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Wed, 12 Dec 2018 09:54:01 -0600 Subject: [PATCH 3/5] whatsnew --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 80317d6806346..68e07c4ac28b9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1539,6 +1539,7 @@ Groupby/Resample/Rolling - Bug in 
:meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). - Calling :meth:`DataFrameGroupBy.rank` and :meth:`SeriesGroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` due to `c1068d9 `_ (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). +- Bug preventing hash table creation with a very large number (2^32) of rows (:issue:`22805`) Reshaping ^^^^^^^^^ From f2c4a1b28faf7fc090aa143e3b944d9f2ec1078c Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Wed, 12 Dec 2018 13:09:17 -0600 Subject: [PATCH 4/5] lint --- pandas/tests/test_algos.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 84b91eabf73ad..cae92d01f351c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1406,12 +1406,12 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) @pytest.mark.parametrize('hashtable', [ - ht.PyObjectHashTable, ht.StringHashTable, - ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) + ht.PyObjectHashTable, ht.StringHashTable, + ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) def test_hashtable_large_sizehint(self, hashtable): # GH 22729 size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) + tbl = hashtable(size_hint=size_hint) # noqa def test_quantile(): From d154b079bfc982a191379f625b9e5b33b5ee7143 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Thu, 13 Dec 2018 09:48:15 -0600 Subject: [PATCH 5/5] type ctor parameter --- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 
037000d928bee..eac35588b6fc3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -262,7 +262,7 @@ dtypes = [('Float64', 'float64', True, 'np.nan'), cdef class {{name}}HashTable(HashTable): - def __cinit__(self, size_hint=1): + def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() if size_hint is not None: size_hint = min(size_hint, _SIZE_HINT_LIMIT) @@ -574,7 +574,7 @@ cdef class StringHashTable(HashTable): # or a sentinel np.nan / None missing value na_string_sentinel = '__nan__' - def __init__(self, size_hint=1): + def __init__(self, int64_t size_hint=1): self.table = kh_init_str() if size_hint is not None: size_hint = min(size_hint, _SIZE_HINT_LIMIT) @@ -878,7 +878,7 @@ cdef class StringHashTable(HashTable): cdef class PyObjectHashTable(HashTable): - def __init__(self, size_hint=1): + def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() if size_hint is not None: size_hint = min(size_hint, _SIZE_HINT_LIMIT)