Skip to content

Commit eb97319

Browse files
author
Mike Kelly
committed
BUG: hashtable memory error causes test_factorize_nan crash
1 parent 9fff73f commit eb97319

File tree

3 files changed

+31
-3
lines changed

3 files changed

+31
-3
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,7 @@ Bug Fixes
527527
(:issue:`7105`)
528528
- Bug in ``DatetimeIndex`` specifying ``freq`` raises ``ValueError`` when passed value is too short (:issue:`7098`)
529529
- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`)
530+
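- Fixed a memory error in the hashtable implementation, where appending to an internal vector after ``to_array()`` had resized it could write past the end of the buffer, causing crashes while running tests (:issue:`7157`)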
530531

531532
pandas 0.13.1
532533
-------------

pandas/hashtable.pyx

+6-3
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,12 @@ cdef class ObjectVector:
6666

6767
def to_array(self):
6868
self.ao.resize(self.n)
69+
self.m = self.n
6970
return self.ao
7071

7172
cdef inline append(self, object o):
7273
if self.n == self.m:
73-
self.m = self.m * 2
74+
self.m = max(self.m * 2, _INIT_VEC_CAP)
7475
self.ao.resize(self.m)
7576
self.data = <PyObject**> self.ao.data
7677

@@ -97,11 +98,12 @@ cdef class Int64Vector:
9798

9899
def to_array(self):
99100
self.ao.resize(self.n)
101+
self.m = self.n
100102
return self.ao
101103

102104
cdef inline append(self, int64_t x):
103105
if self.n == self.m:
104-
self.m = self.m * 2
106+
self.m = max(self.m * 2, _INIT_VEC_CAP)
105107
self.ao.resize(self.m)
106108
self.data = <int64_t*> self.ao.data
107109

@@ -126,11 +128,12 @@ cdef class Float64Vector:
126128

127129
def to_array(self):
128130
self.ao.resize(self.n)
131+
self.m = self.n
129132
return self.ao
130133

131134
cdef inline append(self, float64_t x):
132135
if self.n == self.m:
133-
self.m = self.m * 2
136+
self.m = max(self.m * 2, _INIT_VEC_CAP)
134137
self.ao.resize(self.m)
135138
self.data = <float64_t*> self.ao.data
136139

pandas/tests/test_hashtable.py

+24
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66

77
class TestFactorizer(unittest.TestCase):
8+
89
def test_factorize_nan(self):
910
# nan should map to na_sentinel, not reverse_indexer[na_sentinel]
1011
# rizer.factorize should not raise an exception if na_sentinel indexes
@@ -25,6 +26,29 @@ def test_factorize_nan(self):
2526
self.assertEqual(len(set(key)), len(set(expected)))
2627
self.assert_(np.array_equal(pd.isnull(key), expected == na_sentinel))
2728

29+
def test_vector_resize(self):
30+
# Test for memory errors after internal vector
31+
# reallocations (pull request #7157)
32+
33+
def _test_vector_resize(htable, uniques, dtype, nvals):
34+
vals = np.array(np.random.randn(1000), dtype=dtype)
35+
# get_labels appends to the vector
36+
htable.get_labels(vals[:nvals], uniques, 0, -1)
37+
# to_array resizes the vector
38+
uniques.to_array()
39+
htable.get_labels(vals, uniques, 0, -1)
40+
41+
test_cases = [
42+
(_hash.PyObjectHashTable, _hash.ObjectVector, 'object'),
43+
(_hash.Float64HashTable, _hash.Float64Vector, 'float64'),
44+
(_hash.Int64HashTable, _hash.Int64Vector, 'int64')]
45+
46+
for (tbl, vect, dtype) in test_cases:
47+
# resizing to empty is a special case: capacity becomes 0, so doubling alone would never grow it
48+
_test_vector_resize(tbl(), vect(), dtype, 0)
49+
_test_vector_resize(tbl(), vect(), dtype, 10)
50+
51+
2852
if __name__ == '__main__':
2953
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
3054
exit=False)

0 commit comments

Comments
 (0)