Skip to content

Commit a161f18

Browse files
committed
Merge branch 'hashtable' of https://github.com/mtkni/pandas into mtkni-hashtable
Conflicts: doc/source/release.rst
2 parents cd6b4cf + eb97319 commit a161f18

File tree

4 files changed

+51
-33
lines changed

4 files changed

+51
-33
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,7 @@ Bug Fixes
528528
- Bug in ``DatetimeIndex`` where specifying ``freq`` raises ``ValueError`` when the passed value is too short (:issue:`7098`)
529529
- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`)
530530
- Bug in ``PeriodIndex`` string slicing with out-of-bounds values (:issue:`5407`)
531+
- Fixed a memory error in the hashtable implementation/factorizer on resizing of large tables (:issue:`7157`)
531532

532533
pandas 0.13.1
533534
-------------

pandas/hashtable.pyx

+6-3
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,12 @@ cdef class ObjectVector:
6666

6767
def to_array(self):
6868
self.ao.resize(self.n)
69+
self.m = self.n
6970
return self.ao
7071

7172
cdef inline append(self, object o):
7273
if self.n == self.m:
73-
self.m = self.m * 2
74+
self.m = max(self.m * 2, _INIT_VEC_CAP)
7475
self.ao.resize(self.m)
7576
self.data = <PyObject**> self.ao.data
7677

@@ -97,11 +98,12 @@ cdef class Int64Vector:
9798

9899
def to_array(self):
99100
self.ao.resize(self.n)
101+
self.m = self.n
100102
return self.ao
101103

102104
cdef inline append(self, int64_t x):
103105
if self.n == self.m:
104-
self.m = self.m * 2
106+
self.m = max(self.m * 2, _INIT_VEC_CAP)
105107
self.ao.resize(self.m)
106108
self.data = <int64_t*> self.ao.data
107109

@@ -126,11 +128,12 @@ cdef class Float64Vector:
126128

127129
def to_array(self):
128130
self.ao.resize(self.n)
131+
self.m = self.n
129132
return self.ao
130133

131134
cdef inline append(self, float64_t x):
132135
if self.n == self.m:
133-
self.m = self.m * 2
136+
self.m = max(self.m * 2, _INIT_VEC_CAP)
134137
self.ao.resize(self.m)
135138
self.data = <float64_t*> self.ao.data
136139

pandas/tests/test_algos.py

+44
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas.core.algorithms as algos
99
import pandas.util.testing as tm
10+
import pandas.hashtable as hashtable
1011

1112
class TestMatch(tm.TestCase):
1213
_multiprocess_can_split_ = True
@@ -122,6 +123,49 @@ def test_datelike(self):
122123
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
123124
self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))
124125

126+
def test_factorize_nan(self):
127+
# nan should map to na_sentinel, not reverse_indexer[na_sentinel]
128+
# rizer.factorize should not raise an exception if na_sentinel indexes
129+
# outside of reverse_indexer
130+
key = np.array([1, 2, 1, np.nan], dtype='O')
131+
rizer = hashtable.Factorizer(len(key))
132+
for na_sentinel in (-1, 20):
133+
ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
134+
expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
135+
self.assertEqual(len(set(key)), len(set(expected)))
136+
self.assertTrue(np.array_equal(pd.isnull(key), expected == na_sentinel))
137+
138+
# nan still maps to na_sentinel when sort=False
139+
key = np.array([0, np.nan, 1], dtype='O')
140+
na_sentinel = -1
141+
ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel)
142+
expected = np.array([ 2, -1, 0], dtype='int32')
143+
self.assertEqual(len(set(key)), len(set(expected)))
144+
self.assertTrue(np.array_equal(pd.isnull(key), expected == na_sentinel))
145+
146+
def test_vector_resize(self):
147+
# Test for memory errors after internal vector
148+
# reallocations (pull request #7157)
149+
150+
def _test_vector_resize(htable, uniques, dtype, nvals):
151+
vals = np.array(np.random.randn(1000), dtype=dtype)
152+
# get_labels appends to the vector
153+
htable.get_labels(vals[:nvals], uniques, 0, -1)
154+
# to_array resizes the vector
155+
uniques.to_array()
156+
htable.get_labels(vals, uniques, 0, -1)
157+
158+
test_cases = [
159+
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
160+
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
161+
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
162+
163+
for (tbl, vect, dtype) in test_cases:
164+
# resizing to empty is a special case
165+
_test_vector_resize(tbl(), vect(), dtype, 0)
166+
_test_vector_resize(tbl(), vect(), dtype, 10)
167+
168+
125169
class TestUnique(tm.TestCase):
126170
_multiprocess_can_split_ = True
127171

pandas/tests/test_hashtable.py

-30
This file was deleted.

0 commit comments

Comments
 (0)