Merge branch 'hashtable' of https://github.com/mtkni/pandas into mtkni-hashtable

jreback · jreback · commit a161f1841a75 · 2014-05-18T15:56:08.000-04:00
Conflicts:
	doc/source/release.rst
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -528,6 +528,7 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` specifying ``freq`` raises ``ValueError`` when passed value is too short (:issue:`7098`)
 - Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`)
 - Bug ``PeriodIndex`` string slicing with out of bounds values (:issue:`5407`)
+- Fixed a memory error in the hashtable implementation/factorizer on resizing of large tables (:issue:`7157`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -66,11 +66,12 @@ cdef class ObjectVector:
 
     def to_array(self):
         self.ao.resize(self.n)
+        self.m = self.n  # buffer now holds exactly n slots; keep capacity in sync so append() regrows it
         return self.ao
 
     cdef inline append(self, object o):
         if self.n == self.m:
-            self.m = self.m * 2
+            self.m = max(self.m * 2, _INIT_VEC_CAP)
             self.ao.resize(self.m)
             self.data = <PyObject**> self.ao.data
 
@@ -97,11 +98,12 @@ cdef class Int64Vector:
 
     def to_array(self):
         self.ao.resize(self.n)
+        self.m = self.n  # buffer now holds exactly n slots; keep capacity in sync so append() regrows it
         return self.ao
 
     cdef inline append(self, int64_t x):
         if self.n == self.m:
-            self.m = self.m * 2
+            self.m = max(self.m * 2, _INIT_VEC_CAP)
             self.ao.resize(self.m)
             self.data = <int64_t*> self.ao.data
 
@@ -126,11 +128,12 @@ cdef class Float64Vector:
 
     def to_array(self):
         self.ao.resize(self.n)
+        self.m = self.n  # buffer now holds exactly n slots; keep capacity in sync so append() regrows it
         return self.ao
 
     cdef inline append(self, float64_t x):
         if self.n == self.m:
-            self.m = self.m * 2
+            self.m = max(self.m * 2, _INIT_VEC_CAP)
             self.ao.resize(self.m)
             self.data = <float64_t*> self.ao.data
 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -7,6 +7,7 @@
 
 import pandas.core.algorithms as algos
 import pandas.util.testing as tm
+import pandas.hashtable as hashtable
 
 class TestMatch(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -122,6 +123,54 @@ def test_datelike(self):
         self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
         self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))
 
+    def test_factorize_nan(self):
+        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
+        # rizer.factorize should not raise an exception if na_sentinel indexes
+        # outside of reverse_indexer
+        key = np.array([1, 2, 1, np.nan], dtype='O')
+        rizer = hashtable.Factorizer(len(key))
+        for na_sentinel in (-1, 20):
+            ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
+            expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
+            self.assertEqual(len(set(key)), len(set(expected)))
+            self.assertTrue(np.array_equal(pd.isnull(key), expected == na_sentinel))
+            # verify the labels factorize actually returned, not only the fixture
+            self.assertTrue(np.array_equal(ids, expected))
+
+        # nan still maps to na_sentinel when sort=False
+        # NOTE(review): rizer is reused from above, which is presumably why 1
+        # keeps label 0 and the new value 0 gets label 2 -- confirm
+        key = np.array([0, np.nan, 1], dtype='O')
+        na_sentinel = -1
+        ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel)
+        expected = np.array([ 2, -1,  0], dtype='int32')
+        self.assertEqual(len(set(key)), len(set(expected)))
+        self.assertTrue(np.array_equal(pd.isnull(key), expected == na_sentinel))
+        self.assertTrue(np.array_equal(ids, expected))
+
+    def test_vector_resize(self):
+        # Regression test: appending to a vector after to_array() resized its
+        # buffer must not cause memory errors (pull request #7157)
+
+        def _test_vector_resize(htable, uniques, dtype, nvals):
+            vals = np.array(np.random.randn(1000), dtype=dtype)
+            # get_labels appends to the uniques vector
+            htable.get_labels(vals[:nvals], uniques, 0, -1)
+            # to_array resizes the vector; the next get_labels appends again
+            uniques.to_array()
+            htable.get_labels(vals, uniques, 0, -1)
+
+        test_cases = [
+            (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
+            (hashtable.Float64HashTable,  hashtable.Float64Vector, 'float64'),
+            (hashtable.Int64HashTable,    hashtable.Int64Vector, 'int64')]
+
+        for (tbl, vect, dtype) in test_cases:
+            # resizing to empty is a special case (nvals=0: nothing appended yet)
+            _test_vector_resize(tbl(), vect(), dtype, 0)
+            _test_vector_resize(tbl(), vect(), dtype, 10)
+
+
 class TestUnique(tm.TestCase):
     _multiprocess_can_split_ = True
 
diff --git a/pandas/tests/test_hashtable.py b/pandas/tests/test_hashtable.py