From fea11a771eefc609c9b49fd9e9c6ac7065f3de7b Mon Sep 17 00:00:00 2001 From: mattip Date: Fri, 5 May 2017 17:08:28 +0300 Subject: [PATCH 1/3] COMPAT/TEST test, fix for unsafe Vector.resize(), which allows refcheck=False --- pandas/_libs/hashtable.pxd | 1 + pandas/_libs/hashtable.pyx | 15 ++++++++ pandas/_libs/hashtable_class_helper.pxi.in | 37 ++++++++++++++++---- pandas/tests/test_algos.py | 40 +++++++++++++--------- 4 files changed, 71 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..3366751af144d 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -52,6 +52,7 @@ cdef struct Int64VectorData: cdef class Int64Vector: cdef Int64VectorData *data cdef ndarray ao + cdef bint external_view_exists cdef resize(self) cpdef to_array(self) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c8aedcef77502..b56c6d70a7dd8 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -64,6 +64,12 @@ cdef class Factorizer: >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ + if self.uniques.external_view_exists: + uniques = ObjectVector() + data = self.uniques.to_array() + for v in data: + uniques.append(v) + self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) mask = (labels == na_sentinel) @@ -99,6 +105,15 @@ cdef class Int64Factorizer: def factorize(self, int64_t[:] values, sort=False, na_sentinel=-1, check_null=True): + """ + Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) + array([ 0, 1, 20]) + """ + if self.uniques.external_view_exists: + uniques = Int64Vector() + uniques.extend(self.uniques.to_array()) + self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..b634869fc4ce5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -71,6 +71,7 @@ cdef class {{name}}Vector: {{if dtype != 'int64'}} cdef: + bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -80,6 +81,7 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype={{idtype}}) @@ -87,7 +89,7 @@ cdef class {{name}}Vector: cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) + self.ao.resize(self.data.m, refcheck=False) self.data.data = <{{arg}}*> self.ao.data def __dealloc__(self): @@ -99,13 +101,19 @@ cdef class {{name}}Vector: return self.data.n cpdef to_array(self): - self.ao.resize(self.data.n) + if self.data.m != self.data.n: + if self.external_view_exists: + raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n)) + self.ao.resize(self.data.n, refcheck=False) self.data.m = self.data.n + self.external_view_exists = True return self.ao cdef inline void append(self, {{arg}} x): if needs_resize(self.data): + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.resize() append_data_{{dtype}}(self.data, x) @@ -120,15 +128,19 @@ cdef class StringVector: cdef: StringVectorData *data + bint external_view_exists def 
__cinit__(self): self.data = PyMem_Malloc( sizeof(StringVectorData)) if not self.data: raise MemoryError() + self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() cdef resize(self): cdef: @@ -138,9 +150,10 @@ cdef class StringVector: m = self.data.m self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - # TODO: can resize? orig_data = self.data.data self.data.data = malloc(self.data.m * sizeof(char *)) + if not self.data.data: + raise MemoryError() for i in range(m): self.data.data[i] = orig_data[i] @@ -164,6 +177,7 @@ cdef class StringVector: for i in range(self.data.n): val = self.data.data[i] ao[i] = val + self.external_view_exists = True self.data.m = self.data.n return ao @@ -181,8 +195,10 @@ cdef class ObjectVector: PyObject **data size_t n, m ndarray ao + bint external_view_exists def __cinit__(self): + self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) @@ -193,8 +209,10 @@ cdef class ObjectVector: cdef inline append(self, object o): if self.n == self.m: + if self.external_view_exists: + raise ValueError("external reference but Vector.resize() needed") self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) + self.ao.resize(self.m, refcheck=False) self.data = self.ao.data Py_INCREF(o) @@ -202,8 +220,12 @@ cdef class ObjectVector: self.n += 1 def to_array(self): - self.ao.resize(self.n) - self.m = self.n + if self.m != self.n: + if self.external_view_exists: + raise ValueError("should have raised on append()") + self.ao.resize(self.n, refcheck=False) + self.m = self.n + self.external_view_exists = True return self.ao @@ -362,6 +384,9 @@ cdef class {{name}}HashTable(HashTable): if needs_resize(ud): with gil: + if uniques.external_view_exists: + raise ValueError("external reference to uniques held, " + "but Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) labels[i] = count diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 093730fb2478b..067c56443ec4e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -14,7 +14,7 @@ from pandas import compat from pandas._libs import (groupby as libgroupby, algos as libalgos, - hashtable) + hashtable as ht) from pandas._libs.hashtable import unique_label_indices from pandas.compat import lrange, range import pandas.core.algorithms as algos @@ -259,7 +259,7 @@ def test_factorize_nan(self): # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer key = np.array([1, 2, 1, np.nan], dtype='O') - rizer = hashtable.Factorizer(len(key)) + rizer = ht.Factorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) expected = np.array([0, 1, 0, na_sentinel], dtype='int32') @@ -1049,14 +1049,14 @@ class TestHashTable(object): def test_lookup_nan(self): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) - m = hashtable.Float64HashTable() + m = ht.Float64HashTable() m.map_locations(xs) tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_lookup_overflow(self): xs = np.array([1, 2, 2**63], dtype=np.uint64) - m = hashtable.UInt64HashTable() + m = ht.UInt64HashTable() m.map_locations(xs) tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) @@ -1070,25 +1070,33 @@ def test_vector_resize(self): # Test for memory errors after internal vector # reallocations 
(pull request #7157) - def _test_vector_resize(htable, uniques, dtype, nvals): + def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): vals = np.array(np.random.randn(1000), dtype=dtype) # get_labels appends to the vector htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array resizes the vector - uniques.to_array() - htable.get_labels(vals, uniques, 0, -1) + # to_array may resize the vector + tmp = uniques.to_array() + oldshape = tmp.shape + if safely_resizes: + htable.get_labels(vals, uniques, 0, -1) + else: + with pytest.raises(ValueError) as excinfo: + htable.get_labels(vals, uniques, 0, -1) + assert str(excinfo.value).startswith('external reference') + uniques.to_array() # should not raise + assert tmp.shape == oldshape test_cases = [ - (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), - (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), - (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), - (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] + (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), + (ht.StringHashTable, ht.ObjectVector, 'object', True), + (ht.Float64HashTable, ht.Float64Vector, 'float64', False), + (ht.Int64HashTable, ht.Int64Vector, 'int64', False), + (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)] - for (tbl, vect, dtype) in test_cases: + for (tbl, vect, dtype, safely_resizes) in test_cases: # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0) - _test_vector_resize(tbl(), vect(), dtype, 10) + _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes) + _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes) def test_quantile(): From 085baf9b9c74876e61ee404c8dad8a8d910a55ad Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 10 May 2017 14:54:29 +0300 Subject: [PATCH 2/3] COMPAT/TEST improve error msg, document test as per review --- pandas/_libs/hashtable_class_helper.pxi.in | 5 +++-- pandas/tests/test_algos.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b634869fc4ce5..d1cd7b1f41768 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -103,9 +103,10 @@ cdef class {{name}}Vector: cpdef to_array(self): if self.data.m != self.data.n: if self.external_view_exists: - raise ValueError("should have raised on append(), m=%d n=%d" % (self.data.m, self.data.n)) + # should never happen + raise ValueError("should have raised on append()") self.ao.resize(self.data.n, refcheck=False) - self.data.m = self.data.n + self.data.m = self.data.n self.external_view_exists = True return self.ao diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 067c56443ec4e..351e646cbb0b2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1072,18 +1072,20 @@ def test_vector_resize(self): def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes): vals = np.array(np.random.randn(1000), dtype=dtype) - # get_labels appends to the vector + # get_labels may append to uniques htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array may resize the vector + # to_array() set an external_view_exists flag on uniques. 
tmp = uniques.to_array() oldshape = tmp.shape + # subsequent get_labels() calls can no longer append to it + # (for all but StringHashTables + ObjectVector) if safely_resizes: htable.get_labels(vals, uniques, 0, -1) else: with pytest.raises(ValueError) as excinfo: htable.get_labels(vals, uniques, 0, -1) assert str(excinfo.value).startswith('external reference') - uniques.to_array() # should not raise + uniques.to_array() # should not raise here assert tmp.shape == oldshape test_cases = [ From 247cc75cfbeb58396c4bb119bdb5f0d1ff2463aa Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 10 May 2017 16:28:40 +0300 Subject: [PATCH 3/3] COMPAT/TEST unify interfaces as per review --- pandas/_libs/hashtable.pyx | 4 +--- pandas/_libs/hashtable_class_helper.pxi.in | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index b56c6d70a7dd8..101e2c031f26e 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -66,9 +66,7 @@ cdef class Factorizer: """ if self.uniques.external_view_exists: uniques = ObjectVector() - data = self.uniques.to_array() - for v in data: - uniques.append(v) + uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel, check_null) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index d1cd7b1f41768..b80a592669eca 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -189,6 +189,9 @@ cdef class StringVector: append_data_string(self.data, x) + cdef extend(self, ndarray[:] x): + for i in range(len(x)): + self.append(x[i]) cdef class ObjectVector: @@ -229,6 +232,9 @@ cdef class ObjectVector: self.external_view_exists = True return self.ao + cdef extend(self, ndarray[:] x): + for i in range(len(x)): + self.append(x[i]) #---------------------------------------------------------------------- # HashTable
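
Background on the hazard the patches above guard against, reproduced with plain numpy (context only, not part of the patches): ndarray.resize() refuses to reallocate while another array references its buffer, and refcheck=False bypasses that check.

    import numpy as np

    a = np.arange(4)
    v = a[:2]                    # v is a view into a's buffer
    try:
        a.resize(8)              # default refcheck=True: refuses while the view exists
    except ValueError as exc:
        print(exc)
    a.resize(8, refcheck=False)  # proceeds anyway; v may now reference freed memory

A minimal pure-Python sketch of the guard pattern the Cython Vector classes gain in these patches; MiniVector below is a hypothetical stand-in used only to illustrate the append()/to_array()/external_view_exists interaction, not pandas code:

    import numpy as np

    class MiniVector:
        """Grow a backing ndarray; refuse to reallocate once to_array() handed it out."""

        def __init__(self):
            self.ao = np.empty(4, dtype=np.int64)  # tiny initial capacity for the demo
            self.n = 0
            self.external_view_exists = False

        def append(self, x):
            if self.n == len(self.ao):
                if self.external_view_exists:
                    # resizing here would change the shape (and possibly relocate the
                    # buffer) of the array the caller received from to_array()
                    raise ValueError("external reference but Vector.resize() needed")
                self.ao.resize(len(self.ao) * 4, refcheck=False)
            self.ao[self.n] = x
            self.n += 1

        def to_array(self):
            if len(self.ao) != self.n:
                self.ao.resize(self.n, refcheck=False)  # trim unused capacity
            self.external_view_exists = True            # caller now shares the buffer
            return self.ao

    vec = MiniVector()
    for i in range(4):
        vec.append(i)
    out = vec.to_array()   # hands out the internal ndarray itself
    try:
        vec.append(99)     # capacity exhausted -> resize needed -> refused
    except ValueError as exc:
        print(exc)
    print(out)             # [0 1 2 3], unchanged under the caller

This mirrors the flow exercised by test_vector_resize(): get_labels() fills the vector, to_array() marks the external reference, and a further get_labels() call that would need a resize raises instead of silently invalidating the array already handed out (the test marks StringHashTable + ObjectVector as the one combination that can still resize safely).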