BUG: make hashtable.unique support readonly arrays

hexgnu · hexgnu · commit 4f1e2aaceba1 · 2017-12-21T17:59:42.000-08:00
This problem was brought up in #18773 and effectively comes down to how Cython deals with readonly arrays. While it would be ideal for Cython to fix the underlying problem, in the meantime we can rely on this workaround.

Follow-up commits squashed into this change:

- fix: updates one_to_hundred for hundred_elements. This is because arange(100) isn't actually 1 to 100; it's 0 to 99.
- docs: adds comment to fix using ndarray and fixes indenting
- test: parametrize test for test_readonly_cut
- doc: add new whatsnew entry for v0.23.0
- fix: checkout existing upstream v0.22.0
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -341,7 +341,7 @@ Reshaping
 - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
 - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
--
+- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
 
 Numeric
 ^^^^^^^
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
           ('UInt64', 'uint64', 'False', False),
           ('Int64', 'int64', 'val == iNaT', False)]
 
+def get_dispatch(dtypes):
+  for (name, dtype, null_condition, float_group) in dtypes:
+    unique_template = """\
+        cdef:
+           Py_ssize_t i, n = len(values)
+           int ret = 0
+           {dtype}_t val
+           khiter_t k
+           bint seen_na = 0
+           {name}Vector uniques = {name}Vector()
+           {name}VectorData *ud
+
+        ud = uniques.data
+
+        with nogil:
+            for i in range(n):
+                val = values[i]
+                IF {float_group}:
+                  if val == val:
+                      k = kh_get_{dtype}(self.table, val)
+                      if k == self.table.n_buckets:
+                          kh_put_{dtype}(self.table, val, &ret)
+                          if needs_resize(ud):
+                              with gil:
+                                  uniques.resize()
+                          append_data_{dtype}(ud, val)
+                  elif not seen_na:
+                      seen_na = 1
+                      if needs_resize(ud):
+                          with gil:
+                              uniques.resize()
+                      append_data_{dtype}(ud, NAN)
+                ELSE:
+                  k = kh_get_{dtype}(self.table, val)
+                  if k == self.table.n_buckets:
+                      kh_put_{dtype}(self.table, val, &ret)
+                      if needs_resize(ud):
+                          with gil:
+                              uniques.resize()
+                      append_data_{dtype}(ud, val)
+        return uniques.to_array()
+      """
+
+    unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
+
+    yield (name, dtype, null_condition, float_group, unique_template)
 }}
 
 
-{{for name, dtype, null_condition, float_group in dtypes}}
+{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
 
 cdef class {{name}}HashTable(HashTable):
 
@@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable):
         return np.asarray(labels), arr_uniques
 
     @cython.boundscheck(False)
-    def unique(self, {{dtype}}_t[:] values):
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            {{dtype}}_t val
-            khiter_t k
-            bint seen_na = 0
-            {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
+    def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
+        if values.flags.writeable:
+          # If the value is writeable (mutable) then use memview
+          return self.unique_memview(values)
 
-        ud = uniques.data
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-
-                {{if float_group}}
-                if val == val:
-                    k = kh_get_{{dtype}}(self.table, val)
-                    if k == self.table.n_buckets:
-                        kh_put_{{dtype}}(self.table, val, &ret)
-                        if needs_resize(ud):
-                            with gil:
-                                uniques.resize()
-                        append_data_{{dtype}}(ud, val)
-                elif not seen_na:
-                    seen_na = 1
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, NAN)
-                {{else}}
-                k = kh_get_{{dtype}}(self.table, val)
-                if k == self.table.n_buckets:
-                    kh_put_{{dtype}}(self.table, val, &ret)
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, val)
-                {{endif}}
+        # We cannot use the memoryview version on readonly-buffers due to
+        # a limitation of Cython's typed memoryviews. Instead we can use
+        # the slightly slower Cython ndarray type directly.
+        # see https://github.com/cython/cython/issues/1605
+{{unique_template}}
 
-        return uniques.to_array()
+    @cython.boundscheck(False)
+    def unique_memview(self, {{dtype}}_t[:] values):
+{{unique_template}}
 
 {{endfor}}
 
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -512,6 +512,20 @@ def f():
         tm.assert_numpy_array_equal(
             mask, np.array([False, True, True, True, True]))
 
+    @pytest.mark.parametrize("array_1_writeable,array_2_writeable",[
+        (True, True), (True, False), (False, False)])
+    def test_cut_read_only(self, array_1_writeable, array_2_writeable):
+        # issue 18773
+        array_1 = np.arange(0, 100, 10)
+        array_1.flags.writeable = array_1_writeable
+
+        array_2 = np.arange(0, 100, 10)
+        array_2.flags.writeable = array_2_writeable
+
+        hundred_elements = np.arange(100)
+
+        tm.assert_categorical_equal(cut(hundred_elements, array_1),
+                                    cut(hundred_elements, array_2))
 
 def curpath():
     pth, _ = os.path.split(os.path.abspath(__file__))