BUG: make hashtable.unique support readonly arrays (#18825)

hexgnu · jreback · commit 80a5399e59f4 · 2017-12-27T15:27:02.000-05:00
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -350,6 +350,7 @@ Reshaping
 - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
 - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
+- Bug in :func:`cut` which fails when passed read-only arrays (:issue:`18773`)
 - Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string.  The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
 
 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
           ('UInt64', 'uint64', 'False', False),
           ('Int64', 'int64', 'val == iNaT', False)]
 
+def get_dispatch(dtypes):  # generator: one tuple per dtype spec
+  for (name, dtype, null_condition, float_group) in dtypes:
+    unique_template = """\
+        cdef:
+           Py_ssize_t i, n = len(values)
+           int ret = 0
+           {dtype}_t val
+           khiter_t k
+           bint seen_na = 0
+           {name}Vector uniques = {name}Vector()
+           {name}VectorData *ud
+
+        ud = uniques.data
+
+        with nogil:
+            for i in range(n):
+                val = values[i]
+                IF {float_group}:
+                  if val == val:
+                      k = kh_get_{dtype}(self.table, val)
+                      if k == self.table.n_buckets:
+                          kh_put_{dtype}(self.table, val, &ret)
+                          if needs_resize(ud):
+                              with gil:
+                                  uniques.resize()
+                          append_data_{dtype}(ud, val)
+                  elif not seen_na:
+                      seen_na = 1
+                      if needs_resize(ud):
+                          with gil:
+                              uniques.resize()
+                      append_data_{dtype}(ud, NAN)
+                ELSE:
+                  k = kh_get_{dtype}(self.table, val)
+                  if k == self.table.n_buckets:
+                      kh_put_{dtype}(self.table, val, &ret)
+                      if needs_resize(ud):
+                          with gil:
+                              uniques.resize()
+                      append_data_{dtype}(ud, val)
+        return uniques.to_array()
+      """
+    # Fill the name, dtype and float_group placeholders; the template has no null_condition placeholder.
+    unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
+    # Pass the spec through unchanged, with the rendered method body appended as a fifth field.
+    yield (name, dtype, null_condition, float_group, unique_template)
 }}
 
 
-{{for name, dtype, null_condition, float_group in dtypes}}
+{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
 
 cdef class {{name}}HashTable(HashTable):
 
@@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable):
         return np.asarray(labels), arr_uniques
 
     @cython.boundscheck(False)
-    def unique(self, {{dtype}}_t[:] values):
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            {{dtype}}_t val
-            khiter_t k
-            bint seen_na = 0
-            {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
+    def unique(self, ndarray[{{dtype}}_t, ndim=1] values):  # ndarray-typed entry point; writeable input is handed to unique_memview
+        if values.flags.writeable:
+          # Writeable arrays can take the typed-memoryview path
+          return self.unique_memview(values)
 
-        ud = uniques.data
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-
-                {{if float_group}}
-                if val == val:
-                    k = kh_get_{{dtype}}(self.table, val)
-                    if k == self.table.n_buckets:
-                        kh_put_{{dtype}}(self.table, val, &ret)
-                        if needs_resize(ud):
-                            with gil:
-                                uniques.resize()
-                        append_data_{{dtype}}(ud, val)
-                elif not seen_na:
-                    seen_na = 1
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, NAN)
-                {{else}}
-                k = kh_get_{{dtype}}(self.table, val)
-                if k == self.table.n_buckets:
-                    kh_put_{{dtype}}(self.table, val, &ret)
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, val)
-                {{endif}}
+        # Cython's typed memoryviews cannot wrap read-only buffers
+        # (see https://github.com/cython/cython/issues/1605), so read-only
+        # input falls through to the ndarray-typed body below, which is
+        # slightly slower than the memoryview version.
+{{unique_template}}
 
-        return uniques.to_array()
+    @cython.boundscheck(False)
+    def unique_memview(self, {{dtype}}_t[:] values):  # typed-memoryview variant; body comes from the same unique_template
+{{unique_template}}
 
 {{endfor}}
 
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
@@ -512,7 +512,18 @@ def f():
         tm.assert_numpy_array_equal(
             mask, np.array([False, True, True, True, True]))
 
+    @pytest.mark.parametrize(
+        "array_1_writeable, array_2_writeable",
+        [(True, True), (True, False), (False, False)])
+    def test_cut_read_only(self, array_1_writeable, array_2_writeable):
+        # issue 18773: cut must give equal results whether or not its bins array is writeable
+        array_1 = np.arange(0, 100, 10)
+        array_1.flags.writeable = array_1_writeable
 
-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
+        array_2 = np.arange(0, 100, 10)
+        array_2.flags.writeable = array_2_writeable
+
+        hundred_elements = np.arange(100)
+
+        tm.assert_categorical_equal(cut(hundred_elements, array_1),
+                                    cut(hundred_elements, array_2))