Skip to content

Commit 21c65ae

Browse files
committed
BUG make hashtable.unique support readonly arrays
This problem was brought up in #18773 and effectively comes down to how Cython deals with readonly arrays. While it would be ideal for Cython to fix the underlying problem, in the meantime we can rely on this workaround.
1 parent b5f1e71 commit 21c65ae

File tree

3 files changed

+65
-42
lines changed

3 files changed

+65
-42
lines changed

doc/source/whatsnew/v0.22.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ Reshaping
321321
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
322322
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
323323
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
324-
-
324+
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
325325

326326
Numeric
327327
^^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

+54-41
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
255255
('UInt64', 'uint64', 'False', False),
256256
('Int64', 'int64', 'val == iNaT', False)]
257257

258+
def get_dispatch(dtypes):
259+
for (name, dtype, null_condition, float_group) in dtypes:
260+
unique_template = """\
261+
cdef:
262+
Py_ssize_t i, n = len(values)
263+
int ret = 0
264+
{dtype}_t val
265+
khiter_t k
266+
bint seen_na = 0
267+
{name}Vector uniques = {name}Vector()
268+
{name}VectorData *ud
269+
270+
ud = uniques.data
271+
272+
with nogil:
273+
for i in range(n):
274+
val = values[i]
275+
IF {float_group}:
276+
if val == val:
277+
k = kh_get_{dtype}(self.table, val)
278+
if k == self.table.n_buckets:
279+
kh_put_{dtype}(self.table, val, &ret)
280+
if needs_resize(ud):
281+
with gil:
282+
uniques.resize()
283+
append_data_{dtype}(ud, val)
284+
elif not seen_na:
285+
seen_na = 1
286+
if needs_resize(ud):
287+
with gil:
288+
uniques.resize()
289+
append_data_{dtype}(ud, NAN)
290+
ELSE:
291+
k = kh_get_{dtype}(self.table, val)
292+
if k == self.table.n_buckets:
293+
kh_put_{dtype}(self.table, val, &ret)
294+
if needs_resize(ud):
295+
with gil:
296+
uniques.resize()
297+
append_data_{dtype}(ud, val)
298+
return uniques.to_array()
299+
"""
300+
301+
unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
302+
303+
yield (name, dtype, null_condition, float_group, unique_template)
258304
}}
259305

260306

261-
{{for name, dtype, null_condition, float_group in dtypes}}
307+
{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
262308

263309
cdef class {{name}}HashTable(HashTable):
264310

@@ -450,48 +496,15 @@ cdef class {{name}}HashTable(HashTable):
450496
return np.asarray(labels), arr_uniques
451497

452498
@cython.boundscheck(False)
453-
def unique(self, {{dtype}}_t[:] values):
454-
cdef:
455-
Py_ssize_t i, n = len(values)
456-
int ret = 0
457-
{{dtype}}_t val
458-
khiter_t k
459-
bint seen_na = 0
460-
{{name}}Vector uniques = {{name}}Vector()
461-
{{name}}VectorData *ud
499+
def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
500+
if values.flags.writeable:
501+
return self.unique_memview(values)
462502

463-
ud = uniques.data
503+
{{unique_template}}
464504

465-
with nogil:
466-
for i in range(n):
467-
val = values[i]
468-
469-
{{if float_group}}
470-
if val == val:
471-
k = kh_get_{{dtype}}(self.table, val)
472-
if k == self.table.n_buckets:
473-
kh_put_{{dtype}}(self.table, val, &ret)
474-
if needs_resize(ud):
475-
with gil:
476-
uniques.resize()
477-
append_data_{{dtype}}(ud, val)
478-
elif not seen_na:
479-
seen_na = 1
480-
if needs_resize(ud):
481-
with gil:
482-
uniques.resize()
483-
append_data_{{dtype}}(ud, NAN)
484-
{{else}}
485-
k = kh_get_{{dtype}}(self.table, val)
486-
if k == self.table.n_buckets:
487-
kh_put_{{dtype}}(self.table, val, &ret)
488-
if needs_resize(ud):
489-
with gil:
490-
uniques.resize()
491-
append_data_{{dtype}}(ud, val)
492-
{{endif}}
493-
494-
return uniques.to_array()
505+
@cython.boundscheck(False)
506+
def unique_memview(self, {{dtype}}_t[:] values):
507+
{{unique_template}}
495508

496509
{{endfor}}
497510

pandas/tests/reshape/test_tile.py

+10
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,16 @@ def f():
512512
tm.assert_numpy_array_equal(
513513
mask, np.array([False, True, True, True, True]))
514514

515+
def test_cut_read_only(self):
516+
readonly = np.arange(0, 100, 10)
517+
readonly.flags.writeable = False
518+
519+
mutable = np.arange(0, 100, 10)
520+
521+
one_to_hundred = np.arange(100)
522+
tm.assert_categorical_equal(cut(one_to_hundred, readonly),
523+
cut(one_to_hundred, mutable))
524+
515525

516526
def curpath():
517527
pth, _ = os.path.split(os.path.abspath(__file__))

0 commit comments

Comments
 (0)