Skip to content

Commit 4f1e2aa

Browse files
committed
BUG: make hashtable.unique support readonly arrays
This problem was brought up in #18773 and effectively comes down to how Cython deals with readonly arrays. While it would be ideal for Cython to fix the underlying problem, in the meantime we can rely on this.
fix: updates one_to_hundred for hundred_elements. This is because arange(100) isn't actually 1 to 100; it's 0 to 99.
docs: adds comment to fix using ndarray and fixes indenting.
test: parametrize test for test_readonly_cut.
doc: add new whatsnew entry for v0.23.0.
fix: checkout existing upstream v0.22.0.
1 parent 316acbf commit 4f1e2aa

File tree

3 files changed

+74
-42
lines changed

3 files changed

+74
-42
lines changed

doc/source/whatsnew/v0.23.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ Reshaping
341341
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
342342
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
343343
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
344-
-
344+
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
345345

346346
Numeric
347347
^^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

+59-41
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
255255
('UInt64', 'uint64', 'False', False),
256256
('Int64', 'int64', 'val == iNaT', False)]
257257

258+
def get_dispatch(dtypes):
259+
for (name, dtype, null_condition, float_group) in dtypes:
260+
unique_template = """\
261+
cdef:
262+
Py_ssize_t i, n = len(values)
263+
int ret = 0
264+
{dtype}_t val
265+
khiter_t k
266+
bint seen_na = 0
267+
{name}Vector uniques = {name}Vector()
268+
{name}VectorData *ud
269+
270+
ud = uniques.data
271+
272+
with nogil:
273+
for i in range(n):
274+
val = values[i]
275+
IF {float_group}:
276+
if val == val:
277+
k = kh_get_{dtype}(self.table, val)
278+
if k == self.table.n_buckets:
279+
kh_put_{dtype}(self.table, val, &ret)
280+
if needs_resize(ud):
281+
with gil:
282+
uniques.resize()
283+
append_data_{dtype}(ud, val)
284+
elif not seen_na:
285+
seen_na = 1
286+
if needs_resize(ud):
287+
with gil:
288+
uniques.resize()
289+
append_data_{dtype}(ud, NAN)
290+
ELSE:
291+
k = kh_get_{dtype}(self.table, val)
292+
if k == self.table.n_buckets:
293+
kh_put_{dtype}(self.table, val, &ret)
294+
if needs_resize(ud):
295+
with gil:
296+
uniques.resize()
297+
append_data_{dtype}(ud, val)
298+
return uniques.to_array()
299+
"""
300+
301+
unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
302+
303+
yield (name, dtype, null_condition, float_group, unique_template)
258304
}}
259305

260306

261-
{{for name, dtype, null_condition, float_group in dtypes}}
307+
{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
262308

263309
cdef class {{name}}HashTable(HashTable):
264310

@@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable):
450496
return np.asarray(labels), arr_uniques
451497

452498
@cython.boundscheck(False)
453-
def unique(self, {{dtype}}_t[:] values):
454-
cdef:
455-
Py_ssize_t i, n = len(values)
456-
int ret = 0
457-
{{dtype}}_t val
458-
khiter_t k
459-
bint seen_na = 0
460-
{{name}}Vector uniques = {{name}}Vector()
461-
{{name}}VectorData *ud
499+
def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
500+
if values.flags.writeable:
501+
# If the array is writeable (mutable), use the typed-memoryview version
502+
return self.unique_memview(values)
462503

463-
ud = uniques.data
464-
465-
with nogil:
466-
for i in range(n):
467-
val = values[i]
468-
469-
{{if float_group}}
470-
if val == val:
471-
k = kh_get_{{dtype}}(self.table, val)
472-
if k == self.table.n_buckets:
473-
kh_put_{{dtype}}(self.table, val, &ret)
474-
if needs_resize(ud):
475-
with gil:
476-
uniques.resize()
477-
append_data_{{dtype}}(ud, val)
478-
elif not seen_na:
479-
seen_na = 1
480-
if needs_resize(ud):
481-
with gil:
482-
uniques.resize()
483-
append_data_{{dtype}}(ud, NAN)
484-
{{else}}
485-
k = kh_get_{{dtype}}(self.table, val)
486-
if k == self.table.n_buckets:
487-
kh_put_{{dtype}}(self.table, val, &ret)
488-
if needs_resize(ud):
489-
with gil:
490-
uniques.resize()
491-
append_data_{{dtype}}(ud, val)
492-
{{endif}}
504+
# We cannot use the memoryview version on readonly-buffers due to
505+
# a limitation of Cython's typed memoryviews. Instead we can use
506+
# the slightly slower Cython ndarray type directly.
507+
# see https://github.com/cython/cython/issues/1605
508+
{{unique_template}}
493509

494-
return uniques.to_array()
510+
@cython.boundscheck(False)
511+
def unique_memview(self, {{dtype}}_t[:] values):
512+
{{unique_template}}
495513

496514
{{endfor}}
497515

pandas/tests/reshape/test_tile.py

+14
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,20 @@ def f():
512512
tm.assert_numpy_array_equal(
513513
mask, np.array([False, True, True, True, True]))
514514

515+
@pytest.mark.parametrize("array_1_writeable,array_2_writeable",[
516+
(True, True), (True, False), (False, False)])
517+
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
518+
# issue 18773
519+
array_1 = np.arange(0, 100, 10)
520+
array_1.flags.writeable = array_1_writeable
521+
522+
array_2 = np.arange(0, 100, 10)
523+
array_2.flags.writeable = array_2_writeable
524+
525+
hundred_elements = np.arange(100)
526+
527+
tm.assert_categorical_equal(cut(hundred_elements, array_1),
528+
cut(hundred_elements, array_2))
515529

516530
def curpath():
517531
pth, _ = os.path.split(os.path.abspath(__file__))

0 commit comments

Comments
 (0)