Skip to content

Commit 80a5399

Browse files
hexgnu authored and jreback committed
BUG make hashtable.unique support readonly arrays (#18825)
1 parent ee2e6de commit 80a5399

File tree

3 files changed

+74
-44
lines changed

3 files changed

+74
-44
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -350,6 +350,7 @@ Reshaping
350350
- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`)
351351
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
352352
- Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
353+
- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
353354
- Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
354355

355356

pandas/_libs/hashtable_class_helper.pxi.in

+59-41
Original file line number | Diff line number | Diff line change
@@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True),
255255
('UInt64', 'uint64', 'False', False),
256256
('Int64', 'int64', 'val == iNaT', False)]
257257

258+
def get_dispatch(dtypes):
259+
for (name, dtype, null_condition, float_group) in dtypes:
260+
unique_template = """\
261+
cdef:
262+
Py_ssize_t i, n = len(values)
263+
int ret = 0
264+
{dtype}_t val
265+
khiter_t k
266+
bint seen_na = 0
267+
{name}Vector uniques = {name}Vector()
268+
{name}VectorData *ud
269+
270+
ud = uniques.data
271+
272+
with nogil:
273+
for i in range(n):
274+
val = values[i]
275+
IF {float_group}:
276+
if val == val:
277+
k = kh_get_{dtype}(self.table, val)
278+
if k == self.table.n_buckets:
279+
kh_put_{dtype}(self.table, val, &ret)
280+
if needs_resize(ud):
281+
with gil:
282+
uniques.resize()
283+
append_data_{dtype}(ud, val)
284+
elif not seen_na:
285+
seen_na = 1
286+
if needs_resize(ud):
287+
with gil:
288+
uniques.resize()
289+
append_data_{dtype}(ud, NAN)
290+
ELSE:
291+
k = kh_get_{dtype}(self.table, val)
292+
if k == self.table.n_buckets:
293+
kh_put_{dtype}(self.table, val, &ret)
294+
if needs_resize(ud):
295+
with gil:
296+
uniques.resize()
297+
append_data_{dtype}(ud, val)
298+
return uniques.to_array()
299+
"""
300+
301+
unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group)
302+
303+
yield (name, dtype, null_condition, float_group, unique_template)
258304
}}
259305

260306

261-
{{for name, dtype, null_condition, float_group in dtypes}}
307+
{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}}
262308

263309
cdef class {{name}}HashTable(HashTable):
264310

@@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable):
450496
return np.asarray(labels), arr_uniques
451497

452498
@cython.boundscheck(False)
453-
def unique(self, {{dtype}}_t[:] values):
454-
cdef:
455-
Py_ssize_t i, n = len(values)
456-
int ret = 0
457-
{{dtype}}_t val
458-
khiter_t k
459-
bint seen_na = 0
460-
{{name}}Vector uniques = {{name}}Vector()
461-
{{name}}VectorData *ud
499+
def unique(self, ndarray[{{dtype}}_t, ndim=1] values):
500+
if values.flags.writeable:
501+
# If the value is writeable (mutable) then use memview
502+
return self.unique_memview(values)
462503

463-
ud = uniques.data
464-
465-
with nogil:
466-
for i in range(n):
467-
val = values[i]
468-
469-
{{if float_group}}
470-
if val == val:
471-
k = kh_get_{{dtype}}(self.table, val)
472-
if k == self.table.n_buckets:
473-
kh_put_{{dtype}}(self.table, val, &ret)
474-
if needs_resize(ud):
475-
with gil:
476-
uniques.resize()
477-
append_data_{{dtype}}(ud, val)
478-
elif not seen_na:
479-
seen_na = 1
480-
if needs_resize(ud):
481-
with gil:
482-
uniques.resize()
483-
append_data_{{dtype}}(ud, NAN)
484-
{{else}}
485-
k = kh_get_{{dtype}}(self.table, val)
486-
if k == self.table.n_buckets:
487-
kh_put_{{dtype}}(self.table, val, &ret)
488-
if needs_resize(ud):
489-
with gil:
490-
uniques.resize()
491-
append_data_{{dtype}}(ud, val)
492-
{{endif}}
504+
# We cannot use the memoryview version on readonly-buffers due to
505+
# a limitation of Cython's typed memoryviews. Instead we can use
506+
# the slightly slower Cython ndarray type directly.
507+
# see https://github.com/cython/cython/issues/1605
508+
{{unique_template}}
493509

494-
return uniques.to_array()
510+
@cython.boundscheck(False)
511+
def unique_memview(self, {{dtype}}_t[:] values):
512+
{{unique_template}}
495513

496514
{{endfor}}
497515

pandas/tests/reshape/test_tile.py

+14-3
Original file line number | Diff line number | Diff line change
@@ -512,7 +512,18 @@ def f():
512512
tm.assert_numpy_array_equal(
513513
mask, np.array([False, True, True, True, True]))
514514

515+
@pytest.mark.parametrize(
516+
"array_1_writeable, array_2_writeable",
517+
[(True, True), (True, False), (False, False)])
518+
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
519+
# issue 18773
520+
array_1 = np.arange(0, 100, 10)
521+
array_1.flags.writeable = array_1_writeable
515522

516-
def curpath():
517-
pth, _ = os.path.split(os.path.abspath(__file__))
518-
return pth
523+
array_2 = np.arange(0, 100, 10)
524+
array_2.flags.writeable = array_2_writeable
525+
526+
hundred_elements = np.arange(100)
527+
528+
tm.assert_categorical_equal(cut(hundred_elements, array_1),
529+
cut(hundred_elements, array_2))

0 commit comments

Comments
 (0)