From 4f1e2aaceba1a8d251c56185de5890c15d1fca19 Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Sun, 17 Dec 2017 20:38:27 -0800 Subject: [PATCH 1/2] BUG make hashtable.unique support readonly arrays This problem was brought up in https://github.com/pandas-dev/pandas/issues/18773 and effectively comes down to how Cython deals with readonly arrays. While it would be ideal for Cython to fix the underlying problem in the meantime we can rely on this. fix: updates one_to_hundred for hundred_elements This is because arange(100) isn't actually 1 to 100... it's 0 to 99 docs: adds comment to fix using ndarray and fixes indenting test: parametrize test for test_readonly_cut doc: add new whatsnew entry for v0.23.0 fix: checkout existing upstream v0.22.0 --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 100 ++++++++++++--------- pandas/tests/reshape/test_tile.py | 14 +++ 3 files changed, 74 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 40e1e2011479c..5c947ba3d4766 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -341,7 +341,7 @@ Reshaping - Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) -- +- Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) Numeric ^^^^^^^ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6e1c4397810b7..bd9dd1f9bae37 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -255,10 +255,56 @@ dtypes = [('Float64', 'float64', 'val != val', True), ('UInt64', 'uint64', 'False', False), ('Int64', 'int64', 'val == iNaT', False)] +def get_dispatch(dtypes): + for (name, dtype, null_condition, float_group) in dtypes: + unique_template = """\ + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {dtype}_t val + khiter_t k + bint seen_na = 0 + {name}Vector uniques = {name}Vector() + {name}VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + IF {float_group}: + if val == val: + k = kh_get_{dtype}(self.table, val) + if k == self.table.n_buckets: + kh_put_{dtype}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{dtype}(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{dtype}(ud, NAN) + ELSE: + k = kh_get_{dtype}(self.table, val) + if k == self.table.n_buckets: + kh_put_{dtype}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{dtype}(ud, val) + return uniques.to_array() + """ + + unique_template = unique_template.format(name=name, dtype=dtype, null_condition=null_condition, float_group=float_group) + + yield (name, dtype, null_condition, float_group, unique_template) }} -{{for name, dtype, null_condition, float_group in dtypes}} +{{for name, dtype, null_condition, float_group, unique_template in get_dispatch(dtypes)}} cdef class {{name}}HashTable(HashTable): @@ -450,48 +496,20 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques @cython.boundscheck(False) - def unique(self, {{dtype}}_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - bint seen_na = 0 - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud + def unique(self, ndarray[{{dtype}}_t, ndim=1] values): + if values.flags.writeable: + # If the value is writeable (mutable) then use memview + return self.unique_memview(values) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - {{if float_group}} - if val == val: - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - elif not seen_na: - seen_na = 1 - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, NAN) - {{else}} - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - {{endif}} + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + # see https://github.com/cython/cython/issues/1605 +{{unique_template}} - return uniques.to_array() + @cython.boundscheck(False) + def unique_memview(self, {{dtype}}_t[:] values): +{{unique_template}} {{endfor}} diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index c27af7a5bf8e4..a70dbb4ac9de5 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -512,6 +512,20 @@ def f(): tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) + @pytest.mark.parametrize("array_1_writeable,array_2_writeable",[ + (True, True), (True, False), (False, False)]) + def test_cut_read_only(self, array_1_writeable, array_2_writeable): + # issue 18773 + array_1 = np.arange(0, 100, 10) + array_1.flags.writeable = array_1_writeable + + array_2 = np.arange(0, 100, 10) + array_2.flags.writeable = array_2_writeable + + hundred_elements = np.arange(100) + + tm.assert_categorical_equal(cut(hundred_elements, array_1), + cut(hundred_elements, array_2)) def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) From ddb5247d73345af80f5723d6f3865409f12b3877 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 23 Dec 2017 15:56:03 -0500 Subject: [PATCH 2/2] lint --- pandas/tests/reshape/test_tile.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index a70dbb4ac9de5..48f25112b45cf 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -512,8 +512,9 @@ def f(): tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) - @pytest.mark.parametrize("array_1_writeable,array_2_writeable",[ - (True, True), (True, False), (False, False)]) + @pytest.mark.parametrize( + "array_1_writeable, array_2_writeable", + [(True, True), (True, False), (False, False)]) def test_cut_read_only(self, array_1_writeable, array_2_writeable): # issue 18773 array_1 = np.arange(0, 100, 10) @@ -526,7 +527,3 @@ def test_cut_read_only(self, array_1_writeable, array_2_writeable): tm.assert_categorical_equal(cut(hundred_elements, array_1), cut(hundred_elements, array_2)) - -def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth