From af58060a1af8e5d6d34a82d664d12fe96d0e3f0a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 15:06:59 -0700 Subject: [PATCH 1/5] CLN: intp_t instead of int64_t for indexers in libs funcs --- pandas/_libs/algos.pyx | 15 +++++++++------ pandas/_libs/algos_take_helper.pxi.in | 4 ++-- pandas/_libs/groupby.pyx | 2 ++ pandas/_libs/join.pyx | 17 ++++++++++++----- pandas/core/array_algos/take.py | 9 +++++---- pandas/core/arrays/categorical.py | 2 +- pandas/core/sorting.py | 1 + pandas/tests/test_algos.py | 8 ++++---- 8 files changed, 36 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5783d3c2353aa..047eb848b7540 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -199,8 +199,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): Returns ------- - tuple - 1-d indexer ordered by groups, group counts. + ndarray[intp_t, ndim=1] + Indexer + ndarray[int64_t, ndim=1] + Group Counts Notes ----- @@ -208,11 +210,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ cdef: Py_ssize_t i, loc, label, n - ndarray[int64_t] counts, where, result + ndarray[int64_t] counts, where + ndarray[intp_t] indexer counts = np.zeros(ngroups + 1, dtype=np.int64) n = len(index) - result = np.zeros(n, dtype=np.int64) + indexer = np.zeros(n, dtype=np.intp) where = np.zeros(ngroups + 1, dtype=np.int64) with nogil: @@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): # this is our indexer for i in range(n): label = index[i] + 1 - result[where[label]] = i + indexer[where[label]] = i where[label] += 1 - return result, counts + return indexer, counts @cython.boundscheck(False) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 4eefd9d1f7267..04ce109a6392f 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): cdef: @@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, {{else}} def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{endif}} - ndarray[int64_t] indexer, + ndarray[intp_t] indexer, {{c_type_out}}[:, :] out, fill_value=np.nan): diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1bfb66cbf21ac..89020f2078584 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -19,6 +19,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, Py_ssize_t i, j, N, K, ngroups, size ndarray[int64_t] _counts ndarray[float64_t, ndim=2] data + ndarray[intp_t] indexer float64_t* ptr assert min_count == -1, "'min_count' only used in add and prod" diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 1b79d68c13570..bac416037ada2 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -10,6 +10,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -31,7 +32,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -82,8 +84,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter - ndarray rev + ndarray[int64_t] left_count, right_count + ndarray[intp_t] rev, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 @@ -155,7 +157,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[intp_t] left_sorter, right_sorter + ndarray[int64_t] left_count, right_count ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc int64_t left_pos = 0, right_pos = 0 @@ -213,12 +216,16 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, _get_result_indexer(right_sorter, right_indexer)) -cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer): +cdef ndarray[int64_t] _get_result_indexer( + ndarray[intp_t] sorter, ndarray[int64_t] indexer +): if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` res = np.empty(len(indexer), dtype=np.int64) take_1d_int64_int64(sorter, indexer, res, -1) + # FIXME: sorter is intp_t, not int64_t, opposite for indexer; + # will this break on 32bit builds? else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 31cbadb0e442b..e77c0d7794f15 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -117,10 +117,10 @@ def _take_nd_ndarray( ) -> np.ndarray: if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) + indexer = np.arange(arr.shape[axis], dtype=np.intp) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = ensure_int64(indexer, copy=False) + indexer = ensure_platform_int(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, out, fill_value, allow_fill ) @@ -320,7 +320,7 @@ def _get_take_nd_function( if func is None: def func(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) + indexer = ensure_platform_int(indexer) _take_nd_object( arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info ) @@ -471,7 +471,7 @@ def wrapper( def _take_nd_object( arr: np.ndarray, - indexer: np.ndarray, + indexer: np.ndarray, # np.ndarray[np.intp] out: np.ndarray, axis: int, fill_value, @@ -547,4 +547,5 @@ def _take_preprocess_indexer_and_fill_value( # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() + indexer = ensure_platform_int(indexer) return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 864bd0684d445..0c00faa0000cc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1979,7 +1979,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ categories = self.categories r, counts = libalgos.groupsort_indexer( - self.codes.astype("int64"), categories.size + self.codes.astype("int64", copy=False), categories.size ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 720643d3d98aa..f2270feb6e2b6 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -596,6 +596,7 @@ def get_group_index_sorter( ) if do_groupsort: sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) + # sorter _should_ already be intp, but mypy is not yet able to verify return ensure_platform_int(sorter) else: return group_index.argsort(kind="mergesort") diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 876df69ae7f63..c8df18ddaeebe 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2123,19 +2123,19 @@ def test_groupsort_indexer(): # need to use a stable sort # np.argsort returns int, groupsort_indexer - # always returns int64 + # always returns intp expected = np.argsort(a, kind="mergesort") - expected = expected.astype(np.int64) + expected = expected.astype(np.intp) tm.assert_numpy_array_equal(result, expected) # compare with lexsort # np.lexsort returns int, groupsort_indexer - # always returns int64 + # always returns intp key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) - expected = expected.astype(np.int64) + expected = expected.astype(np.intp) tm.assert_numpy_array_equal(result, expected) From b67cae12ba0121892439a28cce20334fb40816ff Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 19:08:00 -0700 Subject: [PATCH 2/5] troubleshoot --- pandas/_libs/join.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index bac416037ada2..4e05fc23be686 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -21,6 +21,7 @@ from numpy cimport ( cnp.import_array() from pandas._libs.algos import ( + ensure_int64, ensure_platform_int, groupsort_indexer, take_1d_int64_int64, @@ -223,7 +224,7 @@ cdef ndarray[int64_t] _get_result_indexer( # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` res = np.empty(len(indexer), dtype=np.int64) - take_1d_int64_int64(sorter, indexer, res, -1) + take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1) # FIXME: sorter is intp_t, not int64_t, opposite for indexer; # will this break on 32bit builds? else: From e59fec59aefc2d150b18a25958f512cccf131aca Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 21:09:02 -0700 Subject: [PATCH 3/5] try to get more helpful exception --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0c00faa0000cc..f14cd494c500e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -434,7 +434,7 @@ def __init__( "by passing in a categories argument." ) from err except ValueError as err: - + raise # troubleshoot 32bit failures # TODO(EA2D) raise NotImplementedError( "> 1 ndim Categorical are not supported at this time" From f3c4f628b08a400813f4db572d1f432687c3fe8a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Mar 2021 21:21:02 -0700 Subject: [PATCH 4/5] update take_1d_ indexer dtype --- pandas/_libs/algos_take_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 04ce109a6392f..cdf4ef3b119d2 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, {{else}} def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, {{endif}} - const int64_t[:] indexer, + const intp_t[:] indexer, {{c_type_out}}[:] out, fill_value=np.nan): From b8c96f01e7118625c39cbc790d7cb7b06070cc44 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Mar 2021 07:16:06 -0700 Subject: [PATCH 5/5] update test, docstring --- pandas/core/arrays/categorical.py | 5 +++-- pandas/tests/groupby/test_categorical.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f14cd494c500e..00667aae5c9ff 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -434,7 +434,7 @@ def __init__( "by passing in a categories argument." ) from err except ValueError as err: - raise # troubleshoot 32bit failures + # TODO(EA2D) raise NotImplementedError( "> 1 ndim Categorical are not supported at this time" @@ -1961,7 +1961,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: Returns ------- - dict of categories -> indexers + Dict[Hashable, np.ndarray[np.intp]] + dict of categories -> indexers Examples -------- diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f685680515a8f..da438826a939a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1717,9 +1717,9 @@ def test_groupby_categorical_indices_unused_categories(): grouped = df.groupby("key", sort=False) result = grouped.indices expected = { - "b": np.array([0, 1], dtype="int64"), - "a": np.array([2], dtype="int64"), - "c": np.array([], dtype="int64"), + "b": np.array([0, 1], dtype="intp"), + "a": np.array([2], dtype="intp"), + "c": np.array([], dtype="intp"), } assert result.keys() == expected.keys() for key in result.keys():