Skip to content

Commit 31b0552

Browse files
committed
CLN: intp_t instead of int64_t for indexers in libs funcs
1 parent c13027c commit 31b0552

File tree

8 files changed

+36
-22
lines changed

8 files changed

+36
-22
lines changed

pandas/_libs/algos.pyx

+9-6
Original file line number | Diff line number | Diff line change
@@ -199,20 +199,23 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
199199
200200
Returns
201201
-------
202-
tuple
203-
1-d indexer ordered by groups, group counts.
202+
ndarray[intp_t, ndim=1]
203+
Indexer
204+
ndarray[int64_t, ndim=1]
205+
Group Counts
204206
205207
Notes
206208
-----
207209
This is a reverse of the label factorization process.
208210
"""
209211
cdef:
210212
Py_ssize_t i, loc, label, n
211-
ndarray[int64_t] counts, where, result
213+
ndarray[int64_t] counts, where
214+
ndarray[intp_t] indexer
212215

213216
counts = np.zeros(ngroups + 1, dtype=np.int64)
214217
n = len(index)
215-
result = np.zeros(n, dtype=np.int64)
218+
indexer = np.zeros(n, dtype=np.intp)
216219
where = np.zeros(ngroups + 1, dtype=np.int64)
217220

218221
with nogil:
@@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
228231
# this is our indexer
229232
for i in range(n):
230233
label = index[i] + 1
231-
result[where[label]] = i
234+
indexer[where[label]] = i
232235
where[label] += 1
233236

234-
return result, counts
237+
return indexer, counts
235238

236239

237240
@cython.boundscheck(False)

pandas/_libs/algos_take_helper.pxi.in

+2-2
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
102102
{{else}}
103103
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
104104
{{endif}}
105-
ndarray[int64_t] indexer,
105+
ndarray[intp_t] indexer,
106106
{{c_type_out}}[:, :] out,
107107
fill_value=np.nan):
108108
cdef:
@@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
156156
{{else}}
157157
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
158158
{{endif}}
159-
ndarray[int64_t] indexer,
159+
ndarray[intp_t] indexer,
160160
{{c_type_out}}[:, :] out,
161161
fill_value=np.nan):
162162

pandas/_libs/groupby.pyx

+2
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ from numpy cimport (
1919
int16_t,
2020
int32_t,
2121
int64_t,
22+
intp_t,
2223
ndarray,
2324
uint8_t,
2425
uint16_t,
@@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
141142
Py_ssize_t i, j, N, K, ngroups, size
142143
ndarray[int64_t] _counts
143144
ndarray[float64_t, ndim=2] data
145+
ndarray[intp_t] indexer
144146
float64_t* ptr
145147

146148
assert min_count == -1, "'min_count' only used in add and prod"

pandas/_libs/join.pyx

+12-5
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ from numpy cimport (
1010
int16_t,
1111
int32_t,
1212
int64_t,
13+
intp_t,
1314
ndarray,
1415
uint8_t,
1516
uint16_t,
@@ -31,7 +32,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right,
3132
Py_ssize_t max_groups):
3233
cdef:
3334
Py_ssize_t i, j, k, count = 0
34-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
35+
ndarray[intp_t] left_sorter, right_sorter
36+
ndarray[int64_t] left_count, right_count
3537
ndarray[int64_t] left_indexer, right_indexer
3638
int64_t lc, rc
3739
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
@@ -82,8 +84,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right,
8284
Py_ssize_t max_groups, bint sort=True):
8385
cdef:
8486
Py_ssize_t i, j, k, count = 0
85-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
86-
ndarray rev
87+
ndarray[int64_t] left_count, right_count
88+
ndarray[intp_t] rev, left_sorter, right_sorter
8789
ndarray[int64_t] left_indexer, right_indexer
8890
int64_t lc, rc
8991
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
@@ -155,7 +157,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right,
155157
Py_ssize_t max_groups):
156158
cdef:
157159
Py_ssize_t i, j, k, count = 0
158-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
160+
ndarray[intp_t] left_sorter, right_sorter
161+
ndarray[int64_t] left_count, right_count
159162
ndarray[int64_t] left_indexer, right_indexer
160163
int64_t lc, rc
161164
int64_t left_pos = 0, right_pos = 0
@@ -213,12 +216,16 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right,
213216
_get_result_indexer(right_sorter, right_indexer))
214217

215218

216-
cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer):
219+
cdef ndarray[int64_t] _get_result_indexer(
220+
ndarray[intp_t] sorter, ndarray[int64_t] indexer
221+
):
217222
if len(sorter) > 0:
218223
# cython-only equivalent to
219224
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
220225
res = np.empty(len(indexer), dtype=np.int64)
221226
take_1d_int64_int64(sorter, indexer, res, -1)
227+
# FIXME: sorter is intp_t, not int64_t, opposite for indexer;
228+
# will this break on 32bit builds?
222229
else:
223230
# length-0 case
224231
res = np.empty(len(indexer), dtype=np.int64)

pandas/core/array_algos/take.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -117,10 +117,10 @@ def _take_nd_ndarray(
117117
) -> np.ndarray:
118118

119119
if indexer is None:
120-
indexer = np.arange(arr.shape[axis], dtype=np.int64)
120+
indexer = np.arange(arr.shape[axis], dtype=np.intp)
121121
dtype, fill_value = arr.dtype, arr.dtype.type()
122122
else:
123-
indexer = ensure_int64(indexer, copy=False)
123+
indexer = ensure_platform_int(indexer)
124124
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
125125
arr, indexer, out, fill_value, allow_fill
126126
)
@@ -320,7 +320,7 @@ def _get_take_nd_function(
320320
if func is None:
321321

322322
def func(arr, indexer, out, fill_value=np.nan):
323-
indexer = ensure_int64(indexer)
323+
indexer = ensure_platform_int(indexer)
324324
_take_nd_object(
325325
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
326326
)
@@ -471,7 +471,7 @@ def wrapper(
471471

472472
def _take_nd_object(
473473
arr: np.ndarray,
474-
indexer: np.ndarray,
474+
indexer: np.ndarray, # np.ndarray[np.intp]
475475
out: np.ndarray,
476476
axis: int,
477477
fill_value,
@@ -547,4 +547,5 @@ def _take_preprocess_indexer_and_fill_value(
547547
# to crash when trying to cast it to dtype)
548548
dtype, fill_value = arr.dtype, arr.dtype.type()
549549

550+
indexer = ensure_platform_int(indexer)
550551
return indexer, dtype, fill_value, mask_info

pandas/core/arrays/categorical.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1979,7 +1979,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
19791979
"""
19801980
categories = self.categories
19811981
r, counts = libalgos.groupsort_indexer(
1982-
self.codes.astype("int64"), categories.size
1982+
self.codes.astype("int64", copy=False), categories.size
19831983
)
19841984
counts = counts.cumsum()
19851985
_result = (r[start:end] for start, end in zip(counts, counts[1:]))

pandas/core/sorting.py

+1
Original file line number | Diff line number | Diff line change
@@ -596,6 +596,7 @@ def get_group_index_sorter(
596596
)
597597
if do_groupsort:
598598
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
599+
# sorter _should_ already be intp, but mypy is not yet able to verify
599600
return ensure_platform_int(sorter)
600601
else:
601602
return group_index.argsort(kind="mergesort")

pandas/tests/test_algos.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -2123,19 +2123,19 @@ def test_groupsort_indexer():
21232123

21242124
# need to use a stable sort
21252125
# np.argsort returns int, groupsort_indexer
2126-
# always returns int64
2126+
# always returns intp
21272127
expected = np.argsort(a, kind="mergesort")
2128-
expected = expected.astype(np.int64)
2128+
expected = expected.astype(np.intp)
21292129

21302130
tm.assert_numpy_array_equal(result, expected)
21312131

21322132
# compare with lexsort
21332133
# np.lexsort returns int, groupsort_indexer
2134-
# always returns int64
2134+
# always returns intp
21352135
key = a * 1000 + b
21362136
result = libalgos.groupsort_indexer(key, 1000000)[0]
21372137
expected = np.lexsort((b, a))
2138-
expected = expected.astype(np.int64)
2138+
expected = expected.astype(np.intp)
21392139

21402140
tm.assert_numpy_array_equal(result, expected)
21412141

0 commit comments

Comments
 (0)