Skip to content

Commit 38640d1

Browse files
authored
CLN: intp_t instead of int64_t for indexers in libs funcs (#40475)
1 parent 64e87b3 commit 38640d1

File tree

9 files changed

+42
-28
lines changed

9 files changed

+42
-28
lines changed

pandas/_libs/algos.pyx

+9-6
Original file line numberDiff line numberDiff line change
@@ -199,20 +199,23 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
199199
200200
Returns
201201
-------
202-
tuple
203-
1-d indexer ordered by groups, group counts.
202+
ndarray[intp_t, ndim=1]
203+
Indexer
204+
ndarray[int64_t, ndim=1]
205+
Group Counts
204206
205207
Notes
206208
-----
207209
This is a reverse of the label factorization process.
208210
"""
209211
cdef:
210212
Py_ssize_t i, loc, label, n
211-
ndarray[int64_t] counts, where, result
213+
ndarray[int64_t] counts, where
214+
ndarray[intp_t] indexer
212215

213216
counts = np.zeros(ngroups + 1, dtype=np.int64)
214217
n = len(index)
215-
result = np.zeros(n, dtype=np.int64)
218+
indexer = np.zeros(n, dtype=np.intp)
216219
where = np.zeros(ngroups + 1, dtype=np.int64)
217220

218221
with nogil:
@@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
228231
# this is our indexer
229232
for i in range(n):
230233
label = index[i] + 1
231-
result[where[label]] = i
234+
indexer[where[label]] = i
232235
where[label] += 1
233236

234-
return result, counts
237+
return indexer, counts
235238

236239

237240
@cython.boundscheck(False)

pandas/_libs/algos_take_helper.pxi.in

+3-3
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
6666
{{else}}
6767
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
6868
{{endif}}
69-
const int64_t[:] indexer,
69+
const intp_t[:] indexer,
7070
{{c_type_out}}[:] out,
7171
fill_value=np.nan):
7272

@@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
102102
{{else}}
103103
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
104104
{{endif}}
105-
ndarray[int64_t] indexer,
105+
ndarray[intp_t] indexer,
106106
{{c_type_out}}[:, :] out,
107107
fill_value=np.nan):
108108
cdef:
@@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
156156
{{else}}
157157
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
158158
{{endif}}
159-
ndarray[int64_t] indexer,
159+
ndarray[intp_t] indexer,
160160
{{c_type_out}}[:, :] out,
161161
fill_value=np.nan):
162162

pandas/_libs/groupby.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ from numpy cimport (
1919
int16_t,
2020
int32_t,
2121
int64_t,
22+
intp_t,
2223
ndarray,
2324
uint8_t,
2425
uint16_t,
@@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
141142
Py_ssize_t i, j, N, K, ngroups, size
142143
ndarray[int64_t] _counts
143144
ndarray[float64_t, ndim=2] data
145+
ndarray[intp_t] indexer
144146
float64_t* ptr
145147

146148
assert min_count == -1, "'min_count' only used in add and prod"

pandas/_libs/join.pyx

+12-6
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
3333
Py_ssize_t max_groups):
3434
cdef:
3535
Py_ssize_t i, j, k, count = 0
36-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
36+
ndarray[intp_t] left_sorter, right_sorter
37+
ndarray[int64_t] left_count, right_count
3738
ndarray[int64_t] left_indexer, right_indexer
3839
int64_t lc, rc
3940
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
@@ -84,8 +85,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
8485
Py_ssize_t max_groups, bint sort=True):
8586
cdef:
8687
Py_ssize_t i, j, k, count = 0
87-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
88-
ndarray rev
88+
ndarray[int64_t] left_count, right_count
89+
ndarray[intp_t] rev, left_sorter, right_sorter
8990
ndarray[int64_t] left_indexer, right_indexer
9091
int64_t lc, rc
9192
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
@@ -157,7 +158,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
157158
Py_ssize_t max_groups):
158159
cdef:
159160
Py_ssize_t i, j, k, count = 0
160-
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
161+
ndarray[intp_t] left_sorter, right_sorter
162+
ndarray[int64_t] left_count, right_count
161163
ndarray[int64_t] left_indexer, right_indexer
162164
int64_t lc, rc
163165
int64_t left_pos = 0, right_pos = 0
@@ -215,12 +217,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
215217
_get_result_indexer(right_sorter, right_indexer))
216218

217219

218-
cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer):
220+
cdef ndarray[int64_t] _get_result_indexer(
221+
ndarray[intp_t] sorter, ndarray[int64_t] indexer
222+
):
219223
if len(sorter) > 0:
220224
# cython-only equivalent to
221225
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
222226
res = np.empty(len(indexer), dtype=np.int64)
223-
take_1d_int64_int64(sorter, indexer, res, -1)
227+
take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1)
228+
# FIXME: sorter is intp_t but is cast to int64_t here, and indexer is int64_t
229+
# but is cast to intp_t; will these casts break on 32-bit builds?
224230
else:
225231
# length-0 case
226232
res = np.empty(len(indexer), dtype=np.int64)

pandas/core/array_algos/take.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,10 @@ def _take_nd_ndarray(
117117
) -> np.ndarray:
118118

119119
if indexer is None:
120-
indexer = np.arange(arr.shape[axis], dtype=np.int64)
120+
indexer = np.arange(arr.shape[axis], dtype=np.intp)
121121
dtype, fill_value = arr.dtype, arr.dtype.type()
122122
else:
123-
indexer = ensure_int64(indexer, copy=False)
123+
indexer = ensure_platform_int(indexer)
124124
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
125125
arr, indexer, out, fill_value, allow_fill
126126
)
@@ -317,7 +317,7 @@ def _get_take_nd_function(
317317
if func is None:
318318

319319
def func(arr, indexer, out, fill_value=np.nan):
320-
indexer = ensure_int64(indexer)
320+
indexer = ensure_platform_int(indexer)
321321
_take_nd_object(
322322
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
323323
)
@@ -468,7 +468,7 @@ def wrapper(
468468

469469
def _take_nd_object(
470470
arr: np.ndarray,
471-
indexer: np.ndarray,
471+
indexer: np.ndarray, # np.ndarray[np.intp]
472472
out: np.ndarray,
473473
axis: int,
474474
fill_value,
@@ -544,4 +544,5 @@ def _take_preprocess_indexer_and_fill_value(
544544
# to crash when trying to cast it to dtype)
545545
dtype, fill_value = arr.dtype, arr.dtype.type()
546546

547+
indexer = ensure_platform_int(indexer)
547548
return indexer, dtype, fill_value, mask_info

pandas/core/arrays/categorical.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1961,7 +1961,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
19611961
19621962
Returns
19631963
-------
1964-
dict of categories -> indexers
1964+
Dict[Hashable, np.ndarray[np.intp]]
1965+
dict of categories -> indexers
19651966
19661967
Examples
19671968
--------
@@ -1979,7 +1980,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
19791980
"""
19801981
categories = self.categories
19811982
r, counts = libalgos.groupsort_indexer(
1982-
self.codes.astype("int64"), categories.size
1983+
self.codes.astype("int64", copy=False), categories.size
19831984
)
19841985
counts = counts.cumsum()
19851986
_result = (r[start:end] for start, end in zip(counts, counts[1:]))

pandas/core/sorting.py

+1
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ def get_group_index_sorter(
606606
)
607607
if do_groupsort:
608608
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
609+
# sorter _should_ already be intp, but mypy is not yet able to verify
609610
else:
610611
sorter = group_index.argsort(kind="mergesort")
611612
return ensure_platform_int(sorter)

pandas/tests/groupby/test_categorical.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1717,9 +1717,9 @@ def test_groupby_categorical_indices_unused_categories():
17171717
grouped = df.groupby("key", sort=False)
17181718
result = grouped.indices
17191719
expected = {
1720-
"b": np.array([0, 1], dtype="int64"),
1721-
"a": np.array([2], dtype="int64"),
1722-
"c": np.array([], dtype="int64"),
1720+
"b": np.array([0, 1], dtype="intp"),
1721+
"a": np.array([2], dtype="intp"),
1722+
"c": np.array([], dtype="intp"),
17231723
}
17241724
assert result.keys() == expected.keys()
17251725
for key in result.keys():

pandas/tests/test_algos.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -2123,19 +2123,19 @@ def test_groupsort_indexer():
21232123

21242124
# need to use a stable sort
21252125
# np.argsort returns int, groupsort_indexer
2126-
# always returns int64
2126+
# always returns intp
21272127
expected = np.argsort(a, kind="mergesort")
2128-
expected = expected.astype(np.int64)
2128+
expected = expected.astype(np.intp)
21292129

21302130
tm.assert_numpy_array_equal(result, expected)
21312131

21322132
# compare with lexsort
21332133
# np.lexsort returns int, groupsort_indexer
2134-
# always returns int64
2134+
# always returns intp
21352135
key = a * 1000 + b
21362136
result = libalgos.groupsort_indexer(key, 1000000)[0]
21372137
expected = np.lexsort((b, a))
2138-
expected = expected.astype(np.int64)
2138+
expected = expected.astype(np.intp)
21392139

21402140
tm.assert_numpy_array_equal(result, expected)
21412141

0 commit comments

Comments
 (0)