Skip to content

Commit f4b6cbe

Browse files
authored
CLN: ensure_platform_int earlier (pandas-dev#40528)
1 parent bc62e76 commit f4b6cbe

File tree

9 files changed

+77
-55
lines changed

pandas/_libs/algos.pyx

+6-7
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:
191191

192192
@cython.boundscheck(False)
193193
@cython.wraparound(False)
194-
def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
194+
def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
195195
"""
196196
Compute a 1-d indexer.
197197
@@ -200,7 +200,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
200200
201201
Parameters
202202
----------
203-
index: int64 ndarray
203+
index: np.ndarray[np.intp]
204204
Mappings from group -> position.
205205
ngroups: int64
206206
Number of groups.
@@ -209,7 +209,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
209209
-------
210210
ndarray[intp_t, ndim=1]
211211
Indexer
212-
ndarray[int64_t, ndim=1]
212+
ndarray[intp_t, ndim=1]
213213
Group Counts
214214
215215
Notes
@@ -218,13 +218,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
218218
"""
219219
cdef:
220220
Py_ssize_t i, loc, label, n
221-
ndarray[int64_t] counts, where
222-
ndarray[intp_t] indexer
221+
ndarray[intp_t] indexer, where, counts
223222

224-
counts = np.zeros(ngroups + 1, dtype=np.int64)
223+
counts = np.zeros(ngroups + 1, dtype=np.intp)
225224
n = len(index)
226225
indexer = np.zeros(n, dtype=np.intp)
227-
where = np.zeros(ngroups + 1, dtype=np.int64)
226+
where = np.zeros(ngroups + 1, dtype=np.intp)
228227

229228
with nogil:
230229

pandas/_libs/algos_take_helper.pxi.in

+26
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
88
# take_1d, take_2d
99
# ----------------------------------------------------------------------
1010

11+
# take_1d_intp_intp: intp -> intp specialization of the 1-d "take" helper.
# For each position i in `indexer`, writes values[indexer[i]] into out[i];
# positions where indexer[i] == -1 receive `fill_value` instead.
# The result is written into the caller-supplied `out` buffer in place.
12+
@cython.wraparound(False)
13+
@cython.boundscheck(False)
14+
def take_1d_intp_intp(
15+
const intp_t[:] values,
16+
const intp_t[:] indexer,
17+
intp_t[::1] out,
18+
intp_t fill_value=-1,
19+
):
20+
cdef:
21+
Py_ssize_t i, n, idx
22+
intp_t fv
23+
24+
# The number of elements written is driven by the indexer, not by `values`.
# NOTE(review): no length check on `out` is visible here and bounds checking
# is disabled above; callers presumably must pass an `out` with at least
# len(indexer) elements -- confirm against call sites.
n = indexer.shape[0]
25+
26+
fv = fill_value
27+
28+
# The loop touches only typed memoryviews and C integers, so it runs with
# the GIL released.
with nogil:
29+
for i in range(n):
30+
idx = indexer[i]
31+
# -1 is the "missing" sentinel: emit the fill value rather than indexing.
# Other negative values are not special-cased here; with wraparound
# disabled they are used as-is to index `values`.
if idx == -1:
32+
out[i] = fv
33+
else:
34+
out[i] = values[idx]
35+
36+
1137
{{py:
1238

1339
# c_type_in, c_type_out

pandas/_libs/groupby.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ from pandas._libs.util cimport (
3737
)
3838

3939
from pandas._libs.algos import (
40+
ensure_platform_int,
4041
groupsort_indexer,
4142
rank_1d,
4243
take_2d_axis1_float64_float64,
@@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
111112
"""
112113
cdef:
113114
Py_ssize_t i, j, N, K, ngroups, size
114-
ndarray[int64_t] _counts
115+
ndarray[intp_t] _counts
115116
ndarray[float64_t, ndim=2] data
116117
ndarray[intp_t] indexer
117118
float64_t* ptr
@@ -121,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
121122
ngroups = len(counts)
122123
N, K = (<object>values).shape
123124

124-
indexer, _counts = groupsort_indexer(labels, ngroups)
125+
indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
125126
counts[:] = _counts[1:]
126127

127128
data = np.empty((K, N), dtype=np.float64)

pandas/_libs/join.pyx

+33-37
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,9 @@ from numpy cimport (
2121
cnp.import_array()
2222

2323
from pandas._libs.algos import (
24-
ensure_int64,
25-
ensure_platform_int,
2624
groupsort_indexer,
2725
take_1d_int64_int64,
26+
take_1d_intp_intp,
2827
)
2928

3029

@@ -34,16 +33,16 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
3433
cdef:
3534
Py_ssize_t i, j, k, count = 0
3635
ndarray[intp_t] left_sorter, right_sorter
37-
ndarray[int64_t] left_count, right_count
38-
ndarray[int64_t] left_indexer, right_indexer
39-
int64_t lc, rc
36+
ndarray[intp_t] left_count, right_count
37+
ndarray[intp_t] left_indexer, right_indexer
38+
intp_t lc, rc
4039
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
4140
Py_ssize_t offset
4241

4342
# NA group in location 0
4443

45-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
46-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
44+
left_sorter, left_count = groupsort_indexer(left, max_groups)
45+
right_sorter, right_count = groupsort_indexer(right, max_groups)
4746

4847
with nogil:
4948
# First pass, determine size of result set, do not use the NA group
@@ -58,8 +57,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
5857
left_pos = left_count[0]
5958
right_pos = right_count[0]
6059

61-
left_indexer = np.empty(count, dtype=np.int64)
62-
right_indexer = np.empty(count, dtype=np.int64)
60+
left_indexer = np.empty(count, dtype=np.intp)
61+
right_indexer = np.empty(count, dtype=np.intp)
6362

6463
with nogil:
6564
for i in range(1, max_groups + 1):
@@ -85,17 +84,17 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
8584
Py_ssize_t max_groups, bint sort=True):
8685
cdef:
8786
Py_ssize_t i, j, k, count = 0
88-
ndarray[int64_t] left_count, right_count
87+
ndarray[intp_t] left_count, right_count
8988
ndarray[intp_t] rev, left_sorter, right_sorter
90-
ndarray[int64_t] left_indexer, right_indexer
91-
int64_t lc, rc
89+
ndarray[intp_t] left_indexer, right_indexer
90+
intp_t lc, rc
9291
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
9392
Py_ssize_t offset
9493

9594
# NA group in location 0
9695

97-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
98-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
96+
left_sorter, left_count = groupsort_indexer(left, max_groups)
97+
right_sorter, right_count = groupsort_indexer(right, max_groups)
9998

10099
with nogil:
101100
# First pass, determine size of result set, do not use the NA group
@@ -109,8 +108,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
109108
left_pos = left_count[0]
110109
right_pos = right_count[0]
111110

112-
left_indexer = np.empty(count, dtype=np.int64)
113-
right_indexer = np.empty(count, dtype=np.int64)
111+
left_indexer = np.empty(count, dtype=np.intp)
112+
right_indexer = np.empty(count, dtype=np.intp)
114113

115114
with nogil:
116115
for i in range(1, max_groups + 1):
@@ -142,11 +141,10 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
142141
# this is a short-cut to avoid groupsort_indexer
143142
# otherwise, the `else` path also works in this case
144143
rev = np.empty(len(left), dtype=np.intp)
145-
rev.put(ensure_platform_int(left_sorter), np.arange(len(left)))
144+
rev.put(left_sorter, np.arange(len(left)))
146145
else:
147146
rev, _ = groupsort_indexer(left_indexer, len(left))
148147

149-
rev = ensure_platform_int(rev)
150148
right_indexer = right_indexer.take(rev)
151149
left_indexer = left_indexer.take(rev)
152150

@@ -159,16 +157,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
159157
cdef:
160158
Py_ssize_t i, j, k, count = 0
161159
ndarray[intp_t] left_sorter, right_sorter
162-
ndarray[int64_t] left_count, right_count
163-
ndarray[int64_t] left_indexer, right_indexer
164-
int64_t lc, rc
165-
int64_t left_pos = 0, right_pos = 0
160+
ndarray[intp_t] left_count, right_count
161+
ndarray[intp_t] left_indexer, right_indexer
162+
intp_t lc, rc
163+
intp_t left_pos = 0, right_pos = 0
166164
Py_ssize_t offset, position = 0
167165

168166
# NA group in location 0
169167

170-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
171-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
168+
left_sorter, left_count = groupsort_indexer(left, max_groups)
169+
right_sorter, right_count = groupsort_indexer(right, max_groups)
172170

173171
with nogil:
174172
# First pass, determine size of result set, do not use the NA group
@@ -185,8 +183,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
185183
left_pos = left_count[0]
186184
right_pos = right_count[0]
187185

188-
left_indexer = np.empty(count, dtype=np.int64)
189-
right_indexer = np.empty(count, dtype=np.int64)
186+
left_indexer = np.empty(count, dtype=np.intp)
187+
right_indexer = np.empty(count, dtype=np.intp)
190188

191189
with nogil:
192190
for i in range(1, max_groups + 1):
@@ -217,31 +215,29 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
217215
_get_result_indexer(right_sorter, right_indexer))
218216

219217

220-
cdef ndarray[int64_t] _get_result_indexer(
221-
ndarray[intp_t] sorter, ndarray[int64_t] indexer
218+
cdef ndarray[intp_t] _get_result_indexer(
219+
ndarray[intp_t] sorter, ndarray[intp_t] indexer
222220
):
223221
if len(sorter) > 0:
224222
# cython-only equivalent to
225223
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
226-
res = np.empty(len(indexer), dtype=np.int64)
227-
take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1)
228-
# FIXME: sorter is intp_t, not int64_t, opposite for indexer;
229-
# will this break on 32bit builds?
224+
res = np.empty(len(indexer), dtype=np.intp)
225+
take_1d_intp_intp(sorter, indexer, res, -1)
230226
else:
231227
# length-0 case
232-
res = np.empty(len(indexer), dtype=np.int64)
228+
res = np.empty(len(indexer), dtype=np.intp)
233229
res[:] = -1
234230

235231
return res
236232

237233

238-
def ffill_indexer(const int64_t[:] indexer):
234+
def ffill_indexer(const intp_t[:] indexer):
239235
cdef:
240236
Py_ssize_t i, n = len(indexer)
241-
ndarray[int64_t] result
242-
int64_t val, last_obs
237+
ndarray[intp_t] result
238+
intp_t val, last_obs
243239

244-
result = np.empty(n, dtype=np.int64)
240+
result = np.empty(n, dtype=np.intp)
245241
last_obs = -1
246242

247243
for i in range(n):

pandas/core/arrays/categorical.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1981,9 +1981,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
19811981
"""
19821982
categories = self.categories
19831983
r, counts = libalgos.groupsort_indexer(
1984-
self.codes.astype("int64", copy=False), categories.size
1984+
ensure_platform_int(self.codes), categories.size
19851985
)
1986-
counts = counts.cumsum()
1986+
counts = ensure_int64(counts).cumsum()
19871987
_result = (r[start:end] for start, end in zip(counts, counts[1:]))
19881988
return dict(zip(categories, _result))
19891989

pandas/core/indexes/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4154,7 +4154,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
41544154
return np.empty(0, dtype=np.intp)
41554155

41564156
if len(labels) == 1:
4157-
return get_group_index_sorter(labels[0])
4157+
return get_group_index_sorter(ensure_platform_int(labels[0]))
41584158

41594159
# find indexers of beginning of each set of
41604160
# same-key labels w.r.t all but last level
@@ -4224,7 +4224,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
42244224
if level == 0: # outer most level, take the fast route
42254225
ngroups = 1 + new_lev_codes.max()
42264226
left_indexer, counts = libalgos.groupsort_indexer(
4227-
ensure_int64(new_lev_codes), ngroups
4227+
new_lev_codes, ngroups
42284228
)
42294229

42304230
# missing values are placed first; drop them!

pandas/core/sorting.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ def get_group_index_sorter(
604604
(alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator]
605605
)
606606
if do_groupsort:
607-
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
607+
sorter, _ = algos.groupsort_indexer(ensure_platform_int(group_index), ngroups)
608608
# sorter _should_ already be intp, but mypy is not yet able to verify
609609
else:
610610
sorter = group_index.argsort(kind="mergesort")

pandas/tests/libs/test_join.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -264,8 +264,8 @@ def test_left_outer_join_bug():
264264

265265
lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)
266266

267-
exp_lidx = np.arange(len(left), dtype=np.int64)
268-
exp_ridx = -np.ones(len(left), dtype=np.int64)
267+
exp_lidx = np.arange(len(left), dtype=np.intp)
268+
exp_ridx = -np.ones(len(left), dtype=np.intp)
269269

270270
exp_ridx[left == 1] = 1
271271
exp_ridx[left == 3] = 0

pandas/tests/test_algos.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -2116,8 +2116,8 @@ def test_is_lexsorted():
21162116

21172117

21182118
def test_groupsort_indexer():
2119-
a = np.random.randint(0, 1000, 100).astype(np.int64)
2120-
b = np.random.randint(0, 1000, 100).astype(np.int64)
2119+
a = np.random.randint(0, 1000, 100).astype(np.intp)
2120+
b = np.random.randint(0, 1000, 100).astype(np.intp)
21212121

21222122
result = libalgos.groupsort_indexer(a, 1000)[0]
21232123

0 commit comments

Comments
 (0)