CLN: ensure_platform_int earlier #40528

Merged: 11 commits, Mar 23, 2021

13 changes: 6 additions & 7 deletions pandas/_libs/algos.pyx
@@ -191,7 +191,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:

@cython.boundscheck(False)
@cython.wraparound(False)
def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
"""
Compute a 1-d indexer.

@@ -200,7 +200,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):

Parameters
----------
index: int64 ndarray
index: np.ndarray[np.intp]
Mappings from group -> position.
ngroups: int64
Number of groups.
@@ -209,7 +209,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
-------
ndarray[intp_t, ndim=1]
Indexer
ndarray[int64_t, ndim=1]
ndarray[intp_t, ndim=1]
Group Counts

Notes
@@ -218,13 +218,12 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
"""
cdef:
Py_ssize_t i, loc, label, n
ndarray[int64_t] counts, where
ndarray[intp_t] indexer
ndarray[intp_t] indexer, where, counts

counts = np.zeros(ngroups + 1, dtype=np.int64)
counts = np.zeros(ngroups + 1, dtype=np.intp)
n = len(index)
indexer = np.zeros(n, dtype=np.intp)
where = np.zeros(ngroups + 1, dtype=np.int64)
where = np.zeros(ngroups + 1, dtype=np.intp)

with nogil:

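The hunk above moves groupsort_indexer to intp labels in and intp counts out, so callers no longer round-trip through int64. As a rough NumPy sketch of the counting sort it performs (an illustration of the semantics, not the Cython implementation; it assumes labels lie in [-1, ngroups) with -1 meaning "missing" and counted in slot 0):

```python
import numpy as np

def groupsort_indexer_sketch(labels: np.ndarray, ngroups: int):
    """Stable counting sort of group labels; both outputs are intp."""
    labels = np.asarray(labels, dtype=np.intp)
    counts = np.zeros(ngroups + 1, dtype=np.intp)
    indexer = np.zeros(len(labels), dtype=np.intp)

    # Count group sizes, shifting by 1 so the -1 (missing) group lands in slot 0.
    np.add.at(counts, labels + 1, 1)

    # Starting write position for each group in the sorted output.
    where = np.zeros(ngroups + 1, dtype=np.intp)
    where[1:] = np.cumsum(counts[:-1])

    # Scatter original positions into group-sorted order.
    for i, lab in enumerate(labels):
        indexer[where[lab + 1]] = i
        where[lab + 1] += 1

    return indexer, counts

indexer, counts = groupsort_indexer_sketch(np.array([1, 0, 1, -1]), ngroups=2)
print(indexer)  # [3 1 0 2] -> the missing label first, then group 0, then group 1
print(counts)   # [1 1 2]   -> one missing, one in group 0, two in group 1
```
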
26 changes: 26 additions & 0 deletions pandas/_libs/algos_take_helper.pxi.in
@@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
# take_1d, take_2d
# ----------------------------------------------------------------------


@cython.wraparound(False)
@cython.boundscheck(False)
def take_1d_intp_intp(
const intp_t[:] values,
const intp_t[:] indexer,
intp_t[::1] out,
intp_t fill_value=-1,
):
cdef:
Py_ssize_t i, n, idx
intp_t fv

n = indexer.shape[0]

fv = fill_value

with nogil:
for i in range(n):
idx = indexer[i]
if idx == -1:
out[i] = fv
else:
out[i] = values[idx]


{{py:

# c_type_in, c_type_out
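take_1d_intp_intp above is a hand-written intp specialization added alongside the templated take helpers; its behaviour is a plain masked gather. A NumPy restatement of the same semantics (illustration only):

```python
import numpy as np

values = np.array([10, 20, 30], dtype=np.intp)
indexer = np.array([2, -1, 0, 1], dtype=np.intp)

out = np.empty(len(indexer), dtype=np.intp)
mask = indexer == -1
out[~mask] = values[indexer[~mask]]
out[mask] = -1  # fill_value, defaulting to -1 as in the Cython signature
print(out)  # [30 -1 10 20]
```
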
5 changes: 3 additions & 2 deletions pandas/_libs/groupby.pyx
@@ -37,6 +37,7 @@ from pandas._libs.util cimport (
)

from pandas._libs.algos import (
ensure_platform_int,
groupsort_indexer,
rank_1d,
take_2d_axis1_float64_float64,
@@ -111,7 +112,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
"""
cdef:
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray[intp_t] _counts
ndarray[float64_t, ndim=2] data
ndarray[intp_t] indexer
float64_t* ptr
@@ -121,7 +122,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
ngroups = len(counts)
N, K = (<object>values).shape

indexer, _counts = groupsort_indexer(labels, ngroups)
indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
counts[:] = _counts[1:]

data = np.empty((K, N), dtype=np.float64)
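group_median_float64 now passes its labels through ensure_platform_int before calling groupsort_indexer, matching the new intp signature, and keeps the per-group counts as intp. Roughly, ensure_platform_int behaves like this sketch (a simplification of the real pandas helper, which also accepts non-ndarray input):

```python
import numpy as np

def ensure_platform_int_sketch(arr):
    # Cast to the platform pointer-sized integer (np.intp) only when needed:
    # a no-op for int64 input on 64-bit builds, an explicit cast on 32-bit builds.
    arr = np.asarray(arr)
    if arr.dtype != np.intp:
        arr = arr.astype(np.intp)
    return arr

labels = np.array([0, 1, 1, 0], dtype=np.int64)
print(ensure_platform_int_sketch(labels).dtype)  # int64 on 64-bit builds, int32 on 32-bit
```
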
70 changes: 33 additions & 37 deletions pandas/_libs/join.pyx
@@ -21,10 +21,9 @@ from numpy cimport (
cnp.import_array()

from pandas._libs.algos import (
ensure_int64,
ensure_platform_int,
groupsort_indexer,
take_1d_int64_int64,
take_1d_intp_intp,
)


@@ -34,16 +33,16 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[intp_t] left_sorter, right_sorter
ndarray[int64_t] left_count, right_count
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
ndarray[intp_t] left_count, right_count
ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset

# NA group in location 0

left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
@@ -58,8 +57,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
left_pos = left_count[0]
right_pos = right_count[0]

left_indexer = np.empty(count, dtype=np.int64)
right_indexer = np.empty(count, dtype=np.int64)
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)

with nogil:
for i in range(1, max_groups + 1):
@@ -85,17 +84,17 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[int64_t] left_count, right_count
ndarray[intp_t] left_count, right_count
ndarray[intp_t] rev, left_sorter, right_sorter
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset

# NA group in location 0

left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
@@ -109,8 +108,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
left_pos = left_count[0]
right_pos = right_count[0]

left_indexer = np.empty(count, dtype=np.int64)
right_indexer = np.empty(count, dtype=np.int64)
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)

with nogil:
for i in range(1, max_groups + 1):
@@ -142,11 +141,10 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
# this is a short-cut to avoid groupsort_indexer
# otherwise, the `else` path also works in this case
rev = np.empty(len(left), dtype=np.intp)
rev.put(ensure_platform_int(left_sorter), np.arange(len(left)))
rev.put(left_sorter, np.arange(len(left)))
else:
rev, _ = groupsort_indexer(left_indexer, len(left))

rev = ensure_platform_int(rev)
right_indexer = right_indexer.take(rev)
left_indexer = left_indexer.take(rev)

@@ -159,16 +157,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[intp_t] left_sorter, right_sorter
ndarray[int64_t] left_count, right_count
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
int64_t left_pos = 0, right_pos = 0
ndarray[intp_t] left_count, right_count
ndarray[intp_t] left_indexer, right_indexer
intp_t lc, rc
intp_t left_pos = 0, right_pos = 0
Py_ssize_t offset, position = 0

# NA group in location 0

left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
@@ -185,8 +183,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
left_pos = left_count[0]
right_pos = right_count[0]

left_indexer = np.empty(count, dtype=np.int64)
right_indexer = np.empty(count, dtype=np.int64)
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)

with nogil:
for i in range(1, max_groups + 1):
@@ -217,31 +215,29 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
_get_result_indexer(right_sorter, right_indexer))


cdef ndarray[int64_t] _get_result_indexer(
ndarray[intp_t] sorter, ndarray[int64_t] indexer
cdef ndarray[intp_t] _get_result_indexer(
ndarray[intp_t] sorter, ndarray[intp_t] indexer
):
if len(sorter) > 0:
# cython-only equivalent to
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
res = np.empty(len(indexer), dtype=np.int64)
take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1)
# FIXME: sorter is intp_t, not int64_t, opposite for indexer;
# will this break on 32bit builds?
res = np.empty(len(indexer), dtype=np.intp)
take_1d_intp_intp(sorter, indexer, res, -1)
else:
# length-0 case
res = np.empty(len(indexer), dtype=np.int64)
res = np.empty(len(indexer), dtype=np.intp)
res[:] = -1

return res


def ffill_indexer(const int64_t[:] indexer):
def ffill_indexer(const intp_t[:] indexer):
cdef:
Py_ssize_t i, n = len(indexer)
ndarray[int64_t] result
int64_t val, last_obs
ndarray[intp_t] result
intp_t val, last_obs

result = np.empty(n, dtype=np.int64)
result = np.empty(n, dtype=np.intp)
last_obs = -1

for i in range(n):
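ffill_indexer now works in intp end to end as well. Its loop body is cut off in the hunk above, but the intended semantics are simple forward-filling of -1 markers; a Python sketch of what it computes (assuming, as in pandas, that leading -1s stay -1 because last_obs starts at -1):

```python
import numpy as np

indexer = np.array([-1, 0, -1, -1, 2, -1], dtype=np.intp)

result = np.empty_like(indexer)
last_obs = -1
for i, val in enumerate(indexer):
    if val == -1:
        result[i] = last_obs  # carry the last valid position forward
    else:
        result[i] = val
        last_obs = val
print(result)  # [-1  0  0  0  2  2]
```
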
4 changes: 2 additions & 2 deletions pandas/core/arrays/categorical.py
@@ -1981,9 +1981,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
"""
categories = self.categories
r, counts = libalgos.groupsort_indexer(
self.codes.astype("int64", copy=False), categories.size
ensure_platform_int(self.codes), categories.size
)
counts = counts.cumsum()
counts = ensure_int64(counts).cumsum()
_result = (r[start:end] for start, end in zip(counts, counts[1:]))
return dict(zip(categories, _result))

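The categorical change only adjusts the dtypes fed to groupsort_indexer (intp codes in, an explicit int64 cast before the cumsum); the result of _reverse_indexer is unchanged. For reference, it maps each category to the positions where it occurs, roughly:

```python
import pandas as pd

cat = pd.Categorical(["a", "b", "a", "c", "a"])
# _reverse_indexer is the private helper touched above; array dtypes may vary
# by platform, but the mapping itself is category -> positions.
print(cat._reverse_indexer())
# {'a': array([0, 2, 4]), 'b': array([1]), 'c': array([3])}
```
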
4 changes: 2 additions & 2 deletions pandas/core/indexes/base.py
@@ -4154,7 +4154,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
return np.empty(0, dtype=np.intp)

if len(labels) == 1:
return get_group_index_sorter(labels[0])
return get_group_index_sorter(ensure_platform_int(labels[0]))

# find indexers of beginning of each set of
# same-key labels w.r.t all but last level
@@ -4224,7 +4224,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
if level == 0: # outer most level, take the fast route
ngroups = 1 + new_lev_codes.max()
left_indexer, counts = libalgos.groupsort_indexer(
ensure_int64(new_lev_codes), ngroups
new_lev_codes, ngroups
)

# missing values are placed first; drop them!
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
@@ -604,7 +604,7 @@ def get_group_index_sorter(
(alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator]
)
if do_groupsort:
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
sorter, _ = algos.groupsort_indexer(ensure_platform_int(group_index), ngroups)
# sorter _should_ already be intp, but mypy is not yet able to verify
else:
sorter = group_index.argsort(kind="mergesort")
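For context, get_group_index_sorter chooses between the counting sort (groupsort_indexer) and a mergesort argsort using the heuristic visible at the top of the hunk. A sketch of that decision, where alpha and beta are illustrative placeholders rather than the exact constants pandas tunes:

```python
import numpy as np

def choose_sorter(group_index: np.ndarray, ngroups: int,
                  alpha: float = 0.0, beta: float = 1.0) -> str:
    # Counting sort is roughly O(count + ngroups); mergesort argsort is
    # O(count * log(count)), so the counting sort wins when ngroups is small
    # relative to the number of rows.
    count = len(group_index)
    do_groupsort = count > 0 and (alpha + beta * ngroups) < (count * np.log(count))
    return "groupsort_indexer" if do_groupsort else "argsort(kind='mergesort')"

print(choose_sorter(np.zeros(1_000_000, dtype=np.intp), ngroups=10))  # groupsort_indexer
```
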
4 changes: 2 additions & 2 deletions pandas/tests/libs/test_join.py
@@ -264,8 +264,8 @@ def test_left_outer_join_bug():

lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)

exp_lidx = np.arange(len(left), dtype=np.int64)
exp_ridx = -np.ones(len(left), dtype=np.int64)
exp_lidx = np.arange(len(left), dtype=np.intp)
exp_ridx = -np.ones(len(left), dtype=np.intp)

exp_ridx[left == 1] = 1
exp_ridx[left == 3] = 0
4 changes: 2 additions & 2 deletions pandas/tests/test_algos.py
@@ -2116,8 +2116,8 @@ def test_is_lexsorted():


def test_groupsort_indexer():
a = np.random.randint(0, 1000, 100).astype(np.int64)
b = np.random.randint(0, 1000, 100).astype(np.int64)
a = np.random.randint(0, 1000, 100).astype(np.intp)
b = np.random.randint(0, 1000, 100).astype(np.intp)

result = libalgos.groupsort_indexer(a, 1000)[0]

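The test now builds its labels as intp to match the new groupsort_indexer signature; the property it exercises is unchanged. For non-negative labels the returned indexer should coincide with a stable argsort of the labels (presumably what the truncated test body compares against), e.g.:

```python
import numpy as np

a = np.random.randint(0, 1000, 100).astype(np.intp)
# A stable sort groups equal labels while preserving their original order,
# which is the permutation groupsort_indexer(a, 1000)[0] is expected to return.
expected = np.argsort(a, kind="mergesort")
```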