Skip to content

CLN: intp_t instead of int64_t for indexers in libs funcs #40475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits on Mar 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -199,20 +199,23 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):

Returns
-------
tuple
1-d indexer ordered by groups, group counts.
ndarray[intp_t, ndim=1]
Indexer
ndarray[int64_t, ndim=1]
Group Counts

Notes
-----
This is a reverse of the label factorization process.
"""
cdef:
Py_ssize_t i, loc, label, n
ndarray[int64_t] counts, where, result
ndarray[int64_t] counts, where
ndarray[intp_t] indexer

counts = np.zeros(ngroups + 1, dtype=np.int64)
n = len(index)
result = np.zeros(n, dtype=np.int64)
indexer = np.zeros(n, dtype=np.intp)
where = np.zeros(ngroups + 1, dtype=np.int64)

with nogil:
Expand All @@ -228,10 +231,10 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
# this is our indexer
for i in range(n):
label = index[i] + 1
result[where[label]] = i
indexer[where[label]] = i
where[label] += 1

return result, counts
return indexer, counts


@cython.boundscheck(False)
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/algos_take_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
{{else}}
def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
{{endif}}
const int64_t[:] indexer,
const intp_t[:] indexer,
{{c_type_out}}[:] out,
fill_value=np.nan):

Expand Down Expand Up @@ -102,7 +102,7 @@ def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[int64_t] indexer,
ndarray[intp_t] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):
cdef:
Expand Down Expand Up @@ -156,7 +156,7 @@ def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
{{else}}
def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}
ndarray[int64_t] indexer,
ndarray[intp_t] indexer,
{{c_type_out}}[:, :] out,
fill_value=np.nan):

Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ from numpy cimport (
int16_t,
int32_t,
int64_t,
intp_t,
ndarray,
uint8_t,
uint16_t,
Expand Down Expand Up @@ -141,6 +142,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
Py_ssize_t i, j, N, K, ngroups, size
ndarray[int64_t] _counts
ndarray[float64_t, ndim=2] data
ndarray[intp_t] indexer
float64_t* ptr

assert min_count == -1, "'min_count' only used in add and prod"
Expand Down
18 changes: 12 additions & 6 deletions pandas/_libs/join.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
ndarray[intp_t] left_sorter, right_sorter
ndarray[int64_t] left_count, right_count
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Expand Down Expand Up @@ -84,8 +85,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
ndarray rev
ndarray[int64_t] left_count, right_count
ndarray[intp_t] rev, left_sorter, right_sorter
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
Expand Down Expand Up @@ -157,7 +158,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
ndarray[intp_t] left_sorter, right_sorter
ndarray[int64_t] left_count, right_count
ndarray[int64_t] left_indexer, right_indexer
int64_t lc, rc
int64_t left_pos = 0, right_pos = 0
Expand Down Expand Up @@ -215,12 +217,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
_get_result_indexer(right_sorter, right_indexer))


cdef _get_result_indexer(ndarray[int64_t] sorter, ndarray[int64_t] indexer):
cdef ndarray[int64_t] _get_result_indexer(
ndarray[intp_t] sorter, ndarray[int64_t] indexer
):
if len(sorter) > 0:
# cython-only equivalent to
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
res = np.empty(len(indexer), dtype=np.int64)
take_1d_int64_int64(sorter, indexer, res, -1)
take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1)
# FIXME: sorter is intp_t, not int64_t, opposite for indexer;
# will this break on 32bit builds?
else:
# length-0 case
res = np.empty(len(indexer), dtype=np.int64)
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/array_algos/take.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,10 @@ def _take_nd_ndarray(
) -> np.ndarray:

if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.int64)
indexer = np.arange(arr.shape[axis], dtype=np.intp)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = ensure_int64(indexer, copy=False)
indexer = ensure_platform_int(indexer)
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
arr, indexer, out, fill_value, allow_fill
)
Expand Down Expand Up @@ -317,7 +317,7 @@ def _get_take_nd_function(
if func is None:

def func(arr, indexer, out, fill_value=np.nan):
indexer = ensure_int64(indexer)
indexer = ensure_platform_int(indexer)
_take_nd_object(
arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
)
Expand Down Expand Up @@ -468,7 +468,7 @@ def wrapper(

def _take_nd_object(
arr: np.ndarray,
indexer: np.ndarray,
indexer: np.ndarray, # np.ndarray[np.intp]
out: np.ndarray,
axis: int,
fill_value,
Expand Down Expand Up @@ -544,4 +544,5 @@ def _take_preprocess_indexer_and_fill_value(
# to crash when trying to cast it to dtype)
dtype, fill_value = arr.dtype, arr.dtype.type()

indexer = ensure_platform_int(indexer)
return indexer, dtype, fill_value, mask_info
5 changes: 3 additions & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1961,7 +1961,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:

Returns
-------
dict of categories -> indexers
Dict[Hashable, np.ndarray[np.intp]]
dict of categories -> indexers

Examples
--------
Expand All @@ -1979,7 +1980,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
"""
categories = self.categories
r, counts = libalgos.groupsort_indexer(
self.codes.astype("int64"), categories.size
self.codes.astype("int64", copy=False), categories.size
)
counts = counts.cumsum()
_result = (r[start:end] for start, end in zip(counts, counts[1:]))
Expand Down
1 change: 1 addition & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ def get_group_index_sorter(
)
if do_groupsort:
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
# sorter _should_ already be intp, but mypy is not yet able to verify
else:
sorter = group_index.argsort(kind="mergesort")
return ensure_platform_int(sorter)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1717,9 +1717,9 @@ def test_groupby_categorical_indices_unused_categories():
grouped = df.groupby("key", sort=False)
result = grouped.indices
expected = {
"b": np.array([0, 1], dtype="int64"),
"a": np.array([2], dtype="int64"),
"c": np.array([], dtype="int64"),
"b": np.array([0, 1], dtype="intp"),
"a": np.array([2], dtype="intp"),
"c": np.array([], dtype="intp"),
}
assert result.keys() == expected.keys()
for key in result.keys():
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2123,19 +2123,19 @@ def test_groupsort_indexer():

# need to use a stable sort
# np.argsort returns int, groupsort_indexer
# always returns int64
# always returns intp
expected = np.argsort(a, kind="mergesort")
expected = expected.astype(np.int64)
expected = expected.astype(np.intp)

tm.assert_numpy_array_equal(result, expected)

# compare with lexsort
# np.lexsort returns int, groupsort_indexer
# always returns int64
# always returns intp
key = a * 1000 + b
result = libalgos.groupsort_indexer(key, 1000000)[0]
expected = np.lexsort((b, a))
expected = expected.astype(np.int64)
expected = expected.astype(np.intp)

tm.assert_numpy_array_equal(result, expected)

Expand Down