Skip to content

CLN: Make internal numpy sort and argsort use kind="stable" #53829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
kh_destroy_int64(table)

result = np.array(uniques, dtype=np.int64)
result.sort()
result.sort(kind="stable")
return result


Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -790,7 +790,7 @@ cdef class BaseMultiIndexCodesEngine:
ndarray[int64_t, ndim=1] new_codes, new_target_codes
ndarray[intp_t, ndim=1] sorted_indexer

target_order = np.argsort(target).astype("int64")
target_order = np.argsort(target, kind="stable").astype("int64")
target_values = target[target_order]
num_values, num_target_values = len(values), len(target_values)
new_codes, new_target_codes = (
Expand Down Expand Up @@ -831,7 +831,7 @@ cdef class BaseMultiIndexCodesEngine:
# get the indexer, and undo the sorting of `target.values`
algo = algos.backfill if method == "backfill" else algos.pad
sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
return sorted_indexer[np.argsort(target_order)]
return sorted_indexer[np.argsort(target_order, kind="stable")]

def get_loc(self, object key):
if is_definitely_invalid_key(key):
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -930,7 +930,7 @@ def get_level_sorter(
Argsort for a single level of a multi-index, keeping the order of higher
levels unchanged. `starts` points to starts of same-key indices w.r.t.
leading levels; equivalent to:
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='stable')
+ starts[i] for i in range(len(starts) - 1)])

Parameters
Expand All @@ -948,7 +948,7 @@ def get_level_sorter(

for i in range(len(starts) - 1):
l, r = starts[i], starts[i + 1]
out[l:r] = l + codes[l:r].argsort(kind="mergesort")
out[l:r] = l + codes[l:r].argsort(kind="stable")

return out

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1041,7 +1041,7 @@ def mode(

npresult = htable.mode(values, dropna=dropna, mask=mask)
try:
npresult = np.sort(npresult)
npresult = np.sort(npresult, kind="stable")
except TypeError as err:
warnings.warn(
f"Unable to sort modes: {err}",
Expand Down Expand Up @@ -1593,7 +1593,7 @@ def safe_sort(
ordered = _sort_mixed(values)
else:
try:
sorter = values.argsort()
sorter = values.argsort(kind="stable")
ordered = values.take(sorter)
except TypeError:
# Previous sorters failed or were not applicable, try `_sort_mixed`
Expand Down Expand Up @@ -1634,7 +1634,7 @@ def safe_sort(

if use_na_sentinel:
# take_nd is faster, but only works for na_sentinels of -1
order2 = sorter.argsort()
order2 = sorter.argsort(kind="stable")
new_codes = take_nd(order2, codes, fill_value=-1)
if verify:
mask = (codes < -len(values)) | (codes >= len(values))
Expand Down Expand Up @@ -1663,8 +1663,8 @@ def _sort_mixed(values) -> AnyArrayLike:
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
null_pos = np.array([isna(x) for x in values], dtype=bool)
num_pos = ~str_pos & ~null_pos
str_argsort = np.argsort(values[str_pos])
num_argsort = np.argsort(values[num_pos])
str_argsort = np.argsort(values[str_pos], kind="stable")
num_argsort = np.argsort(values[num_pos], kind="stable")
# convert boolean arrays to positional indices, then order by underlying values
str_locs = str_pos.nonzero()[0].take(str_argsort)
num_locs = num_pos.nonzero()[0].take(num_argsort)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1691,7 +1691,7 @@ def __setitem__(self, key, value) -> None:
raise ValueError("Length of indexer and values mismatch")
if len(indices) == 0:
return
argsort = np.argsort(indices)
argsort = np.argsort(indices, kind="stable")
indices = indices[argsort]
value = value.take(argsort)
mask = np.zeros(len(self), dtype=np.bool_)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def recode_for_groupby(

take_codes = unique_codes[unique_codes != -1]
if sort:
take_codes = np.sort(take_codes)
take_codes = np.sort(take_codes, kind="stable")

# we recode according to the uniques
categories = c.categories.take(take_codes)
Expand All @@ -76,7 +76,7 @@ def recode_for_groupby(
# GH 38140: exclude nan from indexer for categories
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
if sort:
unique_notnan_codes = np.sort(unique_notnan_codes)
unique_notnan_codes = np.sort(unique_notnan_codes, kind="stable")
if len(all_codes) > len(unique_notnan_codes):
# GH 13179: All categories need to be present, even if missing from the data
missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ def _apply_filter(self, indices, dropna):
if len(indices) == 0:
indices = np.array([], dtype="int64")
else:
indices = np.sort(np.concatenate(indices))
indices = np.sort(np.concatenate(indices), kind="stable")
if dropna:
filtered = self._selected_obj.take(indices, axis=self.axis)
else:
Expand Down Expand Up @@ -2837,9 +2837,9 @@ def _value_counts(
if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result_series = result_series.sort_values(ascending=ascending).sort_index(
level=index_level, sort_remaining=False
)
result_series = result_series.sort_values(
ascending=ascending, kind="stable"
).sort_index(level=index_level, sort_remaining=False, kind="stable")

result: Series | DataFrame
if self.as_index:
Expand Down Expand Up @@ -3890,7 +3890,7 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None):
limit = -1

ids, _, _ = self.grouper.group_info
sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
sorted_labels = np.argsort(ids, kind="stable").astype(np.intp, copy=False)
if direction == "bfill":
sorted_labels = sorted_labels[::-1]

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def _set_grouper(
# before we call _grouper.take.
assert self._grouper is not None
if self._indexer is not None:
reverse_indexer = self._indexer.argsort()
reverse_indexer = self._indexer.argsort(kind="stable")
unsorted_ax = self._grouper.take(reverse_indexer)
ax = unsorted_ax.take(obj.index)
else:
Expand Down Expand Up @@ -401,7 +401,7 @@ def _set_grouper(
# use stable sort to support first, last, nth
# TODO: why does putting na_position="first" fix datetimelike cases?
indexer = self._indexer_deprecated = ax.array.argsort(
kind="mergesort", na_position="first"
kind="stable", na_position="first"
)
ax = ax.take(indexer)
obj = obj.take(indexer, axis=self.axis)
Expand Down Expand Up @@ -755,7 +755,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
ucodes = algorithms.unique1d(cat.codes)
ucodes = ucodes[ucodes != -1]
if self._sort:
ucodes = np.sort(ucodes)
ucodes = np.sort(ucodes, kind="stable")
else:
ucodes = np.arange(len(categories))

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3542,7 +3542,7 @@ def _intersection_via_get_indexer(
if sort is False:
# sort bc we want the elements in the same order they are in self
# unnecessary in the case with sort=None bc we will sort later
taker = np.sort(taker)
taker = np.sort(taker, kind="stable")

result: MultiIndex | ExtensionArray | np.ndarray
if isinstance(left_unique, ABCMultiIndex):
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,9 @@ def _combine(
return self.make_empty()

# FIXME: optimization potential
indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
indexer = np.sort(
np.concatenate([b.mgr_locs.as_array for b in blocks]), kind="stable"
)
inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

new_blocks: list[Block] = []
Expand Down Expand Up @@ -2249,7 +2251,7 @@ def _merge_blocks(
bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

argsort = np.argsort(new_mgr_locs)
argsort = np.argsort(new_mgr_locs, kind="stable")
new_values = new_values[argsort]
new_mgr_locs = new_mgr_locs[argsort]

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/methods/selectn.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def compute(self, method: str) -> Series:
# slow method
if n >= len(self.obj):
ascending = method == "nsmallest"
return self.obj.sort_values(ascending=ascending).head(n)
return self.obj.sort_values(ascending=ascending, kind="stable").head(n)

# fast method
new_dtype = dropped.dtype
Expand Down Expand Up @@ -141,7 +141,7 @@ def compute(self, method: str) -> Series:
# here because kth_smallest will modify its input
kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
(ns,) = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind="mergesort")]
inds = ns[arr[ns].argsort(kind="stable")]

if self.keep != "all":
inds = inds[:n]
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def get_group_index_sorter(
shape = map(len, keys)
that is, linear in the number of combinations (cartesian product) of unique
values of groupby keys. This can be huge when doing multi-key groupby.
np.argsort(kind='mergesort') is O(count x log(count)) where count is the
np.argsort(kind='stable') is O(count x log(count)) where count is the
length of the data-frame;
Both algorithms are `stable` sorts, and that is necessary for correctness of
groupby operations. e.g. consider:
Expand Down Expand Up @@ -680,7 +680,7 @@ def get_group_index_sorter(
)
# sorter _should_ already be intp, but mypy is not yet able to verify
else:
sorter = group_index.argsort(kind="mergesort")
sorter = group_index.argsort(kind="stable")
return ensure_platform_int(sorter)


Expand Down
2 changes: 1 addition & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@ def _read_value_labels(self) -> None:
val = np.frombuffer(
self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
)
ii = np.argsort(off)
ii = np.argsort(off, kind="stable")
off = off[ii]
val = val[ii]
txt = self._path_or_buf.read(txtlen)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_numpy_argsort(self):
tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)

tm.assert_numpy_array_equal(
np.argsort(c, kind="mergesort"), expected, check_dtype=False
np.argsort(c, kind="stable"), expected, check_dtype=False
)

msg = "the 'axis' parameter is not supported"
Expand Down
19 changes: 3 additions & 16 deletions pandas/tests/frame/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import pandas as pd
import pandas._testing as tm
from pandas.util.version import Version


@pytest.fixture
Expand Down Expand Up @@ -156,28 +155,16 @@ def test_nlargest_n_identical_values(self):
[["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]],
)
@pytest.mark.parametrize("n", range(1, 6))
def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request):
def test_nlargest_n_duplicate_index(self, df_duplicates, n, order):
# GH#13412

df = df_duplicates
result = df.nsmallest(n, order)
expected = df.sort_values(order).head(n)
expected = df.sort_values(order, kind="stable").head(n)
tm.assert_frame_equal(result, expected)

result = df.nlargest(n, order)
expected = df.sort_values(order, ascending=False).head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5
):
request.node.add_marker(
pytest.mark.xfail(
reason=(
"pandas default unstable sorting of duplicates"
"issue with numpy>=1.25 with AVX instructions"
),
strict=False,
)
)
expected = df.sort_values(order, ascending=False, kind="stable").head(n)
tm.assert_frame_equal(result, expected)

def test_nlargest_duplicate_keep_all_ties(self):
Expand Down
45 changes: 12 additions & 33 deletions pandas/tests/frame/methods/test_sort_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
date_range,
)
import pandas._testing as tm
from pandas.util.version import Version


class TestDataFrameSortValues:
Expand Down Expand Up @@ -856,38 +855,26 @@ def ascending(request):

class TestSortValuesLevelAsStr:
def test_sort_index_level_and_column_label(
self, df_none, df_idx, sort_names, ascending, request
self, df_none, df_idx, sort_names, ascending
):
# GH#14353
if (
Version(np.__version__) >= Version("1.25")
and request.node.callspec.id == "df_idx0-inner-True"
):
request.node.add_marker(
pytest.mark.xfail(
reason=(
"pandas default unstable sorting of duplicates"
"issue with numpy>=1.25 with AVX instructions"
),
strict=False,
)
)

# Get index levels from df_idx
levels = df_idx.index.names

# Compute expected by sorting on columns and then setting the index
expected = df_none.sort_values(
by=sort_names, ascending=ascending, axis=0
by=sort_names, ascending=ascending, axis=0, kind="stable"
).set_index(levels)

# Compute result by sorting on a mix of columns and index levels
result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)
result = df_idx.sort_values(
by=sort_names, ascending=ascending, axis=0, kind="stable"
)

tm.assert_frame_equal(result, expected)

def test_sort_column_level_and_index_label(
self, df_none, df_idx, sort_names, ascending, request
self, df_none, df_idx, sort_names, ascending
):
# GH#14353

Expand All @@ -898,25 +885,17 @@ def test_sort_column_level_and_index_label(
# transposing. For some cases this will result in a frame with
# multiple column levels
expected = (
df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
df_none.sort_values(
by=sort_names, ascending=ascending, axis=0, kind="stable"
)
.set_index(levels)
.T
)

# Compute result by transposing and sorting on axis=1.
result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)

if Version(np.__version__) >= Version("1.25"):
request.node.add_marker(
pytest.mark.xfail(
reason=(
"pandas default unstable sorting of duplicates"
"issue with numpy>=1.25 with AVX instructions"
),
strict=False,
)
)

result = df_idx.T.sort_values(
by=sort_names, ascending=ascending, axis=1, kind="stable"
)
tm.assert_frame_equal(result, expected)

def test_sort_values_validate_ascending_for_value_error(self):
Expand Down
Loading