Skip to content

Bug: Save original index and remap after function completes #61116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :meth:`Series.nlargest` now uses a stable sort internally and will preserve the original ordering of equal values (:issue:`55767`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke - there is a change in Series.nlargest here where previously we weren't using a stable sorting algorithm in certain cases. Now the only dtype I think this can possibly affect is object (as otherwise equal implies identical), and even there it is quite uncommon to have an impact. Perhaps the line needs to make it clearer that this is unlikely to have any impact — or are you thinking it isn't even worth mentioning?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah gotcha. OK, doesn't hurt to include it in the whatsnew notes.

- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
Expand Down Expand Up @@ -593,6 +594,7 @@ Performance improvements
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
Expand Down
43 changes: 32 additions & 11 deletions pandas/core/methods/selectn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import (
TYPE_CHECKING,
Generic,
Literal,
cast,
final,
)
Expand Down Expand Up @@ -54,7 +55,9 @@


class SelectN(Generic[NDFrameT]):
def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
def __init__(
self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
) -> None:
self.obj = obj
self.n = n
self.keep = keep
Expand Down Expand Up @@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
if n <= 0:
return self.obj[[]]

dropped = self.obj.dropna()
nan_index = self.obj.drop(dropped.index)
# Save index and reset to default index to avoid performance impact
# from when index contains duplicates
original_index: Index = self.obj.index
default_index = self.obj.reset_index(drop=True)

# slow method
if n >= len(self.obj):
# Slower method used when taking the full length of the series
# In this case, it is equivalent to a sort.
if n >= len(default_index):
ascending = method == "nsmallest"
return self.obj.sort_values(ascending=ascending).head(n)
result = default_index.sort_values(ascending=ascending, kind="stable").head(
n
)
result.index = original_index.take(result.index)
return result

# Fast method used in the general case
dropped = default_index.dropna()
nan_index = default_index.drop(dropped.index)

# fast method
new_dtype = dropped.dtype

# Similar to algorithms._ensure_data
Expand Down Expand Up @@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
else:
kth_val = np.nan
(ns,) = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind="mergesort")]
inds = ns[arr[ns].argsort(kind="stable")]

if self.keep != "all":
inds = inds[:n]
Expand All @@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
# reverse indices
inds = narr - 1 - inds

return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
result.index = original_index.take(result.index)
return result


class SelectNFrame(SelectN[DataFrame]):
Expand All @@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
nordered : DataFrame
"""

def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
def __init__(
self,
obj: DataFrame,
n: int,
keep: Literal["first", "last", "all"],
columns: IndexLabel,
) -> None:
super().__init__(obj, n, keep)
if not is_list_like(columns) or isinstance(columns, tuple):
columns = [columns]
Expand Down Expand Up @@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:

ascending = method == "nsmallest"

return frame.sort_values(columns, ascending=ascending, kind="mergesort")
return frame.sort_values(columns, ascending=ascending, kind="stable")
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
index=[0, 0, 1, 1, 1],
)
result = df.nsmallest(n, order)
expected = df.sort_values(order).head(n)
expected = df.sort_values(order, kind="stable").head(n)
tm.assert_frame_equal(result, expected)

result = df.nlargest(n, order)
expected = df.sort_values(order, ascending=False).head(n)
expected = df.sort_values(order, ascending=False, kind="stable").head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
):
Expand Down