Skip to content

Commit 555bad9

Browse files
authored
Merge branch 'main' into fix-issue-61221
2 parents 8c19221 + a393c31 commit 555bad9

File tree

5 files changed

+132
-52
lines changed

5 files changed

+132
-52
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Other enhancements
6161
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
6262
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
6363
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
64+
- :meth:`Series.nlargest` now uses a stable sort internally and preserves the original ordering of equal values.
6465
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
6566
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
6667
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
@@ -593,6 +594,7 @@ Performance improvements
593594
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
594595
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
595596
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
597+
- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
596598
- :meth:`Series.str.extract` returns a :class:`RangeIndex` column instead of an :class:`Index` column when possible (:issue:`57542`)
597599
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` column instead of an :class:`Index` column when possible (:issue:`57768`)
598600
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)

pandas/core/methods/selectn.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import (
1212
TYPE_CHECKING,
1313
Generic,
14+
Literal,
1415
cast,
1516
final,
1617
)
@@ -54,7 +55,9 @@
5455

5556

5657
class SelectN(Generic[NDFrameT]):
57-
def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
58+
def __init__(
59+
self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
60+
) -> None:
5861
self.obj = obj
5962
self.n = n
6063
self.keep = keep
@@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
111114
if n <= 0:
112115
return self.obj[[]]
113116

114-
dropped = self.obj.dropna()
115-
nan_index = self.obj.drop(dropped.index)
117+
# Save index and reset to default index to avoid performance impact
118+
# from when index contains duplicates
119+
original_index: Index = self.obj.index
120+
default_index = self.obj.reset_index(drop=True)
116121

117-
# slow method
118-
if n >= len(self.obj):
122+
# Slower method used when taking the full length of the series
123+
# In this case, it is equivalent to a sort.
124+
if n >= len(default_index):
119125
ascending = method == "nsmallest"
120-
return self.obj.sort_values(ascending=ascending).head(n)
126+
result = default_index.sort_values(ascending=ascending, kind="stable").head(
127+
n
128+
)
129+
result.index = original_index.take(result.index)
130+
return result
131+
132+
# Fast method used in the general case
133+
dropped = default_index.dropna()
134+
nan_index = default_index.drop(dropped.index)
121135

122-
# fast method
123136
new_dtype = dropped.dtype
124137

125138
# Similar to algorithms._ensure_data
@@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
158171
else:
159172
kth_val = np.nan
160173
(ns,) = np.nonzero(arr <= kth_val)
161-
inds = ns[arr[ns].argsort(kind="mergesort")]
174+
inds = ns[arr[ns].argsort(kind="stable")]
162175

163176
if self.keep != "all":
164177
inds = inds[:n]
@@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
173186
# reverse indices
174187
inds = narr - 1 - inds
175188

176-
return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
189+
result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
190+
result.index = original_index.take(result.index)
191+
return result
177192

178193

179194
class SelectNFrame(SelectN[DataFrame]):
@@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
192207
nordered : DataFrame
193208
"""
194209

195-
def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
210+
def __init__(
211+
self,
212+
obj: DataFrame,
213+
n: int,
214+
keep: Literal["first", "last", "all"],
215+
columns: IndexLabel,
216+
) -> None:
196217
super().__init__(obj, n, keep)
197218
if not is_list_like(columns) or isinstance(columns, tuple):
198219
columns = [columns]
@@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
277298

278299
ascending = method == "nsmallest"
279300

280-
return frame.sort_values(columns, ascending=ascending, kind="mergesort")
301+
return frame.sort_values(columns, ascending=ascending, kind="stable")

pandas/io/pytables.py

Lines changed: 85 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from pandas._libs.lib import is_string_array
4141
from pandas._libs.tslibs import timezones
42+
from pandas.compat import HAS_PYARROW
4243
from pandas.compat._optional import import_optional_dependency
4344
from pandas.compat.pickle_compat import patch_pickle
4445
from pandas.errors import (
@@ -381,6 +382,13 @@ def read_hdf(
381382
DataFrame.to_hdf : Write a HDF file from a DataFrame.
382383
HDFStore : Low-level access to HDF files.
383384
385+
Notes
386+
-----
387+
When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
388+
and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
389+
to UTF-8, the resulting dtype will be
390+
``pd.StringDtype(storage="python", na_value=np.nan)``.
391+
384392
Examples
385393
--------
386394
>>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP
@@ -2257,6 +2265,20 @@ def convert(
22572265
# making an Index instance could throw a number of different errors
22582266
try:
22592267
new_pd_index = factory(values, **kwargs)
2268+
except UnicodeEncodeError as err:
2269+
if (
2270+
errors == "surrogatepass"
2271+
and get_option("future.infer_string")
2272+
and str(err).endswith("surrogates not allowed")
2273+
and HAS_PYARROW
2274+
):
2275+
new_pd_index = factory(
2276+
values,
2277+
dtype=StringDtype(storage="python", na_value=np.nan),
2278+
**kwargs,
2279+
)
2280+
else:
2281+
raise
22602282
except ValueError:
22612283
# if the output freq is different that what we recorded,
22622284
# it should be None (see also 'doc example part 2')
@@ -3170,12 +3192,29 @@ def read_index_node(
31703192
**kwargs,
31713193
)
31723194
else:
3173-
index = factory(
3174-
_unconvert_index(
3175-
data, kind, encoding=self.encoding, errors=self.errors
3176-
),
3177-
**kwargs,
3178-
)
3195+
try:
3196+
index = factory(
3197+
_unconvert_index(
3198+
data, kind, encoding=self.encoding, errors=self.errors
3199+
),
3200+
**kwargs,
3201+
)
3202+
except UnicodeEncodeError as err:
3203+
if (
3204+
self.errors == "surrogatepass"
3205+
and get_option("future.infer_string")
3206+
and str(err).endswith("surrogates not allowed")
3207+
and HAS_PYARROW
3208+
):
3209+
index = factory(
3210+
_unconvert_index(
3211+
data, kind, encoding=self.encoding, errors=self.errors
3212+
),
3213+
dtype=StringDtype(storage="python", na_value=np.nan),
3214+
**kwargs,
3215+
)
3216+
else:
3217+
raise
31793218

31803219
index.name = name
31813220

@@ -3311,13 +3350,24 @@ def read(
33113350
self.validate_read(columns, where)
33123351
index = self.read_index("index", start=start, stop=stop)
33133352
values = self.read_array("values", start=start, stop=stop)
3314-
result = Series(values, index=index, name=self.name, copy=False)
3315-
if (
3316-
using_string_dtype()
3317-
and isinstance(values, np.ndarray)
3318-
and is_string_array(values, skipna=True)
3319-
):
3320-
result = result.astype(StringDtype(na_value=np.nan))
3353+
try:
3354+
result = Series(values, index=index, name=self.name, copy=False)
3355+
except UnicodeEncodeError as err:
3356+
if (
3357+
self.errors == "surrogatepass"
3358+
and get_option("future.infer_string")
3359+
and str(err).endswith("surrogates not allowed")
3360+
and HAS_PYARROW
3361+
):
3362+
result = Series(
3363+
values,
3364+
index=index,
3365+
name=self.name,
3366+
copy=False,
3367+
dtype=StringDtype(storage="python", na_value=np.nan),
3368+
)
3369+
else:
3370+
raise
33213371
return result
33223372

33233373
def write(self, obj, **kwargs) -> None:
@@ -4764,7 +4814,24 @@ def read(
47644814
values = values.reshape((1, values.shape[0]))
47654815

47664816
if isinstance(values, (np.ndarray, DatetimeArray)):
4767-
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4817+
try:
4818+
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4819+
except UnicodeEncodeError as err:
4820+
if (
4821+
self.errors == "surrogatepass"
4822+
and get_option("future.infer_string")
4823+
and str(err).endswith("surrogates not allowed")
4824+
and HAS_PYARROW
4825+
):
4826+
df = DataFrame(
4827+
values.T,
4828+
columns=cols_,
4829+
index=index_,
4830+
copy=False,
4831+
dtype=StringDtype(storage="python", na_value=np.nan),
4832+
)
4833+
else:
4834+
raise
47684835
elif isinstance(values, Index):
47694836
df = DataFrame(values, columns=cols_, index=index_)
47704837
else:
@@ -4774,23 +4841,10 @@ def read(
47744841
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
47754842

47764843
# If str / string dtype is stored in meta, use that.
4777-
converted = False
47784844
for column in cols_:
47794845
dtype = getattr(self.table.attrs, f"{column}_meta", None)
47804846
if dtype in ["str", "string"]:
47814847
df[column] = df[column].astype(dtype)
4782-
converted = True
4783-
# Otherwise try inference.
4784-
if (
4785-
not converted
4786-
and using_string_dtype()
4787-
and isinstance(values, np.ndarray)
4788-
and is_string_array(
4789-
values,
4790-
skipna=True,
4791-
)
4792-
):
4793-
df = df.astype(StringDtype(na_value=np.nan))
47944848
frames.append(df)
47954849

47964850
if len(frames) == 1:
@@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
52245278
# encode if needed
52255279
if len(data):
52265280
data = (
5227-
Series(data.ravel(), copy=False)
5281+
Series(data.ravel(), copy=False, dtype="object")
52285282
.str.encode(encoding, errors)
52295283
._values.reshape(data.shape)
52305284
)
@@ -5264,7 +5318,9 @@ def _unconvert_string_array(
52645318
dtype = f"U{itemsize}"
52655319

52665320
if isinstance(data[0], bytes):
5267-
ser = Series(data, copy=False).str.decode(encoding, errors=errors)
5321+
ser = Series(data, copy=False).str.decode(
5322+
encoding, errors=errors, dtype="object"
5323+
)
52685324
data = ser.to_numpy()
52695325
data.flags.writeable = True
52705326
else:

pandas/tests/frame/methods/test_nlargest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
153153
index=[0, 0, 1, 1, 1],
154154
)
155155
result = df.nsmallest(n, order)
156-
expected = df.sort_values(order).head(n)
156+
expected = df.sort_values(order, kind="stable").head(n)
157157
tm.assert_frame_equal(result, expected)
158158

159159
result = df.nlargest(n, order)
160-
expected = df.sort_values(order, ascending=False).head(n)
160+
expected = df.sort_values(order, ascending=False, kind="stable").head(n)
161161
if Version(np.__version__) >= Version("1.25") and (
162162
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
163163
):

pandas/tests/io/pytables/test_store.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat import PY312
1311

1412
import pandas as pd
@@ -25,7 +23,6 @@
2523
timedelta_range,
2624
)
2725
import pandas._testing as tm
28-
from pandas.conftest import has_pyarrow
2926
from pandas.tests.io.pytables.common import (
3027
_maybe_remove,
3128
ensure_clean_store,
@@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
385382
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
386383

387384

388-
@pytest.mark.xfail(
389-
using_string_dtype() and has_pyarrow,
390-
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
391-
)
392385
@pytest.mark.parametrize("format", ["fixed", "table"])
393-
def test_to_hdf_errors(tmp_path, format, setup_path):
386+
def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
394387
data = ["\ud800foo"]
395-
ser = Series(data, index=Index(data))
388+
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
396389
path = tmp_path / setup_path
397390
# GH 20835
398391
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
399392

400393
result = read_hdf(path, "table", errors="surrogatepass")
401-
tm.assert_series_equal(result, ser)
394+
395+
if using_infer_string:
396+
# https://github.com/pandas-dev/pandas/pull/60993
397+
# Surrogates fallback to python storage.
398+
dtype = pd.StringDtype(storage="python", na_value=np.nan)
399+
else:
400+
dtype = "object"
401+
expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
402+
tm.assert_series_equal(result, expected)
402403

403404

404405
def test_create_table_index(setup_path):

0 commit comments

Comments
 (0)