Skip to content

Commit 555bad9

Browse files
authored
Merge branch 'main' into fix-issue-61221
2 parents 8c19221 + a393c31 commit 555bad9

File tree

5 files changed

+132
-52
lines changed

5 files changed

+132
-52
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Other enhancements
6161
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
6262
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
6363
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
64+
- :meth:`Series.nlargest` now uses a stable sort internally and preserves the original ordering of equal values.
6465
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
6566
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
6667
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
@@ -593,6 +594,7 @@ Performance improvements
593594
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
594595
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
595596
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
597+
- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
596598
- :meth:`Series.str.extract` returns a :class:`RangeIndex` column instead of an :class:`Index` column when possible (:issue:`57542`)
597599
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` column instead of an :class:`Index` column when possible (:issue:`57768`)
598600
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)

pandas/core/methods/selectn.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import (
1212
TYPE_CHECKING,
1313
Generic,
14+
Literal,
1415
cast,
1516
final,
1617
)
@@ -54,7 +55,9 @@
5455

5556

5657
class SelectN(Generic[NDFrameT]):
57-
def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
58+
def __init__(
59+
self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
60+
) -> None:
5861
self.obj = obj
5962
self.n = n
6063
self.keep = keep
@@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
111114
if n <= 0:
112115
return self.obj[[]]
113116

114-
dropped = self.obj.dropna()
115-
nan_index = self.obj.drop(dropped.index)
117+
# Save index and reset to default index to avoid performance impact
118+
# from when index contains duplicates
119+
original_index: Index = self.obj.index
120+
default_index = self.obj.reset_index(drop=True)
116121

117-
# slow method
118-
if n >= len(self.obj):
122+
# Slower method used when taking the full length of the series
123+
# In this case, it is equivalent to a sort.
124+
if n >= len(default_index):
119125
ascending = method == "nsmallest"
120-
return self.obj.sort_values(ascending=ascending).head(n)
126+
result = default_index.sort_values(ascending=ascending, kind="stable").head(
127+
n
128+
)
129+
result.index = original_index.take(result.index)
130+
return result
131+
132+
# Fast method used in the general case
133+
dropped = default_index.dropna()
134+
nan_index = default_index.drop(dropped.index)
121135

122-
# fast method
123136
new_dtype = dropped.dtype
124137

125138
# Similar to algorithms._ensure_data
@@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
158171
else:
159172
kth_val = np.nan
160173
(ns,) = np.nonzero(arr <= kth_val)
161-
inds = ns[arr[ns].argsort(kind="mergesort")]
174+
inds = ns[arr[ns].argsort(kind="stable")]
162175

163176
if self.keep != "all":
164177
inds = inds[:n]
@@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
173186
# reverse indices
174187
inds = narr - 1 - inds
175188

176-
return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
189+
result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
190+
result.index = original_index.take(result.index)
191+
return result
177192

178193

179194
class SelectNFrame(SelectN[DataFrame]):
@@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
192207
nordered : DataFrame
193208
"""
194209

195-
def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
210+
def __init__(
211+
self,
212+
obj: DataFrame,
213+
n: int,
214+
keep: Literal["first", "last", "all"],
215+
columns: IndexLabel,
216+
) -> None:
196217
super().__init__(obj, n, keep)
197218
if not is_list_like(columns) or isinstance(columns, tuple):
198219
columns = [columns]
@@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
277298

278299
ascending = method == "nsmallest"
279300

280-
return frame.sort_values(columns, ascending=ascending, kind="mergesort")
301+
return frame.sort_values(columns, ascending=ascending, kind="stable")

pandas/io/pytables.py

Lines changed: 85 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from pandas._libs.lib import is_string_array
4141
from pandas._libs.tslibs import timezones
42+
from pandas.compat import HAS_PYARROW
4243
from pandas.compat._optional import import_optional_dependency
4344
from pandas.compat.pickle_compat import patch_pickle
4445
from pandas.errors import (
@@ -381,6 +382,13 @@ def read_hdf(
381382
DataFrame.to_hdf : Write a HDF file from a DataFrame.
382383
HDFStore : Low-level access to HDF files.
383384
385+
Notes
386+
-----
387+
When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
388+
and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
389+
to UTF-8, the resulting dtype will be
390+
``pd.StringDtype(storage="python", na_value=np.nan)``.
391+
384392
Examples
385393
--------
386394
>>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP
@@ -2257,6 +2265,20 @@ def convert(
22572265
# making an Index instance could throw a number of different errors
22582266
try:
22592267
new_pd_index = factory(values, **kwargs)
2268+
except UnicodeEncodeError as err:
2269+
if (
2270+
errors == "surrogatepass"
2271+
and get_option("future.infer_string")
2272+
and str(err).endswith("surrogates not allowed")
2273+
and HAS_PYARROW
2274+
):
2275+
new_pd_index = factory(
2276+
values,
2277+
dtype=StringDtype(storage="python", na_value=np.nan),
2278+
**kwargs,
2279+
)
2280+
else:
2281+
raise
22602282
except ValueError:
22612283
# if the output freq is different that what we recorded,
22622284
# it should be None (see also 'doc example part 2')
@@ -3170,12 +3192,29 @@ def read_index_node(
31703192
**kwargs,
31713193
)
31723194
else:
3173-
index = factory(
3174-
_unconvert_index(
3175-
data, kind, encoding=self.encoding, errors=self.errors
3176-
),
3177-
**kwargs,
3178-
)
3195+
try:
3196+
index = factory(
3197+
_unconvert_index(
3198+
data, kind, encoding=self.encoding, errors=self.errors
3199+
),
3200+
**kwargs,
3201+
)
3202+
except UnicodeEncodeError as err:
3203+
if (
3204+
self.errors == "surrogatepass"
3205+
and get_option("future.infer_string")
3206+
and str(err).endswith("surrogates not allowed")
3207+
and HAS_PYARROW
3208+
):
3209+
index = factory(
3210+
_unconvert_index(
3211+
data, kind, encoding=self.encoding, errors=self.errors
3212+
),
3213+
dtype=StringDtype(storage="python", na_value=np.nan),
3214+
**kwargs,
3215+
)
3216+
else:
3217+
raise
31793218

31803219
index.name = name
31813220

@@ -3311,13 +3350,24 @@ def read(
33113350
self.validate_read(columns, where)
33123351
index = self.read_index("index", start=start, stop=stop)
33133352
values = self.read_array("values", start=start, stop=stop)
3314-
result = Series(values, index=index, name=self.name, copy=False)
3315-
if (
3316-
using_string_dtype()
3317-
and isinstance(values, np.ndarray)
3318-
and is_string_array(values, skipna=True)
3319-
):
3320-
result = result.astype(StringDtype(na_value=np.nan))
3353+
try:
3354+
result = Series(values, index=index, name=self.name, copy=False)
3355+
except UnicodeEncodeError as err:
3356+
if (
3357+
self.errors == "surrogatepass"
3358+
and get_option("future.infer_string")
3359+
and str(err).endswith("surrogates not allowed")
3360+
and HAS_PYARROW
3361+
):
3362+
result = Series(
3363+
values,
3364+
index=index,
3365+
name=self.name,
3366+
copy=False,
3367+
dtype=StringDtype(storage="python", na_value=np.nan),
3368+
)
3369+
else:
3370+
raise
33213371
return result
33223372

33233373
def write(self, obj, **kwargs) -> None:
@@ -4764,7 +4814,24 @@ def read(
47644814
values = values.reshape((1, values.shape[0]))
47654815

47664816
if isinstance(values, (np.ndarray, DatetimeArray)):
4767-
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4817+
try:
4818+
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4819+
except UnicodeEncodeError as err:
4820+
if (
4821+
self.errors == "surrogatepass"
4822+
and get_option("future.infer_string")
4823+
and str(err).endswith("surrogates not allowed")
4824+
and HAS_PYARROW
4825+
):
4826+
df = DataFrame(
4827+
values.T,
4828+
columns=cols_,
4829+
index=index_,
4830+
copy=False,
4831+
dtype=StringDtype(storage="python", na_value=np.nan),
4832+
)
4833+
else:
4834+
raise
47684835
elif isinstance(values, Index):
47694836
df = DataFrame(values, columns=cols_, index=index_)
47704837
else:
@@ -4774,23 +4841,10 @@ def read(
47744841
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
47754842

47764843
# If str / string dtype is stored in meta, use that.
4777-
converted = False
47784844
for column in cols_:
47794845
dtype = getattr(self.table.attrs, f"{column}_meta", None)
47804846
if dtype in ["str", "string"]:
47814847
df[column] = df[column].astype(dtype)
4782-
converted = True
4783-
# Otherwise try inference.
4784-
if (
4785-
not converted
4786-
and using_string_dtype()
4787-
and isinstance(values, np.ndarray)
4788-
and is_string_array(
4789-
values,
4790-
skipna=True,
4791-
)
4792-
):
4793-
df = df.astype(StringDtype(na_value=np.nan))
47944848
frames.append(df)
47954849

47964850
if len(frames) == 1:
@@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
52245278
# encode if needed
52255279
if len(data):
52265280
data = (
5227-
Series(data.ravel(), copy=False)
5281+
Series(data.ravel(), copy=False, dtype="object")
52285282
.str.encode(encoding, errors)
52295283
._values.reshape(data.shape)
52305284
)
@@ -5264,7 +5318,9 @@ def _unconvert_string_array(
52645318
dtype = f"U{itemsize}"
52655319

52665320
if isinstance(data[0], bytes):
5267-
ser = Series(data, copy=False).str.decode(encoding, errors=errors)
5321+
ser = Series(data, copy=False).str.decode(
5322+
encoding, errors=errors, dtype="object"
5323+
)
52685324
data = ser.to_numpy()
52695325
data.flags.writeable = True
52705326
else:

pandas/tests/frame/methods/test_nlargest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
153153
index=[0, 0, 1, 1, 1],
154154
)
155155
result = df.nsmallest(n, order)
156-
expected = df.sort_values(order).head(n)
156+
expected = df.sort_values(order, kind="stable").head(n)
157157
tm.assert_frame_equal(result, expected)
158158

159159
result = df.nlargest(n, order)
160-
expected = df.sort_values(order, ascending=False).head(n)
160+
expected = df.sort_values(order, ascending=False, kind="stable").head(n)
161161
if Version(np.__version__) >= Version("1.25") and (
162162
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
163163
):

pandas/tests/io/pytables/test_store.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat import PY312
1311

1412
import pandas as pd
@@ -25,7 +23,6 @@
2523
timedelta_range,
2624
)
2725
import pandas._testing as tm
28-
from pandas.conftest import has_pyarrow
2926
from pandas.tests.io.pytables.common import (
3027
_maybe_remove,
3128
ensure_clean_store,
@@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
385382
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
386383

387384

388-
@pytest.mark.xfail(
389-
using_string_dtype() and has_pyarrow,
390-
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
391-
)
392385
@pytest.mark.parametrize("format", ["fixed", "table"])
393-
def test_to_hdf_errors(tmp_path, format, setup_path):
386+
def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
394387
data = ["\ud800foo"]
395-
ser = Series(data, index=Index(data))
388+
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
396389
path = tmp_path / setup_path
397390
# GH 20835
398391
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
399392

400393
result = read_hdf(path, "table", errors="surrogatepass")
401-
tm.assert_series_equal(result, ser)
394+
395+
if using_infer_string:
396+
# https://github.com/pandas-dev/pandas/pull/60993
397+
# Surrogates fallback to python storage.
398+
dtype = pd.StringDtype(storage="python", na_value=np.nan)
399+
else:
400+
dtype = "object"
401+
expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
402+
tm.assert_series_equal(result, expected)
402403

403404

404405
def test_create_table_index(setup_path):

0 commit comments

Comments
 (0)