Skip to content

Commit c667fc4

Browse files
authored
PERF: pd.concat EA-backed indexes and sort=True (#49178)
1 parent 0dce285 commit c667fc4

File tree

5 files changed: +32 −22 lines changed

asv_bench/benchmarks/join_merge.py

+14-7
Original file line number | Diff line number | Diff line change
@@ -97,13 +97,13 @@ class ConcatIndexDtype:
9797

9898
params = (
9999
["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"],
100+
["monotonic", "non_monotonic", "has_na"],
100101
[0, 1],
101102
[True, False],
102-
[True, False],
103103
)
104-
param_names = ["dtype", "axis", "sort", "is_monotonic"]
104+
param_names = ["dtype", "structure", "axis", "sort"]
105105

106-
def setup(self, dtype, axis, sort, is_monotonic):
106+
def setup(self, dtype, structure, axis, sort):
107107
N = 10_000
108108
if dtype == "datetime64[ns]":
109109
vals = date_range("1970-01-01", periods=N)
@@ -115,14 +115,21 @@ def setup(self, dtype, axis, sort, is_monotonic):
115115
raise NotImplementedError
116116

117117
idx = Index(vals, dtype=dtype)
118-
if is_monotonic:
118+
119+
if structure == "monotonic":
119120
idx = idx.sort_values()
120-
else:
121+
elif structure == "non_monotonic":
121122
idx = idx[::-1]
123+
elif structure == "has_na":
124+
if not idx._can_hold_na:
125+
raise NotImplementedError
126+
idx = Index([None], dtype=dtype).append(idx)
127+
else:
128+
raise NotImplementedError
122129

123-
self.series = [Series(i, idx[i:]) for i in range(5)]
130+
self.series = [Series(i, idx[:-i]) for i in range(1, 6)]
124131

125-
def time_concat_series(self, dtype, axis, sort, is_monotonic):
132+
def time_concat_series(self, dtype, structure, axis, sort):
126133
concat(self.series, axis=axis, sort=sort)
127134

128135

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -221,10 +221,10 @@ Performance improvements
221221
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
222222
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
223223
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
224+
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
224225
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
225226
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
226227
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
227-
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)
228228
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
229229
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
230230
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

pandas/core/algorithms.py

+14-11
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,6 @@
102102
Categorical,
103103
DataFrame,
104104
Index,
105-
MultiIndex,
106105
Series,
107106
)
108107
from pandas.core.arrays import (
@@ -1768,7 +1767,7 @@ def safe_sort(
17681767
na_sentinel: int | None = -1,
17691768
assume_unique: bool = False,
17701769
verify: bool = True,
1771-
) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
1770+
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
17721771
"""
17731772
Sort ``values`` and reorder corresponding ``codes``.
17741773
@@ -1797,7 +1796,7 @@ def safe_sort(
17971796
17981797
Returns
17991798
-------
1800-
ordered : ndarray or MultiIndex
1799+
ordered : AnyArrayLike
18011800
Sorted ``values``
18021801
new_codes : ndarray
18031802
Reordered ``codes``; returned when ``codes`` is not None.
@@ -1816,7 +1815,7 @@ def safe_sort(
18161815
"Only list-like objects are allowed to be passed to safe_sort as values"
18171816
)
18181817

1819-
if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCMultiIndex)):
1818+
if not is_array_like(values):
18201819
# don't convert to string types
18211820
dtype, _ = infer_dtype_from_array(values)
18221821
# error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
@@ -1826,7 +1825,7 @@ def safe_sort(
18261825
values = np.asarray(values, dtype=dtype) # type: ignore[arg-type]
18271826

18281827
sorter = None
1829-
ordered: np.ndarray | MultiIndex
1828+
ordered: AnyArrayLike
18301829

18311830
if (
18321831
not is_extension_array_dtype(values)
@@ -1894,15 +1893,19 @@ def safe_sort(
18941893
return ordered, ensure_platform_int(new_codes)
18951894

18961895

1897-
def _sort_mixed(values) -> np.ndarray:
1896+
def _sort_mixed(values) -> AnyArrayLike:
18981897
"""order ints before strings before nulls in 1d arrays"""
18991898
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
19001899
null_pos = np.array([isna(x) for x in values], dtype=bool)
1901-
nums = np.sort(values[~str_pos & ~null_pos])
1902-
strs = np.sort(values[str_pos])
1903-
return np.concatenate(
1904-
[nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
1905-
)
1900+
num_pos = ~str_pos & ~null_pos
1901+
str_argsort = np.argsort(values[str_pos])
1902+
num_argsort = np.argsort(values[num_pos])
1903+
# convert boolean arrays to positional indices, then order by underlying values
1904+
str_locs = str_pos.nonzero()[0].take(str_argsort)
1905+
num_locs = num_pos.nonzero()[0].take(num_argsort)
1906+
null_locs = null_pos.nonzero()[0]
1907+
locs = np.concatenate([num_locs, str_locs, null_locs])
1908+
return values.take(locs)
19061909

19071910

19081911
def _sort_tuples(values: np.ndarray) -> np.ndarray:

pandas/core/indexes/api.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -190,7 +190,7 @@ def safe_sort_index(index: Index) -> Index:
190190
except TypeError:
191191
pass
192192
else:
193-
if isinstance(array_sorted, MultiIndex):
193+
if isinstance(array_sorted, Index):
194194
return array_sorted
195195

196196
array_sorted = cast(np.ndarray, array_sorted)

pandas/tests/window/test_pairwise.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -252,7 +252,7 @@ def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
252252
tm.assert_index_equal(
253253
result.index.levels[0], pairwise_frames.index, check_names=False
254254
)
255-
tm.assert_numpy_array_equal(
255+
tm.assert_index_equal(
256256
safe_sort(result.index.levels[1]),
257257
safe_sort(pairwise_frames.columns.unique()),
258258
)
@@ -310,7 +310,7 @@ def test_pairwise_with_other(
310310
tm.assert_index_equal(
311311
result.index.levels[0], pairwise_frames.index, check_names=False
312312
)
313-
tm.assert_numpy_array_equal(
313+
tm.assert_index_equal(
314314
safe_sort(result.index.levels[1]),
315315
safe_sort(pairwise_other_frame.columns.unique()),
316316
)

0 commit comments

Comments (0)