From 2e2a02fa3876b246245d58aa16aa0ed504a8c1d4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 18 Oct 2022 20:56:54 -0400 Subject: [PATCH 1/5] safe_sort_index perf for EA backed indexes --- asv_bench/benchmarks/join_merge.py | 21 ++++++++++++++------- pandas/core/algorithms.py | 9 ++++----- pandas/core/indexes/api.py | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7a25f77da1c82..2309347ac96d8 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -97,13 +97,13 @@ class ConcatIndexDtype: params = ( ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"], + ["monotonic", "non_monotonic", "has_na"], [0, 1], [True, False], - [True, False], ) - param_names = ["dtype", "axis", "sort", "is_monotonic"] + param_names = ["dtype", "structure", "axis", "sort"] - def setup(self, dtype, axis, sort, is_monotonic): + def setup(self, dtype, structure, axis, sort): N = 10_000 if dtype == "datetime64[ns]": vals = date_range("1970-01-01", periods=N) @@ -115,14 +115,21 @@ def setup(self, dtype, axis, sort, is_monotonic): raise NotImplementedError idx = Index(vals, dtype=dtype) - if is_monotonic: + + if structure == "monotonic": idx = idx.sort_values() - else: + elif structure == "non_monotonic": idx = idx[::-1] + elif structure == "has_na": + if not idx._can_hold_na: + raise NotImplementedError + idx = Index([None], dtype=dtype).append(idx) + else: + raise NotImplementedError - self.series = [Series(i, idx[i:]) for i in range(5)] + self.series = [Series(i, idx[:-i]) for i in range(1, 6)] - def time_concat_series(self, dtype, axis, sort, is_monotonic): + def time_concat_series(self, dtype, structure, axis, sort): concat(self.series, axis=axis, sort=sort) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0edb711e8824e..48c03c0208503 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -102,7 +102,6 @@ Categorical, DataFrame, Index, - MultiIndex, Series, ) from pandas.core.arrays import ( @@ -1766,7 +1765,7 @@ def safe_sort( na_sentinel: int | None = -1, assume_unique: bool = False, verify: bool = True, -) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]: +) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]: """ Sort ``values`` and reorder corresponding ``codes``. @@ -1795,7 +1794,7 @@ def safe_sort( Returns ------- - ordered : ndarray or MultiIndex + ordered : AnyArrayLike Sorted ``values`` new_codes : ndarray Reordered ``codes``; returned when ``codes`` is not None. @@ -1814,7 +1813,7 @@ def safe_sort( "Only list-like objects are allowed to be passed to safe_sort as values" ) - if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCMultiIndex)): + if not is_array_like(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any], @@ -1824,7 +1823,7 @@ def safe_sort( values = np.asarray(values, dtype=dtype) # type: ignore[arg-type] sorter = None - ordered: np.ndarray | MultiIndex + ordered: AnyArrayLike if ( not is_extension_array_dtype(values) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index fd8fa50dab99e..58dd207bb4353 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -190,7 +190,7 @@ def safe_sort_index(index: Index) -> Index: except TypeError: pass else: - if isinstance(array_sorted, MultiIndex): + if isinstance(array_sorted, Index): return array_sorted array_sorted = cast(np.ndarray, array_sorted) From 5ad3fbd17a01fe256c7f095fa3c7952ccae9b9f2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 18 Oct 2022 22:06:38 -0400 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3e376d15a5a87..88a7f2ace273b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -156,7 +156,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) +- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). From e4c6c8ae5f9f13cedc919c0c5311f8f0288bd183 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 19 Oct 2022 06:36:14 -0400 Subject: [PATCH 3/5] _sort_mixed to avoid np conversion --- pandas/core/algorithms.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 41f31ff895c2c..205db9f961199 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1893,15 +1893,18 @@ def safe_sort( return ordered, ensure_platform_int(new_codes) -def _sort_mixed(values) -> np.ndarray: +def _sort_mixed(values) -> AnyArrayLike: """order ints before strings before nulls in 1d arrays""" str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) null_pos = np.array([isna(x) for x in values], dtype=bool) - nums = np.sort(values[~str_pos & ~null_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate( - [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])] - ) + num_pos = ~str_pos & ~null_pos + str_argsort = np.argsort(values[str_pos]) + num_argsort = np.argsort(values[num_pos]) + str_locs = str_pos.nonzero()[0].take(str_argsort) + num_locs = num_pos.nonzero()[0].take(num_argsort) + null_locs = null_pos.nonzero()[0] + locs = np.concatenate([num_locs, str_locs, null_locs]) + return values.take(locs) def _sort_tuples(values: np.ndarray) -> np.ndarray: From c21c33fbc0990773f0c9843e7dfe49906a4181ba Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 19 Oct 2022 06:41:16 -0400 Subject: [PATCH 4/5] update test --- pandas/tests/window/test_pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6c3f3f7075ce0..04132ced044fc 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -252,7 +252,7 @@ def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f): tm.assert_index_equal( result.index.levels[0], pairwise_frames.index, check_names=False ) - tm.assert_numpy_array_equal( + tm.assert_index_equal( safe_sort(result.index.levels[1]), safe_sort(pairwise_frames.columns.unique()), ) @@ -310,7 +310,7 @@ def test_pairwise_with_other( tm.assert_index_equal( result.index.levels[0], pairwise_frames.index, check_names=False ) - tm.assert_numpy_array_equal( + tm.assert_index_equal( safe_sort(result.index.levels[1]), safe_sort(pairwise_other_frame.columns.unique()), ) From 55cdf5208245926febfbd0fe4c0b62b4f84ed3de Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 20 Oct 2022 19:09:30 -0400 Subject: [PATCH 5/5] add comment --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 205db9f961199..8ab1789ec04cf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1900,6 +1900,7 @@ def _sort_mixed(values) -> AnyArrayLike: num_pos = ~str_pos & ~null_pos str_argsort = np.argsort(values[str_pos]) num_argsort = np.argsort(values[num_pos]) + # convert boolean arrays to positional indices, then order by underlying values str_locs = str_pos.nonzero()[0].take(str_argsort) num_locs = num_pos.nonzero()[0].take(num_argsort) null_locs = null_pos.nonzero()[0]