diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7a25f77da1c82..2309347ac96d8 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -97,13 +97,13 @@ class ConcatIndexDtype: params = ( ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"], + ["monotonic", "non_monotonic", "has_na"], [0, 1], [True, False], - [True, False], ) - param_names = ["dtype", "axis", "sort", "is_monotonic"] + param_names = ["dtype", "structure", "axis", "sort"] - def setup(self, dtype, axis, sort, is_monotonic): + def setup(self, dtype, structure, axis, sort): N = 10_000 if dtype == "datetime64[ns]": vals = date_range("1970-01-01", periods=N) @@ -115,14 +115,21 @@ def setup(self, dtype, axis, sort, is_monotonic): raise NotImplementedError idx = Index(vals, dtype=dtype) - if is_monotonic: + + if structure == "monotonic": idx = idx.sort_values() - else: + elif structure == "non_monotonic": idx = idx[::-1] + elif structure == "has_na": + if not idx._can_hold_na: + raise NotImplementedError + idx = Index([None], dtype=dtype).append(idx) + else: + raise NotImplementedError - self.series = [Series(i, idx[i:]) for i in range(5)] + self.series = [Series(i, idx[:-i]) for i in range(1, 6)] - def time_concat_series(self, dtype, axis, sort, is_monotonic): + def time_concat_series(self, dtype, structure, axis, sort): concat(self.series, axis=axis, sort=sort) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e281e250d608e..8cd00724d5d30 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -218,10 +218,10 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) -- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1988753de3d01..aca5c4345d247 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -102,7 +102,6 @@ Categorical, DataFrame, Index, - MultiIndex, Series, ) from pandas.core.arrays import ( @@ -1768,7 +1767,7 @@ def safe_sort( na_sentinel: int | None = -1, assume_unique: bool = False, verify: bool = True, -) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]: +) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]: """ Sort ``values`` and reorder corresponding ``codes``. @@ -1797,7 +1796,7 @@ def safe_sort( Returns ------- - ordered : ndarray or MultiIndex + ordered : AnyArrayLike Sorted ``values`` new_codes : ndarray Reordered ``codes``; returned when ``codes`` is not None. @@ -1816,7 +1815,7 @@ def safe_sort( "Only list-like objects are allowed to be passed to safe_sort as values" ) - if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCMultiIndex)): + if not is_array_like(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any], @@ -1826,7 +1825,7 @@ def safe_sort( values = np.asarray(values, dtype=dtype) # type: ignore[arg-type] sorter = None - ordered: np.ndarray | MultiIndex + ordered: AnyArrayLike if ( not is_extension_array_dtype(values) @@ -1894,15 +1893,19 @@ def safe_sort( return ordered, ensure_platform_int(new_codes) -def _sort_mixed(values) -> np.ndarray: +def _sort_mixed(values) -> AnyArrayLike: """order ints before strings before nulls in 1d arrays""" str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) null_pos = np.array([isna(x) for x in values], dtype=bool) - nums = np.sort(values[~str_pos & ~null_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate( - [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])] - ) + num_pos = ~str_pos & ~null_pos + str_argsort = np.argsort(values[str_pos]) + num_argsort = np.argsort(values[num_pos]) + # convert boolean arrays to positional indices, then order by underlying values + str_locs = str_pos.nonzero()[0].take(str_argsort) + num_locs = num_pos.nonzero()[0].take(num_argsort) + null_locs = null_pos.nonzero()[0] + locs = np.concatenate([num_locs, str_locs, null_locs]) + return values.take(locs) def _sort_tuples(values: np.ndarray) -> np.ndarray: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index fd8fa50dab99e..58dd207bb4353 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -190,7 +190,7 @@ def safe_sort_index(index: Index) -> Index: except TypeError: pass else: - if isinstance(array_sorted, MultiIndex): + if isinstance(array_sorted, Index): return array_sorted array_sorted = cast(np.ndarray, array_sorted) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6c3f3f7075ce0..04132ced044fc 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -252,7 +252,7 @@ def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f): tm.assert_index_equal( result.index.levels[0], pairwise_frames.index, check_names=False ) - tm.assert_numpy_array_equal( + tm.assert_index_equal( safe_sort(result.index.levels[1]), safe_sort(pairwise_frames.columns.unique()), ) @@ -310,7 +310,7 @@ def test_pairwise_with_other( tm.assert_index_equal( result.index.levels[0], pairwise_frames.index, check_names=False ) - tm.assert_numpy_array_equal( + tm.assert_index_equal( safe_sort(result.index.levels[1]), safe_sort(pairwise_other_frame.columns.unique()), )