Skip to content

PERF: pd.concat EA-backed indexes and sort=True #49178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 22, 2022
21 changes: 14 additions & 7 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,13 @@ class ConcatIndexDtype:

params = (
["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"],
["monotonic", "non_monotonic", "has_na"],
[0, 1],
[True, False],
[True, False],
)
param_names = ["dtype", "axis", "sort", "is_monotonic"]
param_names = ["dtype", "structure", "axis", "sort"]

def setup(self, dtype, axis, sort, is_monotonic):
def setup(self, dtype, structure, axis, sort):
N = 10_000
if dtype == "datetime64[ns]":
vals = date_range("1970-01-01", periods=N)
Expand All @@ -115,14 +115,21 @@ def setup(self, dtype, axis, sort, is_monotonic):
raise NotImplementedError

idx = Index(vals, dtype=dtype)
if is_monotonic:

if structure == "monotonic":
idx = idx.sort_values()
else:
elif structure == "non_monotonic":
idx = idx[::-1]
elif structure == "has_na":
if not idx._can_hold_na:
raise NotImplementedError
idx = Index([None], dtype=dtype).append(idx)
else:
raise NotImplementedError

self.series = [Series(i, idx[i:]) for i in range(5)]
self.series = [Series(i, idx[:-i]) for i in range(1, 6)]

def time_concat_series(self, dtype, axis, sort, is_monotonic):
def time_concat_series(self, dtype, structure, axis, sort):
concat(self.series, axis=axis, sort=sort)


Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,10 @@ Performance improvements
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
Expand Down
25 changes: 14 additions & 11 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@
Categorical,
DataFrame,
Index,
MultiIndex,
Series,
)
from pandas.core.arrays import (
Expand Down Expand Up @@ -1768,7 +1767,7 @@ def safe_sort(
na_sentinel: int | None = -1,
assume_unique: bool = False,
verify: bool = True,
) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
"""
Sort ``values`` and reorder corresponding ``codes``.

Expand Down Expand Up @@ -1797,7 +1796,7 @@ def safe_sort(

Returns
-------
ordered : ndarray or MultiIndex
ordered : AnyArrayLike
Sorted ``values``
new_codes : ndarray
Reordered ``codes``; returned when ``codes`` is not None.
Expand All @@ -1816,7 +1815,7 @@ def safe_sort(
"Only list-like objects are allowed to be passed to safe_sort as values"
)

if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCMultiIndex)):
if not is_array_like(values):
# don't convert to string types
dtype, _ = infer_dtype_from_array(values)
# error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
Expand All @@ -1826,7 +1825,7 @@ def safe_sort(
values = np.asarray(values, dtype=dtype) # type: ignore[arg-type]

sorter = None
ordered: np.ndarray | MultiIndex
ordered: AnyArrayLike

if (
not is_extension_array_dtype(values)
Expand Down Expand Up @@ -1894,15 +1893,19 @@ def safe_sort(
return ordered, ensure_platform_int(new_codes)


def _sort_mixed(values) -> np.ndarray:
def _sort_mixed(values) -> AnyArrayLike:
"""order ints before strings before nulls in 1d arrays"""
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
null_pos = np.array([isna(x) for x in values], dtype=bool)
nums = np.sort(values[~str_pos & ~null_pos])
strs = np.sort(values[str_pos])
return np.concatenate(
[nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
)
num_pos = ~str_pos & ~null_pos
str_argsort = np.argsort(values[str_pos])
num_argsort = np.argsort(values[num_pos])
# convert boolean arrays to positional indices, then order by underlying values
str_locs = str_pos.nonzero()[0].take(str_argsort)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is nonzero doing here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It converts the boolean mask to positional indices within the larger array. We then reorder those positional indices using the argsort of the string subset. All of this is so that we can call `take` on the original input and avoid converting everything to an ndarray.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so the catch here is that False is 0 and True is nonzero? If so, could you add a short comment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly. I added a comment explaining the operation.

num_locs = num_pos.nonzero()[0].take(num_argsort)
null_locs = null_pos.nonzero()[0]
locs = np.concatenate([num_locs, str_locs, null_locs])
return values.take(locs)


def _sort_tuples(values: np.ndarray) -> np.ndarray:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def safe_sort_index(index: Index) -> Index:
except TypeError:
pass
else:
if isinstance(array_sorted, MultiIndex):
if isinstance(array_sorted, Index):
return array_sorted

array_sorted = cast(np.ndarray, array_sorted)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/window/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
tm.assert_index_equal(
result.index.levels[0], pairwise_frames.index, check_names=False
)
tm.assert_numpy_array_equal(
tm.assert_index_equal(
safe_sort(result.index.levels[1]),
safe_sort(pairwise_frames.columns.unique()),
)
Expand Down Expand Up @@ -310,7 +310,7 @@ def test_pairwise_with_other(
tm.assert_index_equal(
result.index.levels[0], pairwise_frames.index, check_names=False
)
tm.assert_numpy_array_equal(
tm.assert_index_equal(
safe_sort(result.index.levels[1]),
safe_sort(pairwise_other_frame.columns.unique()),
)
Expand Down