-
Notifications
You must be signed in to change notification settings - Fork 13.4k
Add optimize_for_size
variants for stable and unstable sort as well as select_nth_unstable
#129587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
bors
merged 9 commits into
rust-lang:master
from
Voultapher:opt-for-size-variants-of-sort-impls
Sep 24, 2024
Merged
Changes from 1 commit
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
13d7b54
Add binary-size optimized variants for stable and unstable sort as we…
Voultapher 89c9a29
Reduce code duplication by moving partition_lomuto_branchless_simple …
Voultapher 8756ba5
Convert cfg blocks to cfg_if
Voultapher 7815d77
Use last swap optimization in bubblesort
Voultapher 00eca77
Use simpler branchy swap logic in tiny merge sort
Voultapher f2d4198
Drop bubble_sort
Voultapher adb0e27
Shrink heapsort further by combining sift_down loops
Voultapher a0e4303
Select tiny sorts for 16-bit platforms
Voultapher 5439198
Use non-overlapping swap for inner heapsort loop
Voultapher File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,24 @@ | ||
//! This module contains the entry points for `slice::sort`. | ||
|
||
#[cfg(not(feature = "optimize_for_size"))] | ||
use crate::cmp; | ||
use crate::intrinsics; | ||
use crate::mem::{self, MaybeUninit, SizedTypeProperties}; | ||
#[cfg(not(feature = "optimize_for_size"))] | ||
use crate::slice::sort::shared::smallsort::{ | ||
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN, | ||
}; | ||
use crate::{cmp, intrinsics}; | ||
|
||
pub(crate) mod drift; | ||
pub(crate) mod merge; | ||
|
||
#[cfg(not(feature = "optimize_for_size"))] | ||
pub(crate) mod drift; | ||
#[cfg(not(feature = "optimize_for_size"))] | ||
pub(crate) mod quicksort; | ||
|
||
#[cfg(feature = "optimize_for_size")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This likely needs to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. |
||
pub(crate) mod tiny; | ||
|
||
/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. | ||
/// Design document: | ||
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md> | ||
|
@@ -30,25 +39,48 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less | |
return; | ||
} | ||
|
||
// More advanced sorting methods than insertion sort are faster if called in | ||
// a hot loop for small inputs, but for general-purpose code the small | ||
// binary size of insertion sort is more important. The instruction cache in | ||
// modern processors is very valuable, and for a single sort call in general | ||
// purpose code any gains from an advanced method are cancelled by i-cache | ||
// misses during the sort, and thrashing the i-cache for surrounding code. | ||
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; | ||
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { | ||
insertion_sort_shift_left(v, 1, is_less); | ||
return; | ||
#[cfg(not(feature = "optimize_for_size"))] | ||
{ | ||
// More advanced sorting methods than insertion sort are faster if called in | ||
// a hot loop for small inputs, but for general-purpose code the small | ||
// binary size of insertion sort is more important. The instruction cache in | ||
// modern processors is very valuable, and for a single sort call in general | ||
// purpose code any gains from an advanced method are cancelled by i-cache | ||
// misses during the sort, and thrashing the i-cache for surrounding code. | ||
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; | ||
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { | ||
insertion_sort_shift_left(v, 1, is_less); | ||
return; | ||
} | ||
|
||
driftsort_main::<T, F, BufT>(v, is_less); | ||
} | ||
|
||
driftsort_main::<T, F, BufT>(v, is_less); | ||
#[cfg(feature = "optimize_for_size")] | ||
{ | ||
let alloc_len = len / 2; | ||
|
||
// For small inputs 4KiB of stack storage suffices, which allows us to avoid | ||
// calling the (de-)allocator. Benchmarks showed this was quite beneficial. | ||
let mut stack_buf = AlignedStorage::<T, 4096>::new(); | ||
let stack_scratch = stack_buf.as_uninit_slice_mut(); | ||
let mut heap_buf; | ||
let scratch = if stack_scratch.len() >= alloc_len { | ||
stack_scratch | ||
} else { | ||
heap_buf = BufT::with_capacity(alloc_len); | ||
heap_buf.as_uninit_slice_mut() | ||
}; | ||
|
||
tiny::mergesort(v, scratch, is_less); | ||
} | ||
} | ||
|
||
/// See [`sort`] | ||
/// | ||
/// Deliberately don't inline the main sorting routine entrypoint to ensure the | ||
/// inlined insertion sort i-cache footprint remains minimal. | ||
#[cfg(not(feature = "optimize_for_size"))] | ||
#[inline(never)] | ||
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) { | ||
// By allocating n elements of memory we can ensure the entire input can | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
//! Binary-size optimized mergesort inspired by <https://github.com/voultapher/tiny-sort-rs>. | ||
|
||
use crate::mem::{ManuallyDrop, MaybeUninit}; | ||
use crate::ptr; | ||
use crate::slice::sort::stable::merge; | ||
|
||
/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever, | ||
/// no run detection, etc. | ||
/// | ||
/// Sorts `v` in place according to `is_less`. Slices of length 0 or 1 are left untouched, a | ||
/// slice of length 2 is handled with a single conditional swap, and longer slices are split at | ||
/// `len / 2`, sorted recursively and then merged. | ||
/// | ||
/// `scratch` is handed unchanged to both recursive calls and to `merge::merge`. | ||
/// NOTE(review): the minimum required `scratch` length is dictated by `merge::merge`, which is | ||
/// not visible here; the caller added in this change sizes it as `len / 2` -- confirm. | ||
#[inline(always)] | ||
pub fn mergesort<T, F: FnMut(&T, &T) -> bool>( | ||
v: &mut [T], | ||
scratch: &mut [MaybeUninit<T>], | ||
is_less: &mut F, | ||
) { | ||
let len = v.len(); | ||
|
||
if len > 2 { | ||
let mid = len / 2; | ||
|
||
// SAFETY: `mid = len / 2` is at most `len`, so both `..mid` and `mid..` are in-bounds | ||
// ranges of `v`. | ||
unsafe { | ||
// Sort the left half recursively. | ||
mergesort(v.get_unchecked_mut(..mid), scratch, is_less); | ||
// Sort the right half recursively. | ||
mergesort(v.get_unchecked_mut(mid..), scratch, is_less); | ||
} | ||
|
||
// Combine the two sorted halves `v[..mid]` and `v[mid..]`. | ||
merge::merge(v, scratch, mid, is_less); | ||
} else if len == 2 { | ||
// Branchless swap of the two elements. Handling this case directly reduces the recursion | ||
// depth and improves perf significantly at a small binary-size cost: it trades ~50 bytes | ||
// in the binary for a ~10% perf boost for integers. | ||
|
||
// SAFETY: We checked that `len == 2`, so positions 0 and 1 are both valid and the pointers | ||
// derived from them don't overlap. | ||
unsafe { | ||
swap_if_less(v.as_mut_ptr(), 0, 1, is_less); | ||
} | ||
} | ||
} | ||
|
||
/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the | ||
/// value at position `b_pos` is less than the one at position `a_pos`. | ||
/// | ||
/// # Safety | ||
/// | ||
/// The caller must guarantee that `v_base.add(a_pos)` and `v_base.add(b_pos)` both yield valid, | ||
/// properly aligned pointers to initialized values of `T` within the same allocation as `v_base`. | ||
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) | ||
Voultapher marked this conversation as resolved.
Show resolved
Hide resolved
|
||
where | ||
F: FnMut(&T, &T) -> bool, | ||
{ | ||
// SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield valid | ||
// pointers into `v_base`, and are properly aligned, and part of the same allocation. | ||
unsafe { | ||
let v_a = v_base.add(a_pos); | ||
let v_b = v_base.add(b_pos); | ||
|
||
// PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be | ||
// in a well defined state, without duplicates. | ||
|
||
// Important: only swap if the value at `b_pos` is strictly less than the one at `a_pos`, not | ||
// if they are equal. is_less should return false for equal elements, so we don't swap them. | ||
let should_swap = is_less(&*v_b, &*v_a); | ||
|
||
// This is a branchless version of swap if. | ||
// The equivalent code with a branch would be: | ||
// | ||
// if should_swap { | ||
// ptr::swap(v_a, v_b); | ||
// } | ||
|
||
// The goal is to generate cmov instructions here. | ||
// `left_swap` points at the value that must end up at `v_a`, `right_swap` at the value that | ||
// must end up at `v_b`. | ||
let left_swap = if should_swap { v_b } else { v_a }; | ||
let right_swap = if should_swap { v_a } else { v_b }; | ||
|
||
// Save the future right value before `v_a` is overwritten. `ManuallyDrop` prevents the | ||
// temporary bitwise copy from being dropped, the value stays owned by the slice. | ||
let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); | ||
// `ptr::copy` rather than `copy_nonoverlapping`: when no swap happens `left_swap` is `v_a` | ||
// itself, so source and destination are the same location. | ||
ptr::copy(left_swap, v_a, 1); | ||
// The temporary is a distinct local, so it can never overlap with `v_b`. | ||
ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); | ||
} | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.