Skip to content

Commit b573e10

Browse files
committed
Auto merge of #98553 - the8472:next_chunk_opt, r=Mark-Simulacrum
Optimized `vec::IntoIter::next_chunk` impl

```
x86_64v1, default
test vec::bench_next_chunk ... bench: 696 ns/iter (+/- 22)

x86_64v1, pr
test vec::bench_next_chunk ... bench: 309 ns/iter (+/- 4)

znver2, default
test vec::bench_next_chunk ... bench: 17,272 ns/iter (+/- 117)

znver2, pr
test vec::bench_next_chunk ... bench: 211 ns/iter (+/- 3)
```

On znver2 the default impl seems to be slow due to different inlining decisions. It goes through `core::array::iter_next_chunk`, which has a deep call tree.
2 parents 4d6d601 + 4ba7cac commit b573e10

File tree

6 files changed

+75
-2
lines changed

6 files changed

+75
-2
lines changed

library/alloc/benches/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// See https://github.com/rust-lang/rust/issues/73535#event-3477699747
33
#![cfg(not(target_os = "android"))]
44
#![feature(btree_drain_filter)]
5+
#![feature(iter_next_chunk)]
56
#![feature(map_first_last)]
67
#![feature(repr_simd)]
78
#![feature(slice_partition_dedup)]

library/alloc/benches/vec.rs

+20
Original file line numberDiff line numberDiff line change
@@ -762,3 +762,23 @@ fn bench_retain_whole_100000(b: &mut Bencher) {
762762
let mut v = black_box(vec![826u32; 100000]);
763763
b.iter(|| v.retain(|x| *x == 826u32));
764764
}
765+
766+
#[bench]
767+
fn bench_next_chunk(b: &mut Bencher) {
768+
let v = vec![13u8; 2048];
769+
770+
b.iter(|| {
771+
const CHUNK: usize = 8;
772+
773+
let mut sum = [0u32; CHUNK];
774+
let mut iter = black_box(v.clone()).into_iter();
775+
776+
while let Ok(chunk) = iter.next_chunk::<CHUNK>() {
777+
for i in 0..CHUNK {
778+
sum[i] += chunk[i] as u32;
779+
}
780+
}
781+
782+
sum
783+
})
784+
}

library/alloc/src/lib.rs

+4
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
#![feature(alloc_layout_extra)]
9090
#![feature(allocator_api)]
9191
#![feature(array_chunks)]
92+
#![feature(array_into_iter_constructors)]
9293
#![feature(array_methods)]
9394
#![feature(array_windows)]
9495
#![feature(assert_matches)]
@@ -117,8 +118,11 @@
117118
#![feature(hasher_prefixfree_extras)]
118119
#![feature(inplace_iteration)]
119120
#![feature(iter_advance_by)]
121+
#![feature(iter_next_chunk)]
120122
#![feature(layout_for_ptr)]
123+
#![feature(maybe_uninit_array_assume_init)]
121124
#![feature(maybe_uninit_slice)]
125+
#![feature(maybe_uninit_uninit_array)]
122126
#![cfg_attr(test, feature(new_uninit))]
123127
#![feature(nonnull_slice_from_raw_parts)]
124128
#![feature(pattern)]

library/alloc/src/vec/into_iter.rs

+39-2
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
use super::AsVecIntoIter;
33
use crate::alloc::{Allocator, Global};
44
use crate::raw_vec::RawVec;
5+
use core::array;
56
use core::fmt;
67
use core::intrinsics::arith_offset;
78
use core::iter::{
89
FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce,
910
};
1011
use core::marker::PhantomData;
11-
use core::mem::{self, ManuallyDrop};
12+
use core::mem::{self, ManuallyDrop, MaybeUninit};
1213
#[cfg(not(no_global_oom_handling))]
1314
use core::ops::Deref;
1415
use core::ptr::{self, NonNull};
@@ -124,7 +125,6 @@ impl<T, A: Allocator> IntoIter<T, A> {
124125
}
125126

126127
/// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed.
127-
#[cfg(not(no_global_oom_handling))]
128128
pub(crate) fn forget_remaining_elements(&mut self) {
129129
// Move `ptr` up to `end`; per the doc comment above, this skips dropping the remaining elements.
self.ptr = self.end;
130130
}
@@ -204,6 +204,43 @@ impl<T, A: Allocator> Iterator for IntoIter<T, A> {
204204
self.len()
205205
}
206206

207+
#[inline]
208+
fn next_chunk<const N: usize>(&mut self) -> Result<[T; N], core::array::IntoIter<T, N>> {
209+
let mut raw_ary = MaybeUninit::uninit_array();
210+
211+
let len = self.len();
212+
213+
if mem::size_of::<T>() == 0 {
214+
if len < N {
215+
self.forget_remaining_elements();
216+
// Safety: ZSTs can be conjured ex nihilo, only the amount has to be correct
217+
return Err(unsafe { array::IntoIter::new_unchecked(raw_ary, 0..len) });
218+
}
219+
220+
self.ptr = unsafe { arith_offset(self.ptr as *const i8, N as isize) as *mut T };
221+
// Safety: ditto
222+
return Ok(unsafe { MaybeUninit::array_assume_init(raw_ary) });
223+
}
224+
225+
if len < N {
226+
// Safety: `len` indicates that this many elements are available and we just checked that
227+
// it fits into the array.
228+
unsafe {
229+
ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, len);
230+
self.forget_remaining_elements();
231+
return Err(array::IntoIter::new_unchecked(raw_ary, 0..len));
232+
}
233+
}
234+
235+
// Safety: `len` is larger than the array size. Copy a fixed amount here to fully initialize
236+
// the array.
237+
return unsafe {
238+
ptr::copy_nonoverlapping(self.ptr, raw_ary.as_mut_ptr() as *mut T, N);
239+
self.ptr = self.ptr.add(N);
240+
Ok(MaybeUninit::array_assume_init(raw_ary))
241+
};
242+
}
243+
207244
unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item
208245
where
209246
Self: TrustedRandomAccessNoCoerce,

library/alloc/tests/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#![feature(binary_heap_as_slice)]
2828
#![feature(inplace_iteration)]
2929
#![feature(iter_advance_by)]
30+
#![feature(iter_next_chunk)]
3031
#![feature(round_char_boundary)]
3132
#![feature(slice_group_by)]
3233
#![feature(slice_partition_dedup)]

library/alloc/tests/vec.rs

+10
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use core::alloc::{Allocator, Layout};
2+
use core::iter::IntoIterator;
23
use core::ptr::NonNull;
34
use std::alloc::System;
45
use std::assert_matches::assert_matches;
@@ -930,6 +931,15 @@ fn test_into_iter_count() {
930931
assert_eq!([1, 2, 3].into_iter().count(), 3);
931932
}
932933

934+
/// Checks `next_chunk` on a `Vec<u8>` iterator: two full chunks whose sizes are inferred
/// from the expected arrays, then an explicitly sized request on the exhausted iterator.
#[test]
fn test_into_iter_next_chunk() {
    let mut bytes = b"lorem".to_vec().into_iter();

    // N is inferred as 2 from the array it is compared against.
    let head = bytes.next_chunk().unwrap();
    assert_eq!(head, [b'l', b'o']);

    // N is inferred as 3 from the array it is compared against.
    let tail = bytes.next_chunk().unwrap();
    assert_eq!(tail, [b'r', b'e', b'm']);

    // N is explicitly 4; nothing is left, so the error carries an empty remainder.
    let leftover = bytes.next_chunk::<4>().unwrap_err();
    assert_eq!(leftover.as_slice(), &[]);
}
942+
933943
#[test]
934944
fn test_into_iter_clone() {
935945
fn iter_equal<I: Iterator<Item = i32>>(it: I, slice: &[i32]) {

0 commit comments

Comments
 (0)