Auto merge of rust-lang#114860 - Zoxc:sharded-layout, r=SparrowLii

bors · bors · commit 840ed5d133e7 · 2023-08-24T02:24:25.000Z
Make `Sharded` an enum and specialize it for the single thread case

This turns `Sharded` into an enum with a dedicated single-shard variant that is used when the compiler runs with only one thread, reducing the size of `Sharded` for greater cache efficiency.

Performance improvement with 1 thread and `cfg(parallel_compiler)`:
<table>
<tr><td rowspan="2">Benchmark</td><td colspan="1"><b>Before</b></td><td colspan="2"><b>After</b></td></tr>
<tr><td align="right">Time</td><td align="right">Time</td><td align="right">%</td></tr>
<tr><td>🟣 <b>clap</b>:check</td><td align="right">1.7009s</td><td align="right">1.6748s</td><td align="right">💚  -1.53%</td></tr>
<tr><td>🟣 <b>hyper</b>:check</td><td align="right">0.2525s</td><td align="right">0.2451s</td><td align="right">💚  -2.90%</td></tr>
<tr><td>🟣 <b>regex</b>:check</td><td align="right">0.9519s</td><td align="right">0.9353s</td><td align="right">💚  -1.74%</td></tr>
<tr><td>🟣 <b>syn</b>:check</td><td align="right">1.5504s</td><td align="right">1.5280s</td><td align="right">💚  -1.45%</td></tr>
<tr><td>🟣 <b>syntex_syntax</b>:check</td><td align="right">5.9536s</td><td align="right">5.8873s</td><td align="right">💚  -1.11%</td></tr>
<tr><td>Total</td><td align="right">10.4092s</td><td align="right">10.2706s</td><td align="right">💚  -1.33%</td></tr>
<tr><td>Summary</td><td align="right">1.0000s</td><td align="right">0.9825s</td><td align="right">💚  -1.75%</td></tr>
</table>

I did see an unexpected 0.23% change for the serial compiler, so this could use a perf run to see if that reproduces.

cc `@SparrowLii`
diff --git a/compiler/rustc_data_structures/src/sharded.rs b/compiler/rustc_data_structures/src/sharded.rs
@@ -1,31 +1,26 @@
 use crate::fx::{FxHashMap, FxHasher};
 #[cfg(parallel_compiler)]
-use crate::sync::is_dyn_thread_safe;
-use crate::sync::{CacheAligned, Lock, LockGuard};
+use crate::sync::{is_dyn_thread_safe, CacheAligned};
+use crate::sync::{Lock, LockGuard};
 use std::borrow::Borrow;
 use std::collections::hash_map::RawEntryMut;
 use std::hash::{Hash, Hasher};
 use std::mem;
 
-#[cfg(parallel_compiler)]
 // 32 shards is sufficient to reduce contention on an 8-core Ryzen 7 1700,
 // but this should be tested on higher core count CPUs. How the `Sharded` type gets used
 // may also affect the ideal number of shards.
 const SHARD_BITS: usize = 5;
 
-#[cfg(not(parallel_compiler))]
-const SHARD_BITS: usize = 0;
-
-pub const SHARDS: usize = 1 << SHARD_BITS;
+#[cfg(parallel_compiler)]
+const SHARDS: usize = 1 << SHARD_BITS;
 
 /// An array of cache-line aligned inner locked structures with convenience methods.
-pub struct Sharded<T> {
-    /// This mask is used to ensure that accesses are inbounds of `shards`.
-    /// When dynamic thread safety is off, this field is set to 0 causing only
-    /// a single shard to be used for greater cache efficiency.
+/// A single field is used when the compiler uses only one thread.
+pub enum Sharded<T> {
+    Single(Lock<T>),
     #[cfg(parallel_compiler)]
-    mask: usize,
-    shards: [CacheAligned<Lock<T>>; SHARDS],
+    Shards(Box<[CacheAligned<Lock<T>>; SHARDS]>),
 }
 
 impl<T: Default> Default for Sharded<T> {
@@ -38,35 +33,24 @@ impl<T: Default> Default for Sharded<T> {
 impl<T> Sharded<T> {
     #[inline]
     pub fn new(mut value: impl FnMut() -> T) -> Self {
-        Sharded {
-            #[cfg(parallel_compiler)]
-            mask: if is_dyn_thread_safe() { SHARDS - 1 } else { 0 },
-            shards: [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))),
-        }
-    }
-
-    #[inline(always)]
-    fn mask(&self) -> usize {
         #[cfg(parallel_compiler)]
-        {
-            if SHARDS == 1 { 0 } else { self.mask }
-        }
-        #[cfg(not(parallel_compiler))]
-        {
-            0
+        if is_dyn_thread_safe() {
+            return Sharded::Shards(Box::new(
+                [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))),
+            ));
         }
-    }
 
-    #[inline(always)]
-    fn count(&self) -> usize {
-        // `self.mask` is always one below the used shard count
-        self.mask() + 1
+        Sharded::Single(Lock::new(value()))
     }
 
     /// The shard is selected by hashing `val` with `FxHasher`.
     #[inline]
-    pub fn get_shard_by_value<K: Hash + ?Sized>(&self, val: &K) -> &Lock<T> {
-        self.get_shard_by_hash(if SHARDS == 1 { 0 } else { make_hash(val) })
+    pub fn get_shard_by_value<K: Hash + ?Sized>(&self, _val: &K) -> &Lock<T> {
+        match self {
+            Self::Single(single) => &single,
+            #[cfg(parallel_compiler)]
+            Self::Shards(..) => self.get_shard_by_hash(make_hash(_val)),
+        }
     }
 
     #[inline]
@@ -75,20 +59,44 @@ impl<T> Sharded<T> {
     }
 
     #[inline]
-    pub fn get_shard_by_index(&self, i: usize) -> &Lock<T> {
-        // SAFETY: The index get ANDed with the mask, ensuring it is always inbounds.
-        unsafe { &self.shards.get_unchecked(i & self.mask()).0 }
+    pub fn get_shard_by_index(&self, _i: usize) -> &Lock<T> {
+        match self {
+            Self::Single(single) => &single,
+            #[cfg(parallel_compiler)]
+            Self::Shards(shards) => {
+                // SAFETY: The index gets ANDed with the shard mask, ensuring it is always inbounds.
+                unsafe { &shards.get_unchecked(_i & (SHARDS - 1)).0 }
+            }
+        }
     }
 
     pub fn lock_shards(&self) -> Vec<LockGuard<'_, T>> {
-        (0..self.count()).map(|i| self.get_shard_by_index(i).lock()).collect()
+        match self {
+            Self::Single(single) => vec![single.lock()],
+            #[cfg(parallel_compiler)]
+            Self::Shards(shards) => shards.iter().map(|shard| shard.0.lock()).collect(),
+        }
     }
 
     pub fn try_lock_shards(&self) -> Option<Vec<LockGuard<'_, T>>> {
-        (0..self.count()).map(|i| self.get_shard_by_index(i).try_lock()).collect()
+        match self {
+            Self::Single(single) => Some(vec![single.try_lock()?]),
+            #[cfg(parallel_compiler)]
+            Self::Shards(shards) => shards.iter().map(|shard| shard.0.try_lock()).collect(),
+        }
     }
 }
 
+#[inline]
+pub fn shards() -> usize {
+    #[cfg(parallel_compiler)]
+    if is_dyn_thread_safe() {
+        return SHARDS;
+    }
+
+    1
+}
+
 pub type ShardedHashMap<K, V> = Sharded<FxHashMap<K, V>>;
 
 impl<K: Eq, V> ShardedHashMap<K, V> {
diff --git a/compiler/rustc_query_system/src/dep_graph/graph.rs b/compiler/rustc_query_system/src/dep_graph/graph.rs
@@ -1166,7 +1166,7 @@ impl<K: DepKind> CurrentDepGraph<K> {
             )),
             new_node_to_index: Sharded::new(|| {
                 FxHashMap::with_capacity_and_hasher(
-                    new_node_count_estimate / sharded::SHARDS,
+                    new_node_count_estimate / sharded::shards(),
                     Default::default(),
                 )
             }),

Original file line number	Diff line number	Diff line change
`@@ -1166,7 +1166,7 @@ impl<K: DepKind> CurrentDepGraph<K> {`
`1166`	`1166`	`)),`
`1167`	`1167`	`new_node_to_index: Sharded::new(\|\| {`
`1168`	`1168`	`FxHashMap::with_capacity_and_hasher(`
`1169`		`- new_node_count_estimate / sharded::SHARDS,`
	`1169`	`+ new_node_count_estimate / sharded::shards(),`
`1170`	`1170`	`Default::default(),`
`1171`	`1171`	`)`
`1172`	`1172`	`}),`