@@ -1594,11 +1594,10 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
     use intrinsics::{
-        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
+        cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
+        wrapping_add, wrapping_mul, wrapping_sub,
     };

-    let addr = p.addr();
-
     /// Calculate multiplicative modular inverse of `x` modulo `m`.
     ///
     /// This implementation is tailored for `align_offset` and has the following preconditions:
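
The preconditions and the body of `mod_inv` fall in the lines elided between these hunks. As a rough standalone sketch of the idea behind it, and not the exact `core` implementation: the inverse of an odd `x` modulo a power of two `m` can be computed by Hensel lifting (a Newton-style iteration), where each step doubles the number of correct low bits.

```rust
/// Sketch only (hypothetical helper, not the `core` code): multiplicative
/// inverse of an odd `x` modulo a power-of-two `m`.
fn mod_inv_sketch(x: usize, m: usize) -> usize {
    assert!(m.is_power_of_two() && x % 2 == 1);
    let mut inv = 1usize; // x * 1 ≡ 1 (mod 2) holds for any odd x
    let mut correct_bits = 1;
    while correct_bits < usize::BITS {
        // If `inv` inverts `x` mod 2^k, this update makes it invert `x` mod 2^(2k).
        inv = inv.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inv)));
        correct_bits *= 2;
    }
    inv & (m - 1) // reduce the full-width inverse modulo `m`
}

fn main() {
    assert_eq!(mod_inv_sketch(3, 16), 11); // 3 * 11 == 33 ≡ 1 (mod 16)
}
```
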
@@ -1648,36 +1647,61 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         }
     }

+    let addr = p.addr();
     let stride = mem::size_of::<T>();
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
-    if stride == 1 {
-        // `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
-        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
+
+    if stride == 0 {
+        // SPECIAL_CASE: handle 0-sized types. No matter how many times we step, the address will
+        // stay the same, so no offset will be able to align the pointer unless it is already
+        // aligned. This branch _will_ be optimized out as `stride` is known at compile-time.
+        let p_mod_a = addr & a_minus_one;
+        return if p_mod_a == 0 { 0 } else { usize::MAX };
+    }
+
+    // SAFETY: `stride == 0` case has been handled by the special case above.
+    let a_mod_stride = unsafe { unchecked_rem(a, stride) };
+    if a_mod_stride == 0 {
+        // SPECIAL_CASE: In cases where `a` is divisible by `stride`, the byte offset to align a
+        // pointer can be computed more simply through `-p (mod a)`. In the off-chance the byte
+        // offset is not a multiple of `stride`, the input pointer was misaligned and no pointer
+        // offset will be able to produce a `p` aligned to the specified `a`.
         //
-        //    round_up_to_next_alignment(p, a) - p
+        // The naive `-p (mod a)` equation inhibits LLVM's ability to select instructions
+        // like `lea`. We compute `(round_up_to_next_alignment(p, a) - p)` instead. This
+        // redistributes operations around the load-bearing, but pessimizing `and` instruction
+        // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
         //
-        // which distributes operations around the load-bearing, but pessimizing `and` sufficiently
-        // for LLVM to be able to utilize the various optimizations it knows about.
-        return wrapping_sub(wrapping_add(addr, a_minus_one) & wrapping_sub(0, a), addr);
-    }
+        // LLVM handles the branch here particularly nicely. If this branch needs to be evaluated
+        // at runtime, it will produce a mask `if addr_mod_stride == 0 { 0 } else { usize::MAX }`
+        // in a branch-free way and then bitwise-OR it with whatever result the `-p mod a`
+        // computation produces.
+
+        // SAFETY: `stride == 0` case has been handled by the special case above.
+        let addr_mod_stride = unsafe { unchecked_rem(addr, stride) };

-    let pmoda = addr & a_minus_one;
-    if pmoda == 0 {
-        // Already aligned. Yay!
-        return 0;
-    } else if stride == 0 {
-        // If the pointer is not aligned, and the element is zero-sized, then no amount of
-        // elements will ever align the pointer.
-        return usize::MAX;
+        return if addr_mod_stride == 0 {
+            let aligned_address = wrapping_add(addr, a_minus_one) & wrapping_sub(0, a);
+            let byte_offset = wrapping_sub(aligned_address, addr);
+            // SAFETY: `stride` is non-zero. This is guaranteed to divide exactly as well, because
+            // `addr` has been verified to be aligned to the original type’s alignment requirements.
+            unsafe { exact_div(byte_offset, stride) }
+        } else {
+            usize::MAX
+        };
     }

-    let smoda = stride & a_minus_one;
+    // GENERAL_CASE: From here on we’re handling the very general case where `addr` may be
+    // misaligned, there isn’t an obvious relationship between `stride` and `a` that we can take
+    // advantage of, etc. This case produces machine code that isn’t particularly high quality,
+    // compared to the special cases above. The code produced here is still within the realm of
+    // miracles, given the situations this case has to deal with.
+
     // SAFETY: a is power-of-two hence non-zero. stride == 0 case is handled above.
-    let gcdpow = unsafe { intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a)) };
+    let gcdpow = unsafe { cttz_nonzero(stride).min(cttz_nonzero(a)) };
     // SAFETY: gcdpow has an upper-bound that’s at most the number of bits in a usize.
     let gcd = unsafe { unchecked_shl(1usize, gcdpow) };
-
     // SAFETY: gcd is always greater or equal to 1.
     if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
         // This branch solves for the following linear congruence equation:
@@ -1693,14 +1717,13 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         // ` p' + s'o = 0 mod a' `
         // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
         //
-        // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the second
-        // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`" (again
-        // divided by `g`).
-        // Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
-        // co-prime.
+        // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the
+        // second term is "how does incrementing `p` by `s` bytes change the relative alignment of
+        // `p`" (again divided by `g`). Division by `g` is necessary to make the inverse well
+        // formed if `a` and `s` are not co-prime.
         //
         // Furthermore, the result produced by this solution is not "minimal", so it is necessary
-        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just a `a'`.
+        // to take the result `o mod lcm(s, a)`. This `lcm(s, a)` is the same as `a'`.

         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`.
@@ -1710,11 +1733,11 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let a2minus1 = unsafe { unchecked_sub(a2, 1) };
         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`.
-        let s2 = unsafe { unchecked_shr(smoda, gcdpow) };
+        let s2 = unsafe { unchecked_shr(stride & a_minus_one, gcdpow) };
         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
         // always be strictly greater than `(p % a) >> gcdpow`.
-        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(pmoda, gcdpow)) };
+        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(addr & a_minus_one, gcdpow)) };
         // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
         // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
         return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
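
Both SPECIAL_CASE branches introduced by this patch boil down to plain integer arithmetic. As an illustration only (a hypothetical function in safe Rust, with stable methods standing in for the unchecked intrinsics), the same logic can be written as:

```rust
/// Illustration of the two SPECIAL_CASE branches above, in safe Rust.
/// `addr` is the pointer's address, `stride` the element size, and `a` the
/// requested alignment (a power of two). Returns `None` for the general case.
fn align_offset_special_cases(addr: usize, stride: usize, a: usize) -> Option<usize> {
    assert!(a.is_power_of_two());
    let a_minus_one = a - 1;
    if stride == 0 {
        // Zero-sized types: stepping never changes the address, so only an
        // already-aligned pointer can ever be aligned.
        return Some(if addr & a_minus_one == 0 { 0 } else { usize::MAX });
    }
    if a % stride == 0 {
        return Some(if addr % stride == 0 {
            // Round `addr` up to the next multiple of `a`; `!a_minus_one` is
            // the same mask the patch computes as `wrapping_sub(0, a)`.
            let aligned_address = addr.wrapping_add(a_minus_one) & !a_minus_one;
            // Convert the byte offset into an element count; this divides
            // exactly because `addr` is a multiple of `stride`.
            aligned_address.wrapping_sub(addr) / stride
        } else {
            usize::MAX
        });
    }
    None // fall through to the GENERAL_CASE branch
}
```

For example, `align_offset_special_cases(20, 4, 8)` rounds address 20 up to 24 and returns an element offset of 1.
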
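For the GENERAL_CASE branch, a concrete instance may help (hypothetical numbers, not from the patch): take `stride = 6`, `a = 8`, and `addr = 2`. Then `gcdpow = min(cttz(6), cttz(8)) = 1` and `gcd = 2`; since `addr` is divisible by `gcd`, the reduced congruence ` p' + s'o = 0 mod a' ` becomes ` 1 + 3o = 0 mod 4 `, giving `o = (4 - 1) * mod_inv(3, 4) mod 4 = 3 * 3 mod 4 = 1`, and indeed `2 + 1 * 6 = 8` is 8-aligned. The stable `align_offset` method exercises this path and can serve as a sanity check; note that this implementation returns the minimal offset, while the documentation only promises some aligning offset or `usize::MAX`:

```rust
fn main() {
    // A 6-byte element type with alignment 1, placed at address 2.
    // `align_offset` never dereferences, so a made-up address is fine here.
    let p = 2usize as *const [u8; 6];
    assert_eq!(p.align_offset(8), 1); // 2 + 1 * 6 == 8
}
```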