Skip to content

Commit c9b42ce

Browse files
committed
Replace division implementations with code from the specialized-div-rem crate.
Puts the asymmetric division behind a feature flag. Makes `asymmetric-asm` a default feature.
1 parent 6de4f8f commit c9b42ce

File tree

9 files changed

+1165
-314
lines changed

9 files changed

+1165
-314
lines changed

Cargo.toml

+4-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ cc = { optional = true, version = "1.0" }
3838
panic-handler = { path = 'crates/panic-handler' }
3939

4040
[features]
41-
default = ["compiler-builtins"]
41+
default = ["compiler-builtins", "asymmetric-asm"]
4242

4343
# Enable compilation of C code in compiler-rt, filling in some more optimized
4444
# implementations and also filling in unimplemented intrinsics
@@ -60,6 +60,9 @@ no-lang-items = []
6060
# Only used in the compiler's build system
6161
rustc-dep-of-std = ['compiler-builtins', 'core']
6262

63+
# Used for faster u128 division on x86_64
64+
asymmetric-asm = []
65+
6366
[[example]]
6467
name = "intrinsics"
6568
required-features = ["compiler-builtins"]

src/int/mod.rs

+3-13
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,11 @@
11
use core::ops;
22

3-
macro_rules! hty {
4-
($ty:ty) => {
5-
<$ty as LargeInt>::HighHalf
6-
};
7-
}
8-
9-
macro_rules! os_ty {
10-
($ty:ty) => {
11-
<$ty as Int>::OtherSign
12-
};
13-
}
14-
153
pub mod addsub;
164
pub mod mul;
17-
pub mod sdiv;
185
pub mod shift;
6+
7+
pub mod sdiv;
8+
mod specialized_div_rem;
199
pub mod udiv;
2010

2111
/// Trait for some basic operations on integers

src/int/sdiv.rs

+35-75
Original file line numberDiff line numberDiff line change
@@ -1,101 +1,61 @@
1-
use int::Int;
1+
use super::specialized_div_rem::*;
22

3-
trait Div: Int {
4-
/// Returns `a / b`
5-
fn div(self, other: Self) -> Self {
6-
let s_a = self >> (Self::BITS - 1);
7-
let s_b = other >> (Self::BITS - 1);
8-
// NOTE it's OK to overflow here because of the `.unsigned()` below.
9-
// This whole operation is computing the absolute value of the inputs
10-
// So some overflow will happen when dealing with e.g. `i64::MIN`
11-
// where the absolute value is `(-i64::MIN) as u64`
12-
let a = (self ^ s_a).wrapping_sub(s_a);
13-
let b = (other ^ s_b).wrapping_sub(s_b);
14-
let s = s_a ^ s_b;
15-
16-
let r = a.unsigned().aborting_div(b.unsigned());
17-
(Self::from_unsigned(r) ^ s) - s
18-
}
19-
}
20-
21-
impl Div for i32 {}
22-
impl Div for i64 {}
23-
impl Div for i128 {}
24-
25-
trait Mod: Int {
26-
/// Returns `a % b`
27-
fn mod_(self, other: Self) -> Self {
28-
let s = other >> (Self::BITS - 1);
29-
// NOTE(wrapping_sub) see comment in the `div`
30-
let b = (other ^ s).wrapping_sub(s);
31-
let s = self >> (Self::BITS - 1);
32-
let a = (self ^ s).wrapping_sub(s);
33-
34-
let r = a.unsigned().aborting_rem(b.unsigned());
35-
(Self::from_unsigned(r) ^ s) - s
36-
}
37-
}
38-
39-
impl Mod for i32 {}
40-
impl Mod for i64 {}
41-
impl Mod for i128 {}
42-
43-
trait Divmod: Int {
44-
/// Returns `a / b` and sets `*rem = n % d`
45-
fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
46-
where
47-
F: Fn(Self, Self) -> Self,
48-
{
49-
let r = div(self, other);
50-
// NOTE won't overflow because it's using the result from the
51-
// previous division
52-
*rem = self - r.wrapping_mul(other);
53-
r
54-
}
55-
}
56-
57-
impl Divmod for i32 {}
58-
impl Divmod for i64 {}
3+
// NOTE: there are aborts inside the specialized_div_rem functions if division by 0
4+
// is encountered, however these should be unreachable and optimized away unless
5+
// uses of `std/core::intrinsics::unchecked_div/rem` do not have a 0 check in front
6+
// of them.
597

608
intrinsics! {
619
#[maybe_use_optimized_c_shim]
6210
#[arm_aeabi_alias = __aeabi_idiv]
11+
/// Returns `n / d`
6312
pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
64-
a.div(b)
13+
i32_div_rem(a, b).0
6514
}
6615

6716
#[maybe_use_optimized_c_shim]
68-
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
69-
a.div(b)
17+
/// Returns `n % d`
18+
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
19+
i32_div_rem(a, b).1
7020
}
7121

72-
#[win64_128bit_abi_hack]
73-
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
74-
a.div(b)
22+
#[maybe_use_optimized_c_shim]
23+
/// Returns `n / d` and sets `*rem = n % d`
24+
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
25+
let quo_rem = i32_div_rem(a, b);
26+
*rem = quo_rem.1;
27+
quo_rem.0
7528
}
7629

7730
#[maybe_use_optimized_c_shim]
78-
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
79-
a.mod_(b)
31+
/// Returns `n / d`
32+
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
33+
i64_div_rem(a, b).0
8034
}
8135

8236
#[maybe_use_optimized_c_shim]
37+
/// Returns `n % d`
8338
pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
84-
a.mod_(b)
39+
i64_div_rem(a, b).1
8540
}
8641

87-
#[win64_128bit_abi_hack]
88-
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
89-
a.mod_(b)
42+
#[aapcs_on_arm]
43+
/// Returns `n / d` and sets `*rem = n % d`
44+
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
45+
let quo_rem = i64_div_rem(a, b);
46+
*rem = quo_rem.1;
47+
quo_rem.0
9048
}
9149

92-
#[maybe_use_optimized_c_shim]
93-
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
94-
a.divmod(b, rem, |a, b| __divsi3(a, b))
50+
#[win64_128bit_abi_hack]
51+
/// Returns `n / d`
52+
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
53+
i128_div_rem(a, b).0
9554
}
9655

97-
#[aapcs_on_arm]
98-
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
99-
a.divmod(b, rem, |a, b| __divdi3(a, b))
56+
#[win64_128bit_abi_hack]
57+
/// Returns `n % d`
58+
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
59+
i128_div_rem(a, b).1
10060
}
10161
}
+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
macro_rules! impl_asymmetric {
2+
(
3+
$unsigned_name:ident, // name of the unsigned function
4+
$signed_name:ident, // name of the signed function
5+
$half_division:ident, // function for division of a $uX by a $uX
6+
$asymmetric_division:ident, // function for division of a $uD by a $uX
7+
$n_h:expr, // the number of bits in $iH or $uH
8+
$uH:ident, // unsigned integer with half the bit width of $uX
9+
$uX:ident, // unsigned integer with half the bit width of $uD
10+
$uD:ident, // unsigned integer with double the bit width of $uX
11+
$iD:ident, // signed version of $uD
12+
$($unsigned_attr:meta),*; // attributes for the unsigned function
13+
$($signed_attr:meta),* // attributes for the signed function
14+
) => {
15+
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
16+
/// tuple.
17+
///
18+
/// This is optimized for dividing integers with the same bitwidth as the largest operand in
19+
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
20+
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
21+
///
22+
/// # Panics
23+
///
24+
/// When attempting to divide by zero, this function will panic.
25+
$(
26+
#[$unsigned_attr]
27+
)*
28+
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
29+
#[inline(always)]
30+
fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
31+
let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
32+
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
33+
}
34+
#[inline(always)]
35+
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
36+
let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
37+
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
38+
}
39+
40+
let n: u32 = $n_h * 2;
41+
42+
// Many of these subalgorithms are taken from trifecta.rs, see that for better
43+
// documentation
44+
45+
let duo_lo = duo as $uX;
46+
let duo_hi = (duo >> n) as $uX;
47+
let div_lo = div as $uX;
48+
let div_hi = (div >> n) as $uX;
49+
if div_hi == 0 {
50+
if div_lo == 0 {
51+
// division by zero
52+
::abort();
53+
}
54+
if duo_hi < div_lo {
55+
// plain $uD by $uX division that will fit into $uX
56+
let tmp = unsafe { $asymmetric_division(duo, div_lo) };
57+
return (tmp.0 as $uD, tmp.1 as $uD)
58+
} else if (div_lo >> $n_h) == 0 {
59+
// Short division of $uD by a $uH.
60+
let div_0 = div_lo as $uH as $uX;
61+
let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
62+
63+
let duo_mid =
64+
((duo >> $n_h) as $uH as $uX)
65+
| (rem_3 << $n_h);
66+
let (quo_1, rem_2) = $half_division(duo_mid, div_0);
67+
68+
let duo_lo =
69+
(duo as $uH as $uX)
70+
| (rem_2 << $n_h);
71+
let (quo_0, rem_1) = $half_division(duo_lo, div_0);
72+
73+
return (
74+
(quo_0 as $uD)
75+
| ((quo_1 as $uD) << $n_h)
76+
| ((quo_hi as $uD) << n),
77+
rem_1 as $uD
78+
)
79+
} else {
80+
// Short division using the $uD by $uX division
81+
let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
82+
let tmp = unsafe {
83+
$asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
84+
};
85+
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
86+
}
87+
}
88+
89+
let duo_lz = duo_hi.leading_zeros();
90+
let div_lz = div_hi.leading_zeros();
91+
let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
92+
if rel_leading_sb < $n_h {
93+
// Some x86_64 CPUs have bad `divq` implementations that make putting
94+
// a `mul` or `mul - 1` algorithm here beneficial
95+
let shift = n.wrapping_sub(duo_lz);
96+
let duo_sig_n = (duo >> shift) as $uX;
97+
let div_sig_n = (div >> shift) as $uX;
98+
let mul = $half_division(duo_sig_n, div_sig_n).0;
99+
let div_lo = div as $uX;
100+
let div_hi = (div >> n) as $uX;
101+
let (tmp_lo, carry) = carrying_mul(mul,div_lo);
102+
let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry);
103+
let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
104+
if ((overflow & 1) != 0) || (duo < tmp) {
105+
return (
106+
mul.wrapping_sub(1) as $uD,
107+
duo.wrapping_add(div.wrapping_sub(tmp))
108+
)
109+
} else {
110+
return (
111+
mul as $uD,
112+
duo.wrapping_sub(tmp)
113+
)
114+
}
115+
} else {
116+
// This has been adapted from
117+
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
118+
// adapted from www.hackersdelight.org
119+
120+
// This is similar to the `mul` or `mul - 1` algorithm in that it uses only more
121+
// significant parts of `duo` and `div` to divide a large integer with a smaller
122+
// division instruction.
123+
let tmp = unsafe {
124+
$asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX)
125+
};
126+
let mut quo = tmp.0 >> ((n - 1) - div_lz);
127+
if quo != 0 {
128+
quo -= 1;
129+
}
130+
// Note that this is a large $uD multiplication being used here
131+
let mut rem = duo - ((quo as $uD) * div);
132+
133+
if rem >= div {
134+
quo += 1;
135+
rem -= div;
136+
}
137+
return (quo as $uD, rem)
138+
}
139+
}
140+
141+
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
142+
/// tuple.
143+
///
144+
/// This is optimized for dividing integers with the same bitwidth as the largest operand in
145+
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction
146+
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits.
147+
///
148+
/// # Panics
149+
///
150+
/// When attempting to divide by zero, this function will panic.
151+
$(
152+
#[$signed_attr]
153+
)*
154+
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) {
155+
match (duo < 0, div < 0) {
156+
(false,false) => {
157+
let t = $unsigned_name(duo as $uD,div as $uD);
158+
(t.0 as $iD,t.1 as $iD)
159+
},
160+
(true,false) => {
161+
let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD);
162+
((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg())
163+
},
164+
(false,true) => {
165+
let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD);
166+
((t.0 as $iD).wrapping_neg(),t.1 as $iD)
167+
},
168+
(true,true) => {
169+
let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD);
170+
(t.0 as $iD,(t.1 as $iD).wrapping_neg())
171+
},
172+
}
173+
}
174+
}
175+
}

0 commit comments

Comments (0)