
Commit 56ae4c2

fix division on SPARC (rust-lang#393)
1 parent ca1b344 commit 56ae4c2

File tree

3 files changed: +190 -27 lines

src/int/specialized_div_rem/delegate.rs

+130
@@ -185,3 +185,133 @@ macro_rules! impl_delegate {
         }
     };
 }
+
+/// Returns `n / d` and sets `*rem = n % d`.
+///
+/// This specialization exists because:
+/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
+///   so we have to use an old fashioned `&mut u128` argument to return the remainder.
+/// - 64-bit SPARC does not have u64 * u64 => u128 widening multiplication, which makes the
+///   delegate algorithm strategy the only reasonably fast way to perform `u128` division.
+#[doc(hidden)]
+pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
+    use super::*;
+    let duo_lo = duo as u64;
+    let duo_hi = (duo >> 64) as u64;
+    let div_lo = div as u64;
+    let div_hi = (div >> 64) as u64;
+
+    match (div_lo == 0, div_hi == 0, duo_hi == 0) {
+        (true, true, _) => zero_div_fn(),
+        (_, false, true) => {
+            *rem = duo;
+            return 0;
+        }
+        (false, true, true) => {
+            let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
+            *rem = tmp.1 as u128;
+            return tmp.0 as u128;
+        }
+        (false, true, false) => {
+            if duo_hi < div_lo {
+                let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
+                let shl = if norm_shift == 0 {
+                    64 - 1
+                } else {
+                    64 - norm_shift
+                };
+
+                let mut div: u128 = div << shl;
+                let mut pow_lo: u64 = 1 << shl;
+                let mut quo_lo: u64 = 0;
+                let mut duo = duo;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (quo_lo | tmp.0) as u128;
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            } else if duo_hi == div_lo {
+                let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
+                *rem = tmp.1 as u128;
+                return (1 << 64) | (tmp.0 as u128);
+            } else {
+                if (div_lo >> 32) == 0 {
+                    let div_0 = div_lo as u32 as u64;
+                    let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);
+
+                    let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
+                    let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);
+
+                    let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
+                    let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);
+
+                    *rem = rem_1 as u128;
+                    return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
+                }
+
+                let duo_lo = duo as u64;
+                let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
+                let quo_hi = tmp.0;
+                let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
+                if duo < div {
+                    *rem = duo;
+                    return (quo_hi as u128) << 64;
+                }
+
+                let mut div: u128 = div << (64 - 1);
+                let mut pow_lo: u64 = 1 << (64 - 1);
+                let mut quo_lo: u64 = 0;
+                loop {
+                    let sub = duo.wrapping_sub(div);
+                    if 0 <= (sub as i128) {
+                        duo = sub;
+                        quo_lo |= pow_lo;
+                        let duo_hi = (duo >> 64) as u64;
+                        if duo_hi == 0 {
+                            let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
+                            *rem = tmp.1 as u128;
+                            return (tmp.0 as u128) | (quo_lo as u128) | ((quo_hi as u128) << 64);
+                        }
+                    }
+                    div >>= 1;
+                    pow_lo >>= 1;
+                }
+            }
+        }
+        (_, false, false) => {
+            if duo < div {
+                *rem = duo;
+                return 0;
+            }
+            let div_original = div;
+            let shl = u64_normalization_shift(duo_hi, div_hi, false);
+            let mut duo = duo;
+            let mut div: u128 = div << shl;
+            let mut pow_lo: u64 = 1 << shl;
+            let mut quo_lo: u64 = 0;
+            loop {
+                let sub = duo.wrapping_sub(div);
+                if 0 <= (sub as i128) {
+                    duo = sub;
+                    quo_lo |= pow_lo;
+                    if duo < div_original {
+                        *rem = duo;
+                        return quo_lo as u128;
+                    }
+                }
+                div >>= 1;
+                pow_lo >>= 1;
+            }
+        }
+    }
+}
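The heart of `u128_divide_sparc` is a restoring shift-subtract loop: normalize the divisor so its highest set bit lines up with the dividend's, then repeatedly attempt a subtraction, recording a quotient bit on success and shifting the divisor right one place either way. Below is a minimal sketch of that loop on `u64`, illustrative only and not part of the commit (the real function additionally bails out to hardware division once the upper half of `duo` clears):

// Restoring binary long division: returns (quotient, remainder).
// Assumes only that `div != 0`.
fn div_rem_binary(duo: u64, div: u64) -> (u64, u64) {
    assert!(div != 0);
    if duo < div {
        return (0, duo);
    }
    // Normalize: align the divisor's MSB with the dividend's MSB.
    // `duo >= div` guarantees this subtraction cannot underflow.
    let shl = div.leading_zeros() - duo.leading_zeros();
    let mut duo = duo;
    let mut div = div << shl;
    let mut pow: u64 = 1 << shl;
    let mut quo: u64 = 0;
    loop {
        // Mirrors the `0 <= (sub as i128)` test in `u128_divide_sparc`: under
        // the loop invariant `duo < 2 * div`, the sign bit of the wrapped
        // subtraction tells us whether `div` still fit into `duo`.
        let sub = duo.wrapping_sub(div);
        if (sub as i64) >= 0 {
            duo = sub;
            quo |= pow;
        }
        if pow == 1 {
            return (quo, duo);
        }
        div >>= 1;
        pow >>= 1;
    }
}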

src/int/specialized_div_rem/mod.rs

+28 -21
@@ -46,6 +46,7 @@ mod binary_long;
 
 #[macro_use]
 mod delegate;
+pub use self::delegate::u128_divide_sparc;
 
 #[macro_use]
 mod trifecta;
@@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
     unsafe { core::hint::unreachable_unchecked() }
 }
 
-// The `B` extension on RISC-V determines if a CLZ assembly instruction exists
-#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
-const USE_LZ: bool = cfg!(target_feature = "b");
-
-#[cfg(target_arch = "arm")]
-const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") {
-    // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This
-    // is needed to successfully differentiate between targets like `thumbv8.base` and
-    // `thumbv8.main`.
-    cfg!(target_feature = "v6t2")
-} else {
-    // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported.
-    // Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to
-    // work.
-    cfg!(target_feature = "v5te")
+const USE_LZ: bool = {
+    if cfg!(target_arch = "arm") {
+        if cfg!(target_feature = "thumb-mode") {
+            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
+            // supported. This is needed to successfully differentiate between targets like
+            // `thumbv8.base` and `thumbv8.main`.
+            cfg!(target_feature = "v6t2")
+        } else {
+            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
+            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
+            // feature does not seem to work.
+            cfg!(target_feature = "v5te")
+        }
+    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
+        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
+        cfg!(target_feature = "vis3")
+    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
+        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
+        cfg!(target_feature = "b")
+    } else {
+        // All other common targets Rust supports should have CLZ instructions
+        true
+    }
 };
 
-// All other targets Rust supports have CLZ instructions
-#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))]
-const USE_LZ: bool = true;
-
 impl_normalization_shift!(
     u32_normalization_shift,
     USE_LZ,
@@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
 // faster if the target pointer width is at least 64.
 #[cfg(all(
+    not(any(target_pointer_width = "16", target_pointer_width = "32")),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    not(any(target_pointer_width = "16", target_pointer_width = "32"))
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_trifecta!(
     u128_div_rem,
@@ -131,8 +137,9 @@ impl_trifecta!(
 // If the pointer width is less than 64, then the target architecture almost certainly does not
 // have the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
 #[cfg(all(
+    any(target_pointer_width = "16", target_pointer_width = "32"),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
-    any(target_pointer_width = "16", target_pointer_width = "32")
+    not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
 impl_delegate!(
     u128_div_rem,
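To see why `USE_LZ` matters: the `impl_normalization_shift!` macro needs to know how far to shift a divisor left so that its most significant set bit lines up with the dividend's. With a hardware CLZ instruction that is just two `leading_zeros` calls and a subtraction; without one, `leading_zeros` lowers to a slow software routine, so a branchy bisection tends to win. A rough sketch of the two strategies on `u32` (illustrative only; the actual macro expansion may differ in detail):

// With CLZ: `leading_zeros` compiles down to one or two instructions.
// Assumes `div != 0` and `duo >= div`, so the subtraction cannot underflow.
fn norm_shift_lz(duo: u32, div: u32) -> u32 {
    div.leading_zeros() - duo.leading_zeros()
}

// Without CLZ: count leading zeros by bisection instead of bit-by-bit.
fn clz_bisect(mut x: u32) -> u32 {
    if x == 0 {
        return 32;
    }
    let mut lz = 0;
    if x <= 0x0000_FFFF { lz += 16; x <<= 16; }
    if x <= 0x00FF_FFFF { lz += 8;  x <<= 8;  }
    if x <= 0x0FFF_FFFF { lz += 4;  x <<= 4;  }
    if x <= 0x3FFF_FFFF { lz += 2;  x <<= 2;  }
    if x <= 0x7FFF_FFFF { lz += 1; }
    lz
}

fn norm_shift_bisect(duo: u32, div: u32) -> u32 {
    clz_bisect(div) - clz_bisect(duo)
}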

src/int/udiv.rs

+32 -6
@@ -1,3 +1,4 @@
+pub use int::specialized_div_rem::u128_divide_sparc;
 use int::specialized_div_rem::*;
 
 intrinsics! {
@@ -46,25 +47,50 @@ intrinsics! {
         quo_rem.0
     }
 
+    // Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
+    // the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.
+
     #[win64_128bit_abi_hack]
     /// Returns `n / d`
     pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).0
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            u128_divide_sparc(n, d, &mut 0)
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n % d`
     pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
-        u128_div_rem(n, d).1
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            u128_div_rem(n, d).1
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut rem = 0;
+            u128_divide_sparc(n, d, &mut rem);
+            rem
+        }
     }
 
     #[win64_128bit_abi_hack]
     /// Returns `n / d` and sets `*rem = n % d`
     pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
-        let quo_rem = u128_div_rem(n, d);
-        if let Some(rem) = rem {
-            *rem = quo_rem.1;
+        #[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
+            let quo_rem = u128_div_rem(n, d);
+            if let Some(rem) = rem {
+                *rem = quo_rem.1;
+            }
+            quo_rem.0
+        }
+        #[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
+            let mut tmp = 0;
+            let quo = u128_divide_sparc(n, d, &mut tmp);
+            if let Some(rem) = rem {
+                *rem = tmp;
+            }
+            quo
         }
-        quo_rem.0
     }
 }
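Since `u128_divide_sparc` itself is compiled on every target (only its use in the intrinsics above is gated on SPARC), it can be checked against Rust's built-in `u128` arithmetic on any host. A hypothetical spot check, not from the commit; the `check` helper and the test values are illustrative:

fn check(duo: u128, div: u128) {
    let mut rem = 0u128;
    let quo = u128_divide_sparc(duo, div, &mut rem);
    // Compare against the compiler's own 128-bit division and remainder.
    assert_eq!(quo, duo / div);
    assert_eq!(rem, duo % div);
}

fn main() {
    check(0, 1);
    check(u128::MAX, 1);
    check(u128::MAX, u128::MAX);
    check(1 << 127, (1 << 64) - 1); // exercises the binary long division branch
    check(1 << 127, 3);             // small divisor: 32-bit digit fast path
}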
