compiler_rt: optimize mulo
- use usize to decide if register size is big enough to store multiplication result or if division is necessary - multiplication routine with check of integer bounds - wrapping multiplication and division routine from Hacker's Delight
This commit is contained in:
parent
04f379dd41
commit
01d48e55a5
|
@ -1,67 +1,68 @@
|
|||
const builtin = @import("builtin");
|
||||
const std = @import("std");
|
||||
const math = std.math;
|
||||
|
||||
// mulo - multiplication overflow
|
||||
// - muloXi4_generic for unoptimized version
|
||||
// * return a*b.
|
||||
// * set overflow.* to 1 if a*b overflows, else to 0
|
||||
// - muloXi4_genericSmall as default
|
||||
// - muloXi4_genericFast for 2*bitsize <= @bitSizeOf(usize)
|
||||
|
||||
// return a*b.
|
||||
// return if a*b overflows => 1 else => 0
|
||||
// see https://stackoverflow.com/a/26320664 for possible implementations
|
||||
|
||||
inline fn muloXi4_generic(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
|
||||
inline fn muloXi4_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
|
||||
@setRuntimeSafety(builtin.is_test);
|
||||
const BSIZE = @bitSizeOf(ST);
|
||||
comptime var UT = switch (ST) {
|
||||
i32 => u32,
|
||||
i64 => u64,
|
||||
i128 => u128,
|
||||
overflow.* = 0;
|
||||
const min = math.minInt(ST);
|
||||
var res: ST = a *% b;
|
||||
// Hacker's Delight section Overflow subsection Multiplication
|
||||
// case a=-2^{31}, b=-1 problem, because
|
||||
// on some machines a*b = -2^{31} with overflow
|
||||
// Then -2^{31}/-1 overflows and any result is possible.
|
||||
// => check with a<0 and b=-2^{31}
|
||||
if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b))
|
||||
overflow.* = 1;
|
||||
return res;
|
||||
}
|
||||
|
||||
inline fn muloXi4_genericFast(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
|
||||
@setRuntimeSafety(builtin.is_test);
|
||||
overflow.* = 0;
|
||||
const EST = switch (ST) {
|
||||
i32 => i64,
|
||||
i64 => i128,
|
||||
i128 => i256,
|
||||
else => unreachable,
|
||||
};
|
||||
const min = @bitCast(ST, @as(UT, 1 << (BSIZE - 1)));
|
||||
const max = ~min;
|
||||
overflow.* = 0;
|
||||
const result = a *% b;
|
||||
|
||||
// edge cases
|
||||
if (a == min) {
|
||||
if (b != 0 and b != 1) overflow.* = 1;
|
||||
return result;
|
||||
}
|
||||
if (b == min) {
|
||||
if (a != 0 and a != 1) overflow.* = 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
// take the sign mask sx of x (all ones if x < 0, else 0)
|
||||
const sa = a >> (BSIZE - 1);
|
||||
const sb = b >> (BSIZE - 1);
|
||||
// take absolute value of a and b via
|
||||
// abs(x) = (x ^ sx) - sx
|
||||
const abs_a = (a ^ sa) -% sa;
|
||||
const abs_b = (b ^ sb) -% sb;
|
||||
|
||||
// unitary magnitude, cannot have overflow
|
||||
if (abs_a < 2 or abs_b < 2) return result;
|
||||
|
||||
// compare the signs of operands
|
||||
if ((a ^ b) >> (BSIZE - 1) != 0) {
|
||||
if (abs_a > @divTrunc(max, abs_b)) overflow.* = 1;
|
||||
} else {
|
||||
if (abs_a > @divTrunc(min, -abs_b)) overflow.* = 1;
|
||||
}
|
||||
|
||||
return result;
|
||||
const min = math.minInt(ST);
|
||||
const max = math.maxInt(ST);
|
||||
var res: EST = @as(EST, a) * @as(EST, b);
|
||||
// invariant: -2^{bitwidth(EST)-1} < res < 2^{bitwidth(EST)-1}, so the widened product cannot overflow EST
|
||||
if (res < min or max < res)
|
||||
overflow.* = 1;
|
||||
return @truncate(ST, res);
|
||||
}
|
||||
|
||||
pub fn __mulosi4(a: i32, b: i32, overflow: *c_int) callconv(.C) i32 {
|
||||
return muloXi4_generic(i32, a, b, overflow);
|
||||
if (2 * @bitSizeOf(i32) <= @bitSizeOf(usize)) {
|
||||
return muloXi4_genericFast(i32, a, b, overflow);
|
||||
} else {
|
||||
return muloXi4_genericSmall(i32, a, b, overflow);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn __mulodi4(a: i64, b: i64, overflow: *c_int) callconv(.C) i64 {
|
||||
return muloXi4_generic(i64, a, b, overflow);
|
||||
if (2 * @bitSizeOf(i64) <= @bitSizeOf(usize)) {
|
||||
return muloXi4_genericFast(i64, a, b, overflow);
|
||||
} else {
|
||||
return muloXi4_genericSmall(i64, a, b, overflow);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
|
||||
return muloXi4_generic(i128, a, b, overflow);
|
||||
if (2 * @bitSizeOf(i128) <= @bitSizeOf(usize)) {
|
||||
return muloXi4_genericFast(i128, a, b, overflow);
|
||||
} else {
|
||||
return muloXi4_genericSmall(i128, a, b, overflow);
|
||||
}
|
||||
}
|
||||
|
||||
test {
|
||||
|
|
Loading…
Reference in New Issue
Block a user