author     S. Gilles <sgilles@math.umd.edu>    2019-05-01 15:57:06 -0400
committer  S. Gilles <sgilles@math.umd.edu>    2019-05-01 15:57:59 -0400
commit     218258e4ac2940303be597272245cb09662fd759 (patch)
tree       a82aa527a29f863e16216bfc1725efbf209fbc50
parent     fb2a06c91f60a948687939741aa884b17219f715 (diff)
download   mc-218258e4ac2940303be597272245cb09662fd759.tar.gz
Add 2sum in math util
Fast2sum is only valid when the relative magnitudes of the arguments are
known in advance. 2sum (here called "slow2sum") avoids that requirement,
at the cost of speed. It ends up being necessary in the forthcoming
log-overkill function.
-rw-r--r--   lib/math/atan-impl.myr |  2
-rw-r--r--   lib/math/sin-impl.myr  | 81
-rw-r--r--   lib/math/sum-impl.myr  | 14
-rw-r--r--   lib/math/tan-impl.myr  | 10
-rw-r--r--   lib/math/util.myr      | 74
5 files changed, 103 insertions, 78 deletions
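
(For context: the distinction the commit message draws is the standard one
from [Mul+10]. Fast2Sum's error term is exact only when |a| >= |b|; Knuth's
2Sum needs no such precondition. Below is a minimal, self-contained sketch of
the difference; demo2sum is hypothetical demo code mirroring the slow2sum
added in this commit, not part of the patch itself.)

	use std

	/* Knuth's 2sum: s = rn(a + b), t is the exact rounding error.
	   Unlike fast2sum, no assumption about |a| vs |b| is needed. */
	const demo2sum = {a : flt64, b : flt64
		var s = a + b
		var aa = s - b	/* a's contribution as represented in s */
		var bb = s - aa	/* b's contribution as represented in s */
		var t = (a - aa) + (b - bb)	/* error lost by the rounded add */
		-> (s, t)
	}

	const main = {
		var s, t

		/* |a| < |b|, violating fast2sum's precondition: fast2sum
		   would return an error term of 0, losing a entirely,
		   while 2sum recovers t = 1.0e-30 exactly. */
		(s, t) = demo2sum(1.0e-30, 1.0e30)
		std.put("s = {}, t = {}\n", s, t)
	}
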
diff --git a/lib/math/atan-impl.myr b/lib/math/atan-impl.myr
index 78f235d..38b973a 100644
--- a/lib/math/atan-impl.myr
+++ b/lib/math/atan-impl.myr
@@ -427,7 +427,7 @@ const atan264 = {y, x
 		du = 0.0
 	else
 		var t1, t2
-		(t1, t2) = two_by_two(u, x)
+		(t1, t2) = two_by_two64(u, x)
 		du = ((y - t1) - t2)/x
 	;;
diff --git a/lib/math/sin-impl.myr b/lib/math/sin-impl.myr
index ba7099b..5a321a4 100644
--- a/lib/math/sin-impl.myr
+++ b/lib/math/sin-impl.myr
@@ -508,8 +508,8 @@ const w64 = {x : flt64, want_sin : bool, want_cos : bool
 	   in the polynomial approximations.
 	 */
 	var delta1, delta2, deltat
-	(delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
-	(delta2, _) = fast2sum(deltat, x2)
+	(delta1, deltat) = slow2sum(-std.flt64frombits(xi), x1)
+	(delta2, _) = slow2sum(deltat, x2)
 
 	/*
 	   sin(delta); the degree 2 coefficient is near 0, so delta_2
@@ -528,16 +528,16 @@ const w64 = {x : flt64, want_sin : bool, want_cos : bool
 	var q : flt64[4]
 
 	if (want_sin && !swap_sin_cos) || (want_cos && swap_sin_cos)
-		(q[0], q[1]) = two_by_two(std.flt64frombits(sin_xi), cos_delta)
-		(q[2], q[3]) = two_by_two(std.flt64frombits(cos_xi), sin_delta)
-		std.sort(q[:], fltabscmp)
+		(q[0], q[1]) = two_by_two64(std.flt64frombits(sin_xi), cos_delta)
+		(q[2], q[3]) = two_by_two64(std.flt64frombits(cos_xi), sin_delta)
+		std.sort(q[:], mag_cmp64)
 
 		(sin_ret, sin_ret2) = double_compensated_sum(q[:])
 	;;
 
 	if (want_cos && !swap_sin_cos) || (want_sin && swap_sin_cos)
-		(q[0], q[1]) = two_by_two(std.flt64frombits(cos_xi), cos_delta)
-		(q[2], q[3]) = two_by_two(std.flt64frombits(sin_xi), sin_delta)
+		(q[0], q[1]) = two_by_two64(std.flt64frombits(cos_xi), cos_delta)
+		(q[2], q[3]) = two_by_two64(std.flt64frombits(sin_xi), sin_delta)
 		q[2] = -q[2]
 		q[3] = -q[3]
@@ -620,16 +620,16 @@ const trig_reduce = {x : flt64
 		total_N = rn(x1 * std.flt64frombits(two_over_pi))
 		Nf = (-total_N : flt64)
 		(q[0], q[ 4], q[5]) = (x1, x2, x3)
-		(q[1], q[ 3]) = two_by_two(Nf, pi_o_2_0)
-		(q[2], q[ 6]) = two_by_two(Nf, pi_o_2_1)
-		(q[7], q[ 8]) = two_by_two(Nf, pi_o_2_2)
-		(q[9], q[10]) = two_by_two(Nf, pi_o_2_3)
+		(q[1], q[ 3]) = two_by_two64(Nf, pi_o_2_0)
+		(q[2], q[ 6]) = two_by_two64(Nf, pi_o_2_1)
+		(q[7], q[ 8]) = two_by_two64(Nf, pi_o_2_2)
+		(q[9], q[10]) = two_by_two64(Nf, pi_o_2_3)
 
 		/*
 		   Sorting is very slow, but it's only the top five or
 		   so that are in question
 		 */
-		std.sort(q[0:5], fltabscmp)
+		std.sort(q[0:5], mag_cmp64)
 		(x1, x2) = double_compensated_sum(q[:])
 
 	while !(le_22(x1, x2, pi_o_4_0, pi_o_4_1) && le_22(-pi_o_4_0, -pi_o_4_1, x1, x2))
@@ -641,10 +641,10 @@ const trig_reduce = {x : flt64
 		   cancelled by Nf * pi_o_2_0, so line those up.
 		 */
 		(q[0], q[2]) = (x1, x2)
-		(q[1], q[3]) = two_by_two(Nf, pi_o_2_0)
-		(q[4], q[5]) = two_by_two(Nf, pi_o_2_1)
-		(q[6], q[7]) = two_by_two(Nf, pi_o_2_2)
-		(q[8], q[9]) = two_by_two(Nf, pi_o_2_3)
+		(q[1], q[3]) = two_by_two64(Nf, pi_o_2_0)
+		(q[4], q[5]) = two_by_two64(Nf, pi_o_2_1)
+		(q[6], q[7]) = two_by_two64(Nf, pi_o_2_2)
+		(q[8], q[9]) = two_by_two64(Nf, pi_o_2_3)
 		(x1, x2) = double_compensated_sum(q[0:10])
 		total_N += (N % 4)
 	;;
@@ -731,15 +731,15 @@ const huge_reduce_2pi = {x : flt64
 	   since xc is so small.
 	 */
 	var q : flt64[18]
-	(q[ 0], q[ 1]) = two_by_two(xa, a1)
-	(q[ 2], q[ 3]) = two_by_two(xa, a2)
-	(q[ 4], q[ 5]) = two_by_two(xa, a3)
-	(q[ 6], q[ 7]) = two_by_two(xb, b1)
-	(q[ 8], q[ 9]) = two_by_two(xb, b2)
-	(q[10], q[11]) = two_by_two(xb, b3)
-	(q[12], q[13]) = two_by_two(xc, c1)
-	(q[14], q[15]) = two_by_two(xc, c2)
-	(q[16], q[17]) = two_by_two(xc, c3)
+	(q[ 0], q[ 1]) = two_by_two64(xa, a1)
+	(q[ 2], q[ 3]) = two_by_two64(xa, a2)
+	(q[ 4], q[ 5]) = two_by_two64(xa, a3)
+	(q[ 6], q[ 7]) = two_by_two64(xb, b1)
+	(q[ 8], q[ 9]) = two_by_two64(xb, b2)
+	(q[10], q[11]) = two_by_two64(xb, b3)
+	(q[12], q[13]) = two_by_two64(xc, c1)
+	(q[14], q[15]) = two_by_two64(xc, c2)
+	(q[16], q[17]) = two_by_two64(xc, c3)
 
 	-> triple_compensated_sum(q[:])
 }
@@ -764,37 +764,6 @@ const trig_table_approx = {x1 : flt64, C : (uint64, uint64, uint64)[257]
 
 	-> (xi, cos_xi, sin_xi)
 }
 
-const triple_compensated_sum = {q : flt64[:]
-	/* TODO: verify, with GAPPA or something, that this is correct. */
-	std.sort(q, fltabscmp)
-	var s1 : flt64, s2 : flt64, s3
-	var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
-	s1 = q[0]
-	s2 = 0.0
-	s3 = 0.0
-	for qq : q[1:]
-		(t5, t6) = fast2sum(s3, qq)
-		(t3, t4) = fast2sum(s2, t5)
-		(t1, t2) = fast2sum(s1, t3)
-		s1 = t1
-		(s2, s3) = fast2sum(t2, t4 + t6)
-	;;
-
-	-> (s1, s2, s3)
-}
-
-const fltabscmp = {x : flt64, y : flt64
-	var xb = std.flt64bits(x) & ~(1 << 63)
-	var yb = std.flt64bits(y) & ~(1 << 63)
-	if xb == yb
-		-> `std.Equal
-	elif xb > yb
-		-> `std.Before
-	else
-		-> `std.After
-	;;
-}
-
 /*
    Return true iff (a1 + a2) <= (b1 + b2)
 */
diff --git a/lib/math/sum-impl.myr b/lib/math/sum-impl.myr
index 9826947..81f9a23 100644
--- a/lib/math/sum-impl.myr
+++ b/lib/math/sum-impl.myr
@@ -1,5 +1,7 @@
 use std
 
+use "util"
+
 /* For references, see [Mul+10] section 6.3 */
 pkg math =
 	pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
@@ -67,12 +69,6 @@ pkglocal const priest_sum32 = {l : flt32[:]
 	-> s
 }
 
-const mag_cmp32 = {f : flt32, g : flt32
-	var u = std.flt32bits(f) & ~(1 << 31)
-	var v = std.flt32bits(g) & ~(1 << 31)
-	-> std.numcmp(v, u)
-}
-
 pkglocal const priest_sum64 = {l : flt64[:]
 	var l2 = std.sldup(l)
 	std.sort(l, mag_cmp64)
@@ -82,12 +78,6 @@ pkglocal const priest_sum64 = {l : flt64[:]
 	-> s
 }
 
-const mag_cmp64 = {f : flt64, g : flt64
-	var u = std.flt64bits(f) & ~(1 << 63)
-	var v = std.flt64bits(g) & ~(1 << 63)
-	-> std.numcmp(v, u)
-}
-
 generic double_compensated_sum = {l : @f[:] :: numeric,floating @f
 	/* l should be sorted in descending order */
 	if l.len == 0
diff --git a/lib/math/tan-impl.myr b/lib/math/tan-impl.myr
index adbc0ff..e2cbadf 100644
--- a/lib/math/tan-impl.myr
+++ b/lib/math/tan-impl.myr
@@ -430,8 +430,8 @@ const tanorcot = {x : flt64, want_tan : bool
 	;;
 
 	var delta1, delta2, deltat
-	(delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
-	(delta2, _) = fast2sum(deltat, x2)
+	(delta1, deltat) = slow2sum(-std.flt64frombits(xi), x1)
+	(delta2, _) = slow2sum(deltat, x2)
 
 	var p1, p2
 	/*
@@ -459,7 +459,7 @@ const tanorcot = {x : flt64, want_tan : bool
 	var u1 = std.flt64frombits(std.flt64bits(u0) & split_mask)
 	var u2 = u0 - u1
 
-	(ret1, ret2) = fast2sum(u0 - u0*u0*g, u0*((1.0 - u1*f) - u2*f))
+	(ret1, ret2) = slow2sum(u0 - u0*u0*g, u0*((1.0 - u1*f) - u2*f))
 
 	goto have_result
 	;;
@@ -499,12 +499,12 @@ const ptan = {x1 : flt64, x2 : flt64
 	var s : flt64 = x1 * x1
 	var p : flt64 = horner_polyu(s, tan_coeff[:])
 	var r1, r2
-	(r1, r2) = two_by_two(p, x1)
+	(r1, r2) = two_by_two64(p, x1)
 
 	-> fast2sum(r1, r2 + x2)
 }
 
 const pcot = {x1 : flt64, x2 : flt64
 	var s : flt64 = x1 * x1
 	var p : flt64 = horner_polyu(s, cot_coeff[:])
-	-> fast2sum(p/x1, std.flt64frombits(0x3fd5555555555555)*x2)
+	-> slow2sum(p/x1, std.flt64frombits(0x3fd5555555555555)*x2)
 }
diff --git a/lib/math/util.myr b/lib/math/util.myr
index dadd180..8223d13 100644
--- a/lib/math/util.myr
+++ b/lib/math/util.myr
@@ -16,10 +16,19 @@ pkg math =
 	const need_round_away : (h : uint64, l : uint64, bitpos_last : int64 -> bool)
 
 	/* Multiply x * y to z1 + z2 */
-	const two_by_two : (x : flt64, y : flt64 -> (flt64, flt64))
+	const two_by_two64 : (x : flt64, y : flt64 -> (flt64, flt64))
+	const two_by_two32 : (x : flt32, y : flt32 -> (flt32, flt32))
+
+	/* Compare by magnitude */
+	const mag_cmp32 : (f : flt32, g : flt32 -> std.order)
+	const mag_cmp64 : (f : flt64, g : flt64 -> std.order)
 
 	/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
 	generic fast2sum : (x : @f, y : @f -> (@f, @f)) :: floating, numeric @f
+	generic slow2sum : (x : @f, y : @f -> (@f, @f)) :: floating, numeric @f
+
+	/* return (a, b, c), a decent sum for q */
+	const triple_compensated_sum : (q : flt64[:] -> (flt64, flt64, flt64))
 
 	/* Rounds a + b (as flt64s) to a flt32. */
 	const round_down : (a : flt64, b : flt64 -> flt32)
@@ -188,7 +197,7 @@ const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
 /*
    Perform high-prec multiplication: x * y = z1 + z2.
 */
-const two_by_two = {x : flt64, y : flt64
+const two_by_two64 = {x : flt64, y : flt64
 	var xh : flt64 = std.flt64frombits(std.flt64bits(x) & twentysix_bits_mask)
 	var xl : flt64 = x - xh
 	var yh : flt64 = std.flt64frombits(std.flt64bits(y) & twentysix_bits_mask)
@@ -213,13 +222,13 @@ const two_by_two = {x : flt64, y : flt64
 	(s, c) = fast2sum(s, a2)
 
 	/* a3 */
-	(yy, u) = fast2sum(c, a3)
+	(yy, u) = slow2sum(c, a3)
 	(t, v) = fast2sum(s, yy)
 	z = u + v
 	(s, c) = fast2sum(t, z)
 
 	/* a4 */
-	(yy, u) = fast2sum(c, a4)
+	(yy, u) = slow2sum(c, a4)
 	(t, v) = fast2sum(s, yy)
 	z = u + v
 	(s, c) = fast2sum(t, z)
@@ -227,7 +236,45 @@
 	-> (s, c)
 }
 
+/*
+   The same, for flt32s
+ */
+const two_by_two32 = {x : flt32, y : flt32
+	var xL : flt64 = (x : flt64)
+	var yL : flt64 = (y : flt64)
+	var zL : flt64 = xL * yL
+	var s : flt32 = (zL : flt32)
+	var sL : flt64 = (s : flt64)
+	var cL : flt64 = zL - sL
+	var c : flt32 = (cL : flt32)
+
+	-> (s, c)
+}
+
+const mag_cmp32 = {f : flt32, g : flt32
+	var u = std.flt32bits(f) & ~(1 << 31)
+	var v = std.flt32bits(g) & ~(1 << 31)
+	-> std.numcmp(v, u)
+}
+
+const mag_cmp64 = {f : flt64, g : flt64
+	var u = std.flt64bits(f) & ~(1 << 63)
+	var v = std.flt64bits(g) & ~(1 << 63)
+	-> std.numcmp(v, u)
+}
+
 /* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
+generic slow2sum = {a : @f, b : @f :: floating, numeric @f
+	var s = a + b
+	var aa = s - b
+	var bb = s - aa
+	var da = a - aa
+	var db = b - bb
+	var t = da + db
+	-> (s, t)
+}
+
+/* Return (s, t) such that s + t = a + b, with s = rn(a + b), when you KNOW |a| > |b|. */
 generic fast2sum = {a : @f, b : @f :: floating, numeric @f
 	var s = a + b
 	var z = s - a
@@ -235,6 +282,25 @@ generic fast2sum = {a : @f, b : @f :: floating, numeric @f
 	-> (s, t)
 }
 
+const triple_compensated_sum = {q : flt64[:]
+	/* TODO: verify, with GAPPA or something, that this is correct. */
+	std.sort(q, mag_cmp64)
+	var s1 : flt64, s2 : flt64, s3
+	var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
+	s1 = q[0]
+	s2 = 0.0
+	s3 = 0.0
+	for qq : q[1:]
+		(t5, t6) = slow2sum(s3, qq)
+		(t3, t4) = slow2sum(s2, t5)
+		(t1, t2) = slow2sum(s1, t3)
+		s1 = t1
+		(s2, s3) = slow2sum(t2, t4 + t6)
+	;;
+
+	-> (s1, s2, s3)
+}
+
 /*
    Round a + b to a flt32. Only notable if round(a) is a rounding
    tie, and b is non-zero
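
(A rough usage sketch of the helpers this commit centralizes in util.myr,
mirroring the pattern huge_reduce_2pi uses above. dot2 is hypothetical code,
and assumes it is compiled inside lib/math, where the pkglocal two_by_two64
and triple_compensated_sum are visible.)

	use std

	/* Hypothetical: compute x1*y1 + x2*y2 to roughly triple precision. */
	const dot2 = {x1 : flt64, y1 : flt64, x2 : flt64, y2 : flt64
		var q : flt64[4]

		(q[0], q[1]) = two_by_two64(x1, y1)	/* exact: q[0] + q[1] = x1*y1 */
		(q[2], q[3]) = two_by_two64(x2, y2)	/* exact: q[2] + q[3] = x2*y2 */

		/* sorts q by magnitude internally, then folds it up with slow2sum */
		-> triple_compensated_sum(q[:])
	}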