summaryrefslogtreecommitdiff
path: root/lib/math
diff options
context:
space:
mode:
authorS. Gilles <sgilles@math.umd.edu>2018-07-20 03:44:04 -0400
committerS. Gilles <sgilles@math.umd.edu>2018-07-20 03:44:04 -0400
commit4acb5822344f2dc7dc360720ef803e1d36d2e927 (patch)
treeca9f724b9f296402ba19b6954d5a488de1465641 /lib/math
parentaa7834eae6417772a8d2e6752913c6e4874db0a8 (diff)
downloadmc-4acb5822344f2dc7dc360720ef803e1d36d2e927.tar.gz
First compiling tan/cot functions.
Diffstat (limited to 'lib/math')
-rw-r--r--lib/math/ancillary/generate-minimax-by-Remez.gp24
-rw-r--r--lib/math/bld.sub3
-rw-r--r--lib/math/fpmath.myr16
-rw-r--r--lib/math/sin-impl.myr138
-rw-r--r--lib/math/tan-impl.myr473
-rw-r--r--lib/math/util.myr80
6 files changed, 636 insertions, 98 deletions
diff --git a/lib/math/ancillary/generate-minimax-by-Remez.gp b/lib/math/ancillary/generate-minimax-by-Remez.gp
index 7bb0434..f8db70a 100644
--- a/lib/math/ancillary/generate-minimax-by-Remez.gp
+++ b/lib/math/ancillary/generate-minimax-by-Remez.gp
@@ -121,15 +121,39 @@
return(if(x == 0, 1, sin(x)/x));
}
+{ tanoverx(x) =
+ return(if(x == 0, 1, tan(x)/x));
+}
+
+{ cotx(x) =
+ return(1/tanoverx(x));
+}
+
print("\n");
print("Minimaxing sin(x) / x, degree 6, on [-Pi/(4 * 256), Pi/(4 * 256)]:");
find_minimax(sinoverx, 6, -Pi/1024, Pi/1024)
print("\n");
print("(You'll need to add a 0x0 at the beginning to make a degree 7...\n");
print("\n");
+print("---\n");
print("\n");
print("Minimaxing cos(x), degree 7, on [-Pi/(4 * 256), Pi/(4 * 256)]:");
find_minimax(cos, 7, -Pi/1024, Pi/1024)
print("\n");
+print("---\n");
+print("\n");
+print("Minmimaxing tan(x) / x, degree 6, on [-Pi/(4 * 256), Pi/(4 * 256)]:");
+find_minimax(tanoverx, 6, -Pi/1024, Pi/1024)
+print("\n");
+print("(You'll need to add a 0x0 at the beginning to make a degree 7...\n");
+print("\n");
+print("---\n");
+print("\n");
+print("Minmimaxing x*cot(x), degree 8, on [-Pi/(4 * 256), Pi/(4 * 256)]:");
+find_minimax(cotx, 8, -Pi/1024, Pi/1024)
+print("\n");
+print("(Take the first v, and remember to divide by x)\n");
+print("\n");
+print("---\n");
print("\n");
print("Remember that there's that extra, ugly E term at the end of the vector that you want to lop off.\n");
diff --git a/lib/math/bld.sub b/lib/math/bld.sub
index 9937ce4..0a6af63 100644
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -38,6 +38,9 @@ lib math =
trunc-impl+posixy-x64-sse4.s
trunc-impl.myr
+ # tan, cot
+ tan-impl.myr
+
# util
util.myr
ftrap.myr
diff --git a/lib/math/fpmath.myr b/lib/math/fpmath.myr
index f1f45c4..1bc26ef 100644
--- a/lib/math/fpmath.myr
+++ b/lib/math/fpmath.myr
@@ -36,6 +36,10 @@ pkg math =
kahan_sum : (a : @f[:] -> @f)
priest_sum : (a : @f[:] -> @f)
+ /* tan-impl */
+ tan : (x : @f -> @f)
+ cot : (x : @f -> @f)
+
/* trunc-impl */
trunc : (x : @f -> @f)
ceil : (x : @f -> @f)
@@ -104,6 +108,9 @@ impl fpmath flt32 =
kahan_sum = {l; -> kahan_sum32(l) }
priest_sum = {l; -> priest_sum32(l) }
+ tan = {x; -> tan32(x)}
+ cot = {x; -> cot32(x)}
+
trunc = {x; -> trunc32(x)}
floor = {x; -> floor32(x)}
ceil = {x; -> ceil32(x)}
@@ -135,6 +142,9 @@ impl fpmath flt64 =
kahan_sum = {l; -> kahan_sum64(l) }
priest_sum = {l; -> priest_sum64(l) }
+ tan = {x; -> tan64(x)}
+ cot = {x; -> cot64(x)}
+
trunc = {x; -> trunc64(x)}
floor = {x; -> floor64(x)}
ceil = {x; -> ceil64(x)}
@@ -188,6 +198,12 @@ extern const kahan_sum64 : (l : flt64[:] -> flt64)
extern const priest_sum32 : (l : flt32[:] -> flt32)
extern const priest_sum64 : (l : flt64[:] -> flt64)
+extern const tan32 : (x : flt32 -> flt32)
+extern const tan64 : (x : flt64 -> flt64)
+
+extern const cot32 : (x : flt32 -> flt32)
+extern const cot64 : (x : flt64 -> flt64)
+
extern const trunc32 : (x : flt32 -> flt32)
extern const trunc64 : (x : flt64 -> flt64)
diff --git a/lib/math/sin-impl.myr b/lib/math/sin-impl.myr
index 1f92981..7bb44e2 100644
--- a/lib/math/sin-impl.myr
+++ b/lib/math/sin-impl.myr
@@ -40,20 +40,25 @@ use "util"
See files pi-constants.c, generate-triples-for-GB91.c, and
generate-minimax-by-Remez.gp for where the constants come from.
+
+ Note that in a departure from [GB91], we use the smaller degree
+ polynomials over a smaller range for the input. This requires
+ that we store more C-values for smaller xi values, but makes
+ the implementation a bit simpler to read.
*/
pkg math =
- pkglocal const sin32 : (x : flt32 -> flt32)
- pkglocal const sin64 : (x : flt64 -> flt64)
+ pkglocal const sin32 : (x : flt32 -> flt32)
+ pkglocal const sin64 : (x : flt64 -> flt64)
- pkglocal const cos32 : (x : flt32 -> flt32)
- pkglocal const cos64 : (x : flt64 -> flt64)
+ pkglocal const cos32 : (x : flt32 -> flt32)
+ pkglocal const cos64 : (x : flt64 -> flt64)
- pkglocal const sincos32 : (x : flt32 -> (flt32, flt32))
- pkglocal const sincos64 : (x : flt64 -> (flt64, flt64))
-;;
+ pkglocal const sincos32 : (x : flt32 -> (flt32, flt32))
+ pkglocal const sincos64 : (x : flt64 -> (flt64, flt64))
-/* Split precision down the middle */
-const twentysix_bits_mask = (0xffffffffffffffff << 27)
+ pkglocal const trig_reduce : (x : flt64 -> (int64, flt64, flt64))
+ pkglocal const trig_table_approx : (x1 : flt64, C : (uint64, uint64, uint64)[257] -> (uint64, uint64, uint64))
+;;
/* Pi/2 in lots of detail, for range reducing sin(2^18) or so */
const pi_over_2 : uint64[4] = [
@@ -458,8 +463,8 @@ const w64 = {x : flt64, want_sin : bool, want_cos : bool
;;
var N : int64
- var x1 : flt64, x2 : flt64, x3 : flt64
- (N, x1, x2) = reduce(x)
+ var x1 : flt64, x2 : flt64
+ (N, x1, x2) = trig_reduce(x)
/* Handle multiples of pi/2 */
var swap_sin_cos : bool = false
@@ -489,22 +494,8 @@ const w64 = {x : flt64, want_sin : bool, want_cos : bool
;;
/* Figure out where in the C table x lies */
- var j = (rn(x1 * std.flt64frombits(oneohtwofour_over_pi)) : uint64)
- var x1u = std.flt64bits(x1)
- var xi, sin_xi, cos_xi, sin_delta, cos_delta
-
- /* We either want j or j - 1. */
- j = std.max(std.min(j, 256), 0)
-
- (xi, cos_xi, sin_xi) = C[j]
- if j > 0
- var test_xi
- (test_xi, _, _) = C[j - 1]
- if std.abs(x1u - test_xi) < std.abs(x1u - xi)
- (xi, cos_xi, sin_xi) = C[j - 1]
- j--
- ;;
- ;;
+ var xi : uint64, sin_xi : uint64, cos_xi : uint64, sin_delta, cos_delta
+ (xi, cos_xi, sin_xi) = trig_table_approx(x1, C)
/*
Compute x - xi with compensated summation. Because xi
@@ -583,8 +574,8 @@ const w64 = {x : flt64, want_sin : bool, want_cos : bool
-> (sin_ret, sin_ret2, cos_ret, cos_ret2)
}
-/* Reduce x to N*(pi/4) + x', with x' in [-pi/4, pi/4] */
-const reduce = {x : flt64
+/* Reduce x to N*(pi/2) + x', with x' in [-pi/4, pi/4] */
+const trig_reduce = {x : flt64
var N : int64 = 0
var Nf : flt64
@@ -753,6 +744,26 @@ const huge_reduce_2pi = {x : flt64
-> triple_compensated_sum(q[:])
}
+const trig_table_approx = {x1 : flt64, C : (uint64, uint64, uint64)[257]
+ var j = (rn(x1 * std.flt64frombits(oneohtwofour_over_pi)) : uint64)
+ var x1u = std.flt64bits(x1)
+ var xi, sin_xi, cos_xi, test_cos_xi, test_sin_xi
+
+ /* We either want j or j - 1. */
+ j = std.max(std.min(j, 256), 0)
+
+ (xi, cos_xi, sin_xi) = C[j]
+ if j > 0
+ var test_xi
+ (test_xi, test_cos_xi, test_sin_xi) = C[j - 1]
+ if std.abs(x1u - test_xi) < std.abs(x1u - xi)
+ -> (test_xi, test_cos_xi, test_sin_xi)
+ ;;
+ ;;
+
+ -> (xi, cos_xi, sin_xi)
+}
+
const triple_compensated_sum = {q : flt64[:]
/* TODO: verify, with GAPPA or something, that this is correct. */
std.sort(q, fltabscmp)
@@ -785,48 +796,6 @@ const fltabscmp = {x : flt64, y : flt64
}
/*
- Perform high-prec multiplication: x * y = z1 + z2.
- */
-const two_by_two = {x : flt64, y : flt64
- var xh : flt64 = std.flt64frombits(std.flt64bits(x) & twentysix_bits_mask)
- var xl : flt64 = x - xh
- var yh : flt64 = std.flt64frombits(std.flt64bits(y) & twentysix_bits_mask)
- var yl : flt64 = y - yh
-
- /* Multiply out */
- var a1 : flt64 = xh * yh
- var a2 : flt64 = xh * yl
- var a3 : flt64 = xl * yh
- var a4 : flt64 = xl * yl
-
- /* By-hand compensated summation */
- var yy, u, t, v, z, s, c
- if a2 < a3
- std.swap(&a3, &a2)
- ;;
-
- s = a1
- c = 0.0
-
- /* a2 */
- (s, c) = fast2sum(s, a2)
-
- /* a3 */
- (yy, u) = fast2sum(c, a3)
- (t, v) = fast2sum(s, yy)
- z = u + v
- (s, c) = fast2sum(t, z)
-
- /* a4 */
- (yy, u) = fast2sum(c, a4)
- (t, v) = fast2sum(s, yy)
- z = u + v
- (s, c) = fast2sum(t, z)
-
- -> (s, c)
-}
-
-/*
Return true iff (a1 + a2) <= (b1 + b2)
*/
const le_22 = { a1, a2, b1, b2
@@ -842,30 +811,3 @@ const le_22 = { a1, a2, b1, b2
-> true
}
-
-/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
-const fast2sum = {a : flt64, b : flt64
- var s : flt64 = a + b
- var z : flt64 = s - a
- var t : flt64 = b - z
- -> (s, t)
-
-}
-
-/*
- Round a + b to a flt32. Only notable if round(a) is a rounding
- tie, and b is non-zero
- */
-const round_down = {a : flt64, b : flt64
- var au : uint64 = std.flt64bits(a)
- if au & 0x0000000070000000 == 0x0000000070000000
- if b > 0.0
- au++
- elif b < 0.0
- au--
- ;;
- -> (std.flt64frombits(au) : flt32)
- ;;
-
- -> (a : flt32)
-}
diff --git a/lib/math/tan-impl.myr b/lib/math/tan-impl.myr
new file mode 100644
index 0000000..22746dc
--- /dev/null
+++ b/lib/math/tan-impl.myr
@@ -0,0 +1,473 @@
+use std
+
+use "fpmath"
+
+/* We need trig_reduce only. */
+use "sin-impl"
+use "util"
+
+/*
+ See sin-impl.myr, this implementation is a very close copy;
+ [GB91] provides the guide for implementation. As with sin and
+ cos, our polynomials are of lower degree, and we restrict the
+ polynomial approximation to a smaller range than in [GB91].
+
+ See files pi-constants.c, generate-triples-for-GB91.c, and
+ generate-minimax-by-Remez.gp for where the constants come from.
+ */
+pkg math =
+ pkglocal const tan32 : (x : flt32 -> flt32)
+ pkglocal const tan64 : (x : flt64 -> flt64)
+
+ pkglocal const cot32 : (x : flt32 -> flt32)
+ pkglocal const cot64 : (x : flt64 -> flt64)
+;;
+
+/*
+ Coefficients for minimax polynomial approximating p(x), with
+ tan(x) = x*p(x^2), expanding out to a degree 7 polynomial for
+ tan(x).
+ */
+const tan_coeff : uint64[4] = [
+ 0x3ff0000000000000,
+ 0x3fd5555555555555,
+ 0x3fc1111111111111,
+ 0x3faba1c7ec067952,
+]
+
+/*
+ Coefficients for minimax polynomial approximating p(x), with
+ cot(x) = p(x^2)/x, expanding to a degree 7 polynomial for cot(x).
+ */
+const cot_coeff : uint64[5] = [
+ 0x3ff0000000000000,
+ 0xbfd5555555555555,
+ 0xbf96c16c16c16c17,
+ 0xbf61566abc00f016,
+ 0xbf2bbd7be567ea80,
+]
+
+/*
+ The Highly Accurate Tables for use in a [GB91]-type algorithm;
+ generated by ancillary/generate-triples-for-GB91.c.
+
+ Note the 0th entry has infinite cotangent: we have to specially
+ handle arguments close to 0 to avoid inf pollution.
+ */
+const C : (uint64, uint64, uint64)[257] = [
+ /* xi cot(xi) tan(xi) */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (0x0000000000000000, 0x7ff0000000000000, 0x0000000000000000),
+ (0x3f6921fb43c5e5fb, 0x40745f2c4ad38437, 0x3f6922006eb65bc4),
+ (0x3f7921fb69404898, 0x40645f1f9b724978, 0x3f7922101511c956),
+ (0x3f82d97c65697c99, 0x405b2963c8d7c55e, 0x3f82d99f4785288e),
+ (0x3f8921fb51371df1, 0x40545eed6acdf109, 0x3f89224e01710949),
+ (0x3f8f6a7a2c57dd2f, 0x40504bd2f5b9fc24, 0x3f8f6b1badf01256),
+ (0x3f92d97c892e0401, 0x404b28ccc8515f8d, 0x3f92da0815417fe7),
+ (0x3f95fdbbfe00c048, 0x4047474caf184945, 0x3f95fe99994b31ca),
+ (0x3f9921fb362e82a0, 0x40445e246e48a7ee, 0x3f9923460660434c),
+ (0x3f9c463aa05ba8df, 0x40421a8bbcf0d59c, 0x3f9c4811ad83840e),
+ (0x3f9f6a7a32fd798b, 0x40404ad799698985, 0x3f9f6d0068177f47),
+ (0x3fa1475cadce672e, 0x403d9ed9b607478d, 0x3fa1490ac38b33a8),
+ (0x3fa2d97c804f3e96, 0x403b267194186780, 0x3fa2dbaaeabcdd1f),
+ (0x3fa46b9c34e4ab15, 0x40390f4a6551abeb, 0x3fa46e623ffdc4da),
+ (0x3fa5fdbbfbcb8cbd, 0x4037448cde3bdf75, 0x3fa60132e6a3c57c),
+ (0x3fa78fdba272466d, 0x4035b6f0dc2e1d02, 0x3fa7941e9f8a200e),
+ (0x3fa921fb46f590b9, 0x40345afff7eb9ed9, 0x3fa927277ce43c43),
+ (0x3faab41af9b5a4a1, 0x403327f64d5ddbe8, 0x3faaba4f83b868b9),
+ (0x3fac463aa15aa204, 0x40321702ba17ae76, 0x3fac4d988fd3c37d),
+ (0x3fadd85a541eb1e6, 0x403122c3623e77ea, 0x3fade104ad5a19ae),
+ (0x3faf6a7a28fb8b3c, 0x403046e9fe8f3bcb, 0x3faf7495e9e533ba),
+ (0x3fb07e4cd90f69c4, 0x402efff4fb017e86, 0x3fb08426e73b18e9),
+ (0x3fb1475ccf08bc32, 0x402d9634f7939f69, 0x3fb14e17812554ba),
+ (0x3fb2106c9ca80bcd, 0x402c4bde69326818, 0x3fb2181d64f21f1e),
+ (0x3fb2d97c6e8f0d99, 0x402b1d03d8431e54, 0x3fb2e239bc43f5ea),
+ (0x3fb3a28c5e9cda74, 0x402a0658d1258f8a, 0x3fb3ac6d9e61f454),
+ (0x3fb46b9c1af1ca7b, 0x40290513541f6bae, 0x3fb476b9b6a5b103),
+ (0x3fb534abf74c4f86, 0x402816d25c8e2911, 0x3fb5411f577b5e4b),
+ (0x3fb5fdbbe76972c5, 0x4027398c5a6a94ca, 0x3fb60b9f7341612c),
+ (0x3fb6c6cbcafad5b1, 0x40266b7fbfc9a854, 0x3fb6d63ae89faaee),
+ (0x3fb78fdb9c7d39ab, 0x4025ab26d260cbba, 0x3fb7a0f2b1ba77b4),
+ (0x3fb858eb82b930eb, 0x4024f72dfaa05197, 0x3fb86bc7f5f32f7d),
+ (0x3fb921fb549b731e, 0x40244e6c591405be, 0x3fb936bb8cb34c63),
+ (0x3fb9eb0b2e8adcf9, 0x4023afdcd1160af4, 0x3fba01ce94149e0a),
+ (0x3fbab41b1e68f64c, 0x40231a990b29843b, 0x3fbacd021c321ede),
+ (0x3fbb7d2ad90eb4c1, 0x40228dd5408b39fd, 0x3fbb9856dbc170d0),
+ (0x3fbc463aa9e0e6e1, 0x402208dbe8546fd1, 0x3fbc63ce22521e17),
+ (0x3fbd0f4a9ed6ac6b, 0x40218b0b447a840e, 0x3fbd2f69021ade8d),
+ (0x3fbdd85a6a1e3500, 0x402113d2d3340c0f, 0x3fbdfb283104590b),
+ (0x3fbea16a43111eb3, 0x4020a2b0974c728d, 0x3fbec70cec8d28e2),
+ (0x3fbf6a7a152ae159, 0x4020372fbdef0db6, 0x3fbf93182614dae9),
+ (0x3fc019c5091530ae, 0x401fa1cd6c211be1, 0x3fc02fa58b882864),
+ (0x3fc07e4cd60bb2ac, 0x401edeecb2e792c1, 0x3fc095d31b7ab15a),
+ (0x3fc0e2d4f9759641, 0x401e250f354f8dbe, 0x3fc0fc15d1664ec5),
+ (0x3fc1475ccb271aea, 0x401d7398cf43b80c, 0x3fc1626d86e71cc7),
+ (0x3fc1abe4ae6b768f, 0x401cc9f96a972d15, 0x3fc1c8db2614cbc0),
+ (0x3fc2106c905dda31, 0x401c27ae41d1edd4, 0x3fc22f5f2139f59d),
+ (0x3fc274f4896db29e, 0x401b8c3f6c6a4a71, 0x3fc295fa1740027b),
+ (0x3fc2d97c6d32e764, 0x401af73f66c3ead4, 0x3fc2fcac61406613),
+ (0x3fc33e045f0619c2, 0x401a6849297d859a, 0x3fc36376aa36e713),
+ (0x3fc3a28c3a37c404, 0x4019df00269fbec5, 0x3fc3ca5953fa7c1a),
+ (0x3fc407144075ebd8, 0x40195b0e8afc5158, 0x3fc4315529a44c20),
+ (0x3fc46b9c2c6b4a28, 0x4018dc25cb0e86a9, 0x3fc4986a6c8dde38),
+ (0x3fc4d02440498d08, 0x401861fca0c2727d, 0x3fc4ff99e978d677),
+ (0x3fc534ac1fd87d02, 0x4017ec4fef47d5ac, 0x3fc566e3cb1ac3b8),
+ (0x3fc59933e96f8d02, 0x40177ae0ebe39dcb, 0x3fc5ce48ba6cbc8c),
+ (0x3fc5fdbbe79e8588, 0x40170d751b4b5351, 0x3fc635c98ea0a06e),
+ (0x3fc66243c3de5553, 0x4016a3d6c893e89c, 0x3fc69d6679ac9090),
+ (0x3fc6c6cbd97ff2eb, 0x40163dd33b7cd8a1, 0x3fc70520654e8fbc),
+ (0x3fc72b53b2dfa886, 0x4015db3bfb9718c5, 0x3fc76cf7644f5456),
+ (0x3fc78fdba07a2b15, 0x40157be4e94511b4, 0x3fc7d4ec5682b6ec),
+ (0x3fc7f4639158cb7b, 0x40151fa5267b1359, 0x3fc83cffb7c5bd13),
+ (0x3fc858eb5d585717, 0x4014c6568b6182c0, 0x3fc8a531ec767b5b),
+ (0x3fc8bd735284b4b1, 0x40146fd4f60aa6ea, 0x3fc90d83d41787e6),
+ (0x3fc921fb44d8c003, 0x40141bfeeeeb615b, 0x3fc975f5d04d9510),
+ (0x3fc986833b7f08e5, 0x4013cab4e2b1ea03, 0x3fc9de8878736efe),
+ (0x3fc9eb0b3ba4fca5, 0x40137bd929ce2e37, 0x3fca473c62857446),
+ (0x3fca4f9336067d8d, 0x40132f4ff1744efc, 0x3fcab0120fe01a1b),
+ (0x3fcab41b018ad3ad, 0x4012e4ff1bf8fe76, 0x3fcb1909e77f6d3f),
+ (0x3fcb18a2f15fe343, 0x40129ccdb5a1a27c, 0x3fcb8224d2de4be1),
+ (0x3fcb7d2ae4498be8, 0x401256a48b1995d8, 0x3fcbeb6342b07ca5),
+ (0x3fcbe1b2c39aaa4c, 0x4012126daf49101c, 0x3fcc54c5b349b00d),
+ (0x3fcc463acb8ca010, 0x4011d01436249f34, 0x3fccbe4cf8afd08f),
+ (0x3fccaac2bf18961b, 0x40118f84a7fdac06, 0x3fcd27f968798e0c),
+ (0x3fcd0f4a797c05fe, 0x401150ac8980c6bc, 0x3fcd91cb72380077),
+ (0x3fcd73d270bdc038, 0x40111379fd8fc155, 0x3fcdfbc42953c563),
+ (0x3fcdd85a6542243b, 0x4010d7dc85aa36a5, 0x3fce65e3e2d8019c),
+ (0x3fce3ce24380c9db, 0x40109dc463970d1f, 0x3fced02b22de7ced),
+ (0x3fcea16a4eb76e81, 0x4010652276f8722a, 0x3fcf3a9aca25e284),
+ (0x3fcf05f225fcc8fc, 0x40102de8be5f5e9b, 0x3fcfa5330c823e20),
+ (0x3fcf6a7a2e7e6f81, 0x400ff01300763015, 0x3fd007fa78443530),
+ (0x3fcfcf021570cd37, 0x400f86f024238a5e, 0x3fd03d705d41a70c),
+ (0x3fd019c4e66f72bd, 0x400f205085a9838c, 0x3fd072fb7c3be48f),
+ (0x3fd04c08e54cf113, 0x400ebc1c6d564497, 0x3fd0a89c62f5e593),
+ (0x3fd07e4ceb3a2775, 0x400e5a3df148f481, 0x3fd0de53430437af),
+ (0x3fd0b090d14d1132, 0x400dfaa0416fb320, 0x3fd1142042a947fe),
+ (0x3fd0e2d4e9b68203, 0x400d9d2ea3b4f2db, 0x3fd14a040a4c0488),
+ (0x3fd11518dadb07fc, 0x400d41d68d4614b2, 0x3fd17ffe8abf93f8),
+ (0x3fd1475cabbbd1a5, 0x400ce8859a42336f, 0x3fd1b6101cb2785a),
+ (0x3fd179a0b7f1e7a4, 0x400c9129a990cfbb, 0x3fd1ec3974987375),
+ (0x3fd1abe4b50fd9a0, 0x400c3bb2863ab5e2, 0x3fd2227a94b976bd),
+ (0x3fd1de28a830b251, 0x400be8102968bfac, 0x3fd258d3d564ded0),
+ (0x3fd2106ca9bf313e, 0x400b96331fe26d7e, 0x3fd28f45a4688f6e),
+ (0x3fd242b0b89d73ec, 0x400b460cca6bd89f, 0x3fd2c5d0548f5454),
+ (0x3fd274f47ec75990, 0x400af78fac257637, 0x3fd2fc73dccd4043),
+ (0x3fd2a738a04751ef, 0x400aaaad59bb9e25, 0x3fd3333144835986),
+ (0x3fd2d97c85bb113f, 0x400a5f59df55d5a8, 0x3fd36a083c7e2ced),
+ (0x3fd30bc06f220a02, 0x400a1588855353e2, 0x3fd3a0f96085b6dd),
+ (0x3fd33e046e3c6329, 0x4009cd2d5e7d9094, 0x3fd3d8051ac8b850),
+ (0x3fd370486fc1fac0, 0x4009863d2da63ac8, 0x3fd40f2bad85dbce),
+ (0x3fd3a28c45f65785, 0x400940ad4be7678c, 0x3fd4466d3e4e392f),
+ (0x3fd3d4d04d809c18, 0x4008fc72c1faf287, 0x3fd47dca8b7b83b2),
+ (0x3fd407143dded9c0, 0x4008b983e519bfd8, 0x3fd4b5439e5d1536),
+ (0x3fd439583676eabf, 0x400877d6e093e9da, 0x3fd4ecd8f3427dcc),
+ (0x3fd46b9c4f298d45, 0x400837624b870e31, 0x3fd5248aff0c1bb9),
+ (0x3fd49de0389dbfc9, 0x4007f81d9a994879, 0x3fd55c59c4bc190f),
+ (0x3fd4d02415978a87, 0x4007ba0005be9705, 0x3fd59445c6583fea),
+ (0x3fd502680d976ddd, 0x40077d0114c36575, 0x3fd5cc4f8c28a849),
+ (0x3fd534ac009ad6ad, 0x40074118f5d5a027, 0x3fd604774f854e2e),
+ (0x3fd566effa9677a1, 0x4007063fec410192, 0x3fd63cbd7b6d7874),
+ (0x3fd59933e0e9ba24, 0x4006cc6eafc7e66d, 0x3fd675225058af7c),
+ (0x3fd5cb77eb6b2450, 0x4006939dddeba442, 0x3fd6ada66c018e76),
+ (0x3fd5fdbbdfcd5e78, 0x40065bc6d77251ea, 0x3fd6e649eca3c379),
+ (0x3fd62ffff08a2e0c, 0x400624e2c13b5a9b, 0x3fd71f0d6b9a2d05),
+ (0x3fd66243f23b0c45, 0x4005eeeb636b3b0e, 0x3fd757f1191ef14c),
+ (0x3fd69487b10a42f2, 0x4005b9dac4d6f74e, 0x3fd790f51c268135),
+ (0x3fd6c6cbae457fb1, 0x400585aa64d56be0, 0x3fd7ca1a6a0740e5),
+ (0x3fd6f90fbd5ea7f3, 0x40055254afeb627f, 0x3fd8036133dcb0d0),
+ (0x3fd72b539c1458e5, 0x40051fd4572bd740, 0x3fd83cc99243b1cc),
+ (0x3fd75d97a0772b20, 0x4004ee23a1eecaf5, 0x3fd876544c81706d),
+ (0x3fd78fdba1490a6d, 0x4004bd3d85c95693, 0x3fd8b001995400f4),
+ (0x3fd7c21f93c9a6f0, 0x40048d1d04c633fd, 0x3fd8e9d1d2f476b5),
+ (0x3fd7f463804f4cfb, 0x40045dbd38ab6945, 0x3fd923c56a5915d6),
+ (0x3fd826a7892f6d44, 0x40042f194c51167e, 0x3fd95ddcef7a54f2),
+ (0x3fd858eb80c2f052, 0x4004012cdbe1f350, 0x3fd9981896c629e4),
+ (0x3fd88b2f737b7ea4, 0x4003d3f372664ef1, 0x3fd9d278d890d001),
+ (0x3fd8bd735b872d13, 0x4003a768cf7d7edb, 0x3fda0cfe18e865e8),
+ (0x3fd8efb749b5b0fa, 0x40037b88c14a0190, 0x3fda47a8d71f4a59),
+ (0x3fd921fb4f73c582, 0x4003504f375b8a3b, 0x3fda827994591520),
+ (0x3fd9543f498e072b, 0x400325b86e2a7edc, 0x3fdabd70950976a6),
+ (0x3fd986833cb12fe2, 0x4002fbc09df08da7, 0x3fdaf88e4d1af906),
+ (0x3fd9b8c735da9e8f, 0x4002d26415c0dcf8, 0x3fdb33d33b48b23c),
+ (0x3fd9eb0b2b14f6b2, 0x4002a99f541052bd, 0x3fdb6f3fc4412f63),
+ (0x3fda1d4f1f259ed7, 0x4002816ee7faf9c9, 0x3fdbaad45ca7ae2f),
+ (0x3fda4f9317d51314, 0x400259cf78974618, 0x3fdbe6917dba0bac),
+ (0x3fda81d71d9a00d3, 0x400232bdc46748af, 0x3fdc2277a4fbcf04),
+ (0x3fdab41b1864b159, 0x40020c36bb584296, 0x3fdc5e872a26269b),
+ (0x3fdae65f0eadda20, 0x4001e6374cd17a1a, 0x3fdc9ac08a4bd320),
+ (0x3fdb18a2f6bdb980, 0x4001c0bc8b1148ee, 0x3fdcd724302664c9),
+ (0x3fdb4ae6f429a87d, 0x40019bc37c9e36a0, 0x3fdd13b2be0455c5),
+ (0x3fdb7d2adbb2f0d4, 0x400177497734de95, 0x3fdd506c786465b9),
+ (0x3fdbaf6ee37914fd, 0x4001534b9ddfc460, 0x3fdd8d521a67ad8d),
+ (0x3fdbe1b2dd912d13, 0x40012fc76f757964, 0x3fddca63e75202cb),
+ (0x3fdc13f6cc9e4889, 0x40010cba5a491b0c, 0x3fde07a25e238018),
+ (0x3fdc463ad9174142, 0x4000ea21c51caa0c, 0x3fde450e2d5365b9),
+ (0x3fdc787e9c84e6c4, 0x4000c7fb8a2ba964, 0x3fde82a755ab4856),
+ (0x3fdcaac2926527ca, 0x4000a644fa83f84e, 0x3fdec06eedc91cd2),
+ (0x3fdcdd06a43ce130, 0x400084fbdbdf1711, 0x3fdefe655b10819e),
+ (0x3fdd0f4a88e8826e, 0x4000641e23e987e1, 0x3fdf3c8ac50fd12d),
+ (0x3fdd418e8374c347, 0x400043a97b437c0b, 0x3fdf7ae00195ead5),
+ (0x3fdd73d27c65ab26, 0x4000239bd4e2229d, 0x3fdfb96577e9d701),
+ (0x3fdda6167d37836d, 0x400003f31ca39f94, 0x3fdff81bb96f0bef),
+ (0x3fddd85a68c19b42, 0x3fffc95ac8d38084, 0x3fe01b81944403fa),
+ (0x3fde0a9e587b4fd8, 0x3fff8b91526e80e7, 0x3fe03b0e368fb566),
+ (0x3fde3ce26298b772, 0x3fff4e85ef7772cc, 0x3fe05ab4165d6dbe),
+ (0x3fde6f26438b72e7, 0x3fff12353eaec03d, 0x3fe07a734e81ae40),
+ (0x3fdea16a54d3b255, 0x3ffed69b3a1ab4c7, 0x3fe09a4c5da07b69),
+ (0x3fded3ae2f825dc7, 0x3ffe9bb4d86851dd, 0x3fe0ba3f4945c2d6),
+ (0x3fdf05f21cdb9169, 0x3ffe617e5679952d, 0x3fe0da4c8763cb70),
+ (0x3fdf38363eaf5154, 0x3ffe27f4381597d6, 0x3fe0fa7475f5d590),
+ (0x3fdf6a7a098dc074, 0x3ffdef13dafcc870, 0x3fe11ab7049a4fea),
+ (0x3fdf9cbe1c32adf4, 0x3ffdb6d95ebfe258, 0x3fe13b14e2ceeb4a),
+ (0x3fdfcf01fc1a98f2, 0x3ffd7f4234582037, 0x3fe15b8e0c2a1e89),
+ (0x3fe000a30caf6ef5, 0x3ffd484adae8622e, 0x3fe17c23144ba5c9),
+ (0x3fe019c503e450d1, 0x3ffd11f0d6c93084, 0x3fe19cd4010aead7),
+ (0x3fe032e705d8ff9a, 0x3ffcdc30fd97dc6d, 0x3fe1bda14b7f7db5),
+ (0x3fe04c08e5135e13, 0x3ffca708e170a297, 0x3fe1de8b05a932bb),
+ (0x3fe0652af2ed30d5, 0x3ffc7275198adb9d, 0x3fe1ff91e87789c0),
+ (0x3fe07e4ce6bf0049, 0x3ffc3e7392215303, 0x3fe220b5e3b802d7),
+ (0x3fe0976f08fafefc, 0x3ffc0b0119a5b475, 0x3fe241f7a70e93a4),
+ (0x3fe0b090df3a9868, 0x3ffbd81c1d0ca878, 0x3fe26356e120198a),
+ (0x3fe0c9b2ead68147, 0x3ffba5c118435eb7, 0x3fe284d48f30abf1),
+ (0x3fe0e2d4dd7a7e38, 0x3ffb73ee3a8bbffd, 0x3fe2a6709b9c98c5),
+ (0x3fe0fbf6ec9a1e2c, 0x3ffb42a0b9cabf46, 0x3fe2c82ba151a704),
+ (0x3fe11518d95be884, 0x3ffb11d6bed203fe, 0x3fe2ea05a03d2b1c),
+ (0x3fe12e3acb91e4cc, 0x3ffae18db6b570dc, 0x3fe30bff230686f5),
+ (0x3fe1475cd97c8037, 0x3ffab1c3401e1903, 0x3fe32e189e288a82),
+ (0x3fe1607ecb17c3cc, 0x3ffa82759ac22a53, 0x3fe350521cba4339),
+ (0x3fe179a0be38d76d, 0x3ffa53a26f5af984, 0x3fe372ac1f34797e),
+ (0x3fe192c2b4ff17e0, 0x3ffa2547a7d6b75e, 0x3fe39527018c28f4),
+ (0x3fe1abe4a9d366b4, 0x3ff9f763480dbddc, 0x3fe3b7c316198532),
+ (0x3fe1c506b8926f4d, 0x3ff9c9f322aecbea, 0x3fe3da80de6b2be4),
+ (0x3fe1de289627ee62, 0x3ff99cf5cedc0b11, 0x3fe3fd604e8e3dcd),
+ (0x3fe1f74aa0f1e663, 0x3ff97068be5efcbc, 0x3fe4206246783e1e),
+ (0x3fe2106cc47cbcd5, 0x3ff9444a3c9eec82, 0x3fe4438708557940),
+ (0x3fe2298e9b74941b, 0x3ff9189929b725cd, 0x3fe466ce653433d7),
+ (0x3fe242b0875b781e, 0x3ff8ed53144242d5, 0x3fe48a3945bd5bdc),
+ (0x3fe25bd2a2ef236a, 0x3ff8c276131e7537, 0x3fe4adc83188d315),
+ (0x3fe274f49cd2ff63, 0x3ff89800fe076bc9, 0x3fe4d17b180d8e25),
+ (0x3fe28e16910c215f, 0x3ff86df1fa8545fa, 0x3fe4f552845a769c),
+ (0x3fe2a7387dfc7bb2, 0x3ff8444769c189a7, 0x3fe5194ed8c26526),
+ (0x3fe2c05a7c12d2de, 0x3ff81aff8aec0241, 0x3fe53d709e4082fa),
+ (0x3fe2d97c7c743349, 0x3ff7f218e6cd24d9, 0x3fe561b826bf35d4),
+ (0x3fe2f29e9716e420, 0x3ff7c991cf39c7a9, 0x3fe58625fd66563d),
+ (0x3fe30bc06e7f4a84, 0x3ff7a1695a457c1d, 0x3fe5aaba03e29b8c),
+ (0x3fe324e27374f758, 0x3ff7799d5b5545db, 0x3fe5cf7548f044c5),
+ (0x3fe33e047e936633, 0x3ff7522ca19d9740, 0x3fe5f457ff83db15),
+ (0x3fe35726746c3aee, 0x3ff72b15ef009b93, 0x3fe619626c900682),
+ (0x3fe3704880dc13de, 0x3ff704579db64f16, 0x3fe63e953f64e98f),
+ (0x3fe3896a79a01dfd, 0x3ff6ddf094709c2e, 0x3fe663f0a979932c),
+ (0x3fe3a28c5a0ef0b4, 0x3ff6b7df85cf5700, 0x3fe6897514cb6700),
+ (0x3fe3bbae72257473, 0x3ff69222ac668190, 0x3fe6af236bc6b21d),
+ (0x3fe3d4d05198d166, 0x3ff66cb96a79e48f, 0x3fe6d4fb7a2ab498),
+ (0x3fe3edf2390d7a1b, 0x3ff647a21ef19a42, 0x3fe6fafe165c04e5),
+ (0x3fe4071446b9849f, 0x3ff622db64677758, 0x3fe7212be57c166b),
+ (0x3fe4203660cc90a3, 0x3ff5fe642df6afa6, 0x3fe74785394a80fe),
+ (0x3fe4395830f053c5, 0x3ff5da3bca164098, 0x3fe76e0a075c1f61),
+ (0x3fe4527a51ef6abd, 0x3ff5b6603251fdeb, 0x3fe794bbb8782811),
+ (0x3fe46b9c35efc871, 0x3ff592d10f30a380, 0x3fe7bb99ef70fcb0),
+ (0x3fe484be1da35a2f, 0x3ff56f8ce774b6cf, 0x3fe7e2a58e4d7f2c),
+ (0x3fe49de0204a38f7, 0x3ff54c9283b95dee, 0x3fe809df39ac0a62),
+ (0x3fe4b70222176b17, 0x3ff529e0f9169404, 0x3fe83147483ce15a),
+ (0x3fe4d02423eab88d, 0x3ff50777396a8b69, 0x3fe858de3ecfe279),
+ (0x3fe4e9460a6bd40d, 0x3ff4e55461ce669f, 0x3fe880a47725d222),
+ (0x3fe50268221eb1c5, 0x3ff4c37707b189e0, 0x3fe8a89af16b273d),
+ (0x3fe51b8a1a015c9e, 0x3ff4a1de9a4d2715, 0x3fe8d0c1b5967d2c),
+ (0x3fe534abef35b431, 0x3ff4808a22df6aeb, 0x3fe8f91948a87d95),
+ (0x3fe54dce0e654a86, 0x3ff45f781cf2f1af, 0x3fe921a2e5a509da),
+ (0x3fe566effca9d1d5, 0x3ff43ea83a204e02, 0x3fe94a5e54941ccd),
+ (0x3fe58011f1e2647b, 0x3ff41e1944ab750f, 0x3fe9734c7f58c26c),
+ (0x3fe59933f5e86301, 0x3ff3fdca4ada27bb, 0x3fe99c6e04b5c577),
+ (0x3fe5b2560aab7515, 0x3ff3ddba66ec8460, 0x3fe9c5c37bdd6320),
+ (0x3fe5cb77e480bb2a, 0x3ff3bde919244f1b, 0x3fe9ef4cfd75f328),
+ (0x3fe5e499d2fe12b9, 0x3ff39e552085ea71, 0x3fea190ba428e394),
+ (0x3fe5fdbbd711540c, 0x3ff37efda4b7da0b, 0x3fea43000ba79067),
+ (0x3fe616dde7b3195c, 0x3ff35fe1dd7a4fd0, 0x3fea6d2ac0ff2200),
+ (0x3fe62fffe292e013, 0x3ff3410124e06ee8, 0x3fea978c284b69e9),
+ (0x3fe64921d3354ae1, 0x3ff3225a9fb1c71e, 0x3feac224f47e4717),
+ (0x3fe66243d9450683, 0x3ff303ed5e631dff, 0x3feaecf5fd84dff9),
+ (0x3fe67b65c258c493, 0x3ff2e5b8d84a4bf9, 0x3feb17ff91b6c931),
+ (0x3fe69487ced483eb, 0x3ff2c7bbfde79a7d, 0x3feb4342c6025af0),
+ (0x3fe6ada9d635f2e4, 0x3ff2a9f641170b2e, 0x3feb6ebffdf58dac),
+ (0x3fe6c6cbb61f0b15, 0x3ff28c670e616749, 0x3feb9a77a8b3dd2c),
+ (0x3fe6dfedafa2adc5, 0x3ff26f0d60a506c4, 0x3febc66ae4d5e2a0),
+ (0x3fe6f90fbe78ac86, 0x3ff251e88829ec96, 0x3febf29a5b89d1cc),
+ (0x3fe71231ac5b981e, 0x3ff234f81179eb17, 0x3fec1f065fb7e3ae),
+ (0x3fe72b539a5b2817, 0x3ff2183b26da1bf7, 0x3fec4bafe056a3e8),
+ (0x3fe74475a88b9513, 0x3ff1fbb0f7a7f5d2, 0x3fec7897ce6c8a57),
+ (0x3fe75d979d84d1b4, 0x3ff1df591bae6879, 0x3feca5be7daa52f0),
+ (0x3fe776b996729f58, 0x3ff1c332cb1b268b, 0x3fecd324def80a12),
+ (0x3fe78fdbb6c61400, 0x3ff1a73d3acc02d5, 0x3fed00cbf2860934),
+ (0x3fe7a8fd8e5473ec, 0x3ff18b7845e8ff44, 0x3fed2eb3ae59bf1f),
+ (0x3fe7c21fa821e9ae, 0x3ff16fe2b42697c2, 0x3fed5cddd5595691),
+ (0x3fe7db419cad411c, 0x3ff1547c5b66dd8f, 0x3fed8b4a71c71e74),
+ (0x3fe7f4638e3011f5, 0x3ff139447c65650b, 0x3fedb9fa8d11c553),
+ (0x3fe80d858178d451, 0x3ff11e3a7adfe5dc, 0x3fede8eefdefdd61),
+ (0x3fe826a79a4bde87, 0x3ff1035d9c2731dc, 0x3fee1828d8aa0d05),
+ (0x3fe83fc97febfc73, 0x3ff0e8adacafb5e2, 0x3fee47a84a31b400),
+ (0x3fe858eb90a17ea9, 0x3ff0ce29b858ff89, 0x3fee776edba13ed6),
+ (0x3fe8720d5f79bf77, 0x3ff0b3d1a433bc56, 0x3feea77c97bb286a),
+ (0x3fe88b2f85704599, 0x3ff099a444afc50a, 0x3feed7d380dfa51f),
+ (0x3fe8a4516bc9e19d, 0x3ff07fa1adcd5635, 0x3fef087356602ce4),
+ (0x3fe8bd7375248413, 0x3ff065c8f21e21d3, 0x3fef395dbac2fb6d),
+ (0x3fe8d69552cff0a1, 0x3ff04c19dd7ddf24, 0x3fef6a92fc644460),
+ (0x3fe8efb76cb96eab, 0x3ff0329382f2fcfc, 0x3fef9c14d23b84cb),
+ (0x3fe908d951162887, 0x3ff01935d537533b, 0x3fefcde34b070181),
+ (0x3fe921fb54442d18, 0x3ff0000000000000, 0x3ff0000000000000),
+]
+
+const tan32 = {x : flt32
+ var r, s
+ (r, s) = tanorcot((x : flt64), true)
+ -> round_down(r, s)
+}
+
+const cot32 = {x : flt32
+ var r, s
+ (r, s) = tanorcot((x : flt64), false)
+ -> round_down(r, s)
+}
+
+const tan64 = {x : flt64
+ var r
+ (r, _) = tanorcot(x, true)
+ -> r
+}
+
+const cot64 = {x : flt64
+ var r
+ (r, _) = tanorcot(x, false)
+ -> r
+}
+
+const tanorcot = {x : flt64, want_tan : bool
+ var N : int64
+ var x1 : flt64, x2 : flt64
+ (N, x1, x2) = trig_reduce(x)
+
+ var then_negate : bool = false
+
+ if (N % 2 != 0)
+ /* tan(x + Pi/2) = -cot(x) */
+ want_tan = !want_tan
+ then_negate = true
+ ;;
+
+ if (x1 < 0.0)
+ then_negate = !then_negate
+ x1 = -1.0 * x1
+ x2 = -1.0 * x2
+ ;;
+
+ /* from sin-impl.myr */
+ var xi, tan_xi, cot_xi
+ (xi, cot_xi, tan_xi) = trig_table_approx(x1, C)
+ var cot = std.flt64frombits(cot_xi)
+ var tan = std.flt64frombits(tan_xi)
+
+ var ret1 : flt64 = 0.0, ret2 : flt64 = 0.0
+
+ if xi == 0x0
+ /*
+ Special case to avoid infinity in cotan
+ */
+ if want_tan
+ (ret1, ret2) = ptan(x1, x2)
+ else
+ (ret1, ret2) = pcot(x1, x2)
+ ;;
+
+ goto have_result
+ ;;
+
+ var delta1, delta2, deltat
+ (delta1, deltat) = fast2sum(-std.flt64frombits(xi), x1)
+ (delta2, _) = fast2sum(deltat, x2)
+ var p1, p2
+
+ if want_tan
+ (p1, p2) = ptan(delta1, delta2)
+ var num = cot + tan
+ var den = (cot - p1) - p2
+ var f = num/den
+ var q1 = tan
+ var q2 = p1 * f
+ var q3 = p2 * f
+ (q1, q2) = fast2sum(q1, q2)
+ (ret1, ret2) = fast2sum(q1, q2 + q3)
+ else
+ (p1, p2) = pcot(delta1, delta2)
+ var num = cot + tan
+ var den = (tan + p1) + p2
+ var f = num/den
+ var q1 = cot
+ var q2 = -1.0 * p1 * f
+ var q3 = -1.0 * p2 * f
+ (q1, q2) = fast2sum(q1, q2)
+ (ret1, ret2) = fast2sum(q1, q2 + q3)
+ ;;
+
+:have_result
+
+ if then_negate
+ ret1 = -1.0 * ret1
+ ret2 = -1.0 * ret2
+ ;;
+
+ -> (ret1, ret2)
+}
+
+const ptan = {x1 : flt64, x2 : flt64
+ var s : flt64 = x1 * x1
+ var p : flt64 = horner_polyu(s, tan_coeff[:])
+ var r1, r2
+ (r1, r2) = two_by_two(s, x1)
+ -> fast2sum(r1, r2 + x2)
+}
+
+const pcot = {x1 : flt64, x2 : flt64
+ var s : flt64 = x1 * x1
+ var p : flt64 = horner_polyu(s, cot_coeff[:])
+ -> fast2sum(p/x1, std.flt64frombits(0x3fd5555555555555)*x2)
+}
diff --git a/lib/math/util.myr b/lib/math/util.myr
index d2e82c8..dadd180 100644
--- a/lib/math/util.myr
+++ b/lib/math/util.myr
@@ -14,8 +14,20 @@ pkg math =
/* Whether RN() requires incrementing after truncating */
const need_round_away : (h : uint64, l : uint64, bitpos_last : int64 -> bool)
+
+ /* Multiply x * y to z1 + z2 */
+ const two_by_two : (x : flt64, y : flt64 -> (flt64, flt64))
+
+ /* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
+ generic fast2sum : (x : @f, y : @f -> (@f, @f)) :: floating, numeric @f
+
+ /* Rounds a + b (as flt64s) to a flt32. */
+ const round_down : (a : flt64, b : flt64 -> flt32)
;;
+/* Split precision down the middle */
+const twentysix_bits_mask = (0xffffffffffffffff << 27)
+
const flt64fromflt32 = {f : flt32
var n, e, s
(n, e, s) = std.flt32explode(f)
@@ -172,3 +184,71 @@ const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
-> hl_is_odd
}
+
+/*
+ Perform high-prec multiplication: x * y = z1 + z2.
+ */
+const two_by_two = {x : flt64, y : flt64
+ var xh : flt64 = std.flt64frombits(std.flt64bits(x) & twentysix_bits_mask)
+ var xl : flt64 = x - xh
+ var yh : flt64 = std.flt64frombits(std.flt64bits(y) & twentysix_bits_mask)
+ var yl : flt64 = y - yh
+
+ /* Multiply out */
+ var a1 : flt64 = xh * yh
+ var a2 : flt64 = xh * yl
+ var a3 : flt64 = xl * yh
+ var a4 : flt64 = xl * yl
+
+ /* By-hand compensated summation */
+ var yy, u, t, v, z, s, c
+ if a2 < a3
+ std.swap(&a3, &a2)
+ ;;
+
+ s = a1
+ c = 0.0
+
+ /* a2 */
+ (s, c) = fast2sum(s, a2)
+
+ /* a3 */
+ (yy, u) = fast2sum(c, a3)
+ (t, v) = fast2sum(s, yy)
+ z = u + v
+ (s, c) = fast2sum(t, z)
+
+ /* a4 */
+ (yy, u) = fast2sum(c, a4)
+ (t, v) = fast2sum(s, yy)
+ z = u + v
+ (s, c) = fast2sum(t, z)
+
+ -> (s, c)
+}
+
+/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
+generic fast2sum = {a : @f, b : @f :: floating, numeric @f
+ var s = a + b
+ var z = s - a
+ var t = b - z
+ -> (s, t)
+}
+
+/*
+ Round a + b to a flt32. Only notable if round(a) is a rounding
+ tie, and b is non-zero
+ */
+const round_down = {a : flt64, b : flt64
+ var au : uint64 = std.flt64bits(a)
+ if au & 0x0000000070000000 == 0x0000000070000000
+ if b > 0.0
+ au++
+ elif b < 0.0
+ au--
+ ;;
+ -> (std.flt64frombits(au) : flt32)
+ ;;
+
+ -> (a : flt32)
+}