summaryrefslogtreecommitdiff
path: root/lib/math
diff options
context:
space:
mode:
authorS. Gilles <sgilles@math.umd.edu>2018-06-27 09:47:45 -0400
committerS. Gilles <sgilles@math.umd.edu>2018-06-27 20:59:22 -0400
commit9843290d44b6266758c478838d57a9478af39aa3 (patch)
tree5b4b37ef0390adaf00fa6a70ed663a799e129264 /lib/math
parent56741fafc347f664a0ef0aebae51054b544891ed (diff)
downloadmc-9843290d44b6266758c478838d57a9478af39aa3.tar.gz
Initial (only compile-tested) implementation of sin, cos.
Diffstat (limited to 'lib/math')
-rw-r--r--lib/math/bld.sub3
-rw-r--r--lib/math/fpmath.myr22
-rw-r--r--lib/math/sin-impl.myr805
3 files changed, 830 insertions, 0 deletions
diff --git a/lib/math/bld.sub b/lib/math/bld.sub
index abbbad4..fc67ed6 100644
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -24,6 +24,9 @@ lib math =
# scalb (multiply x by 2^m)
scale2-impl.myr
+ # sin, cos
+ sin-impl.myr
+
# sqrt
sqrt-impl+posixy-x64-sse2.s
sqrt-impl.myr
diff --git a/lib/math/fpmath.myr b/lib/math/fpmath.myr
index 28075ba..f1f45c4 100644
--- a/lib/math/fpmath.myr
+++ b/lib/math/fpmath.myr
@@ -24,6 +24,11 @@ pkg math =
/* scale2-impl */
scale2 : (x : @f, m : @i -> @f)
+ /* sin-impl */
+ sin : (x : @f -> @f)
+ cos : (x : @f -> @f)
+ sincos : (x : @f -> (@f, @f))
+
/* sqrt-impl */
sqrt : (x : @f -> @f)
@@ -90,6 +95,10 @@ impl fpmath flt32 =
scale2 = {x, m; -> scale232(x, m)}
+ sin = {x; -> sin32(x)}
+ cos = {x; -> cos32(x)}
+ sincos = {x; -> sincos32(x)}
+
sqrt = {x; -> sqrt32(x)}
kahan_sum = {l; -> kahan_sum32(l) }
@@ -117,6 +126,10 @@ impl fpmath flt64 =
scale2 = {x, m; -> scale264(x, m)}
+ sin = {x; -> sin64(x)}
+ cos = {x; -> cos64(x)}
+ sincos = {x; -> sincos64(x)}
+
sqrt = {x; -> sqrt64(x)}
kahan_sum = {l; -> kahan_sum64(l) }
@@ -157,6 +170,15 @@ extern const powr64 : (x : flt64, y : flt64 -> flt64)
extern const scale232 : (x : flt32, m : int32 -> flt32)
extern const scale264 : (x : flt64, m : int64 -> flt64)
+extern const sin32 : (x : flt32 -> flt32)
+extern const sin64 : (x : flt64 -> flt64)
+
+extern const cos32 : (x : flt32 -> flt32)
+extern const cos64 : (x : flt64 -> flt64)
+
+extern const sincos32 : (x : flt32 -> (flt32, flt32))
+extern const sincos64 : (x : flt64 -> (flt64, flt64))
+
extern const sqrt32 : (x : flt32 -> flt32)
extern const sqrt64 : (x : flt64 -> flt64)
diff --git a/lib/math/sin-impl.myr b/lib/math/sin-impl.myr
new file mode 100644
index 0000000..4ede807
--- /dev/null
+++ b/lib/math/sin-impl.myr
@@ -0,0 +1,805 @@
+use std
+
+use "fpmath"
+use "fma-impl"
+use "scale2-impl"
+use "util"
+
+/*
+ This implmentation of sin and cos uses the "Highly Accurate
+ Tables" method of [GB91]. The idea is to tabulate [0, pi/4] at
+ 256 slightly irregular intervals xi, with the lucky property
+ that the infinite binary expansions of significands of sin(xi)
+ and cos(xi) look like
+
+ / 53 bits \ / many '0's \ / noise \
+ 1.bbbbbb...bb00000000000...0???????????...
+
+ This allows us to use storage only for a single floating point
+ number, but get more than 53 bits of precision.
+
+ Using that, we express x as (N * pi/4) + (xi) + (h), where h is
+ tiny. Using identities
+
+ sin(u+v) = sin(u)cos(v) + cos(u)sin(v),
+ cos(u+v) = cos(u)cos(v) - sin(u)sin(v),
+
+ we arrive at a sum where every term is known with greater than
+ 53 bits of precision except for sin(h) and cos(h), which we can
+ approximate well.
+
+ As a note, everything is performed in double-precision. Storing
+ a second set of tables for single precision constants would
+ occupy another 3KiB of memory for a not-very-significant speed
+ gain.
+
+ See files pi-constants.c, generate-triples-for-GB91.c, and
+ generate-minimax-by-Remez.gp for where the constants come from.
+ */
+pkg math =
+ pkglocal const sin32 : (x : flt32 -> flt32)
+ pkglocal const sin64 : (x : flt64 -> flt64)
+
+ pkglocal const cos32 : (x : flt32 -> flt32)
+ pkglocal const cos64 : (x : flt64 -> flt64)
+
+ pkglocal const sincos32 : (x : flt32 -> (flt32, flt32))
+ pkglocal const sincos64 : (x : flt64 -> (flt64, flt64))
+;;
+
+/* Split precision down the middle */
+const twentysix_bits_mask = (0xffffffffffffffff << 27)
+
+/* Pi/4 in lots of detail, for range reducing sin(2^18) or so */
+const pi_over_4 : uint64[4] = [
+ 0x3fe921fb54442d18,
+ 0x3c61a62633145c07,
+ 0xb8cf1976b7ed8fbc,
+ 0x3544cf98e804177d,
+]
+
+/* Pre-computed inverses */
+const four_over_pi : uint64 = 0x3ff45f306dc9c883 /* 1/(pi/4) */
+const oneohtwofour_over_pi : uint64 = 0x40745f306dc9c883 /* 1/(pi/(4 * 256)) */
+
+/*
+ Coefficients for minimax, degree 7 polynomials approximating sin
+ and cos on [-Pi/1024, Pi/1024] (generated by a Remez algorithm).
+ */
+const sin_coeff : uint64[8] = [
+ 0xb791800000000000,
+ 0x3ff0000000000000,
+ 0x38b6000000000000,
+ 0xbfc5555555555555,
+ 0xb9cc000000000000,
+ 0x3f81111111111024,
+ 0x3ab0000000000000,
+ 0xbf2a019f99ab90ae,
+]
+
+const cos_coeff : uint64[8] = [
+ 0x3ff0000000000000,
+ 0x38c0800000000000,
+ 0xbfe0000000000000,
+ 0x39a0000000000000,
+ 0x3fa55555555553ee,
+ 000000000000000000,
+ 0xbf56c16b9bfd9fd6,
+ 0xbbec000000000000,
+]
+
+/*
+ The Highly Accurate Tables for use in a [GB91]-type algorithm;
+ generated by ancillary/generate-triples-for-GB91.c.
+ */
+const C : (uint64, uint64, uint64)[257] = [
+ /* xi cos(xi) sin(xi) */
+ (0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000),
+ (0x3f6921fb42e71072, 0x3feffff621622aa5, 0x3f6921f8ad6f8d6f),
+ (0x3f7921fb576a8e70, 0x3fefffd8858e80ad, 0x3f7921f1018d5de6),
+ (0x3f82d97c6d961293, 0x3fefffa72c98324f, 0x3f82d96afcb3b8a8),
+ (0x3f8921fba95a4ba5, 0x3fefff62169765a8, 0x3f8921d251f34230),
+ (0x3f8f6a79afcf7cc4, 0x3fefff0943ccb09c, 0x3f8f6a28f137852f),
+ (0x3f92d97d6168427b, 0x3feffe9cb42a0249, 0x3f92d9379e0e6011),
+ (0x3f95fdbbdcaafdf1, 0x3feffe1c68730a0e, 0x3f95fd4d14eace13),
+ (0x3f9921fb47134298, 0x3feffd886087640a, 0x3f992155ea73805c),
+ (0x3f9c463ab6204953, 0x3feffce09ce490e2, 0x3f9c454f4439aa60),
+ (0x3f9f6a7a230622cf, 0x3feffc251df3604f, 0x3f9f69372b837c03),
+ (0x3fa1475c20c40678, 0x3feffb55e4815141, 0x3fa1468532309197),
+ (0x3fa2d97c761e7dc7, 0x3feffa72f0045061, 0x3fa2d8656c814503),
+ (0x3fa46b9c4167ee58, 0x3feff97c42007eca, 0x3fa46a397ce648ce),
+ (0x3fa5fdbbb3fec154, 0x3feff871db006d07, 0x3fa5fc009ce09729),
+ (0x3fa78fdba698a573, 0x3feff753bb15f9f8, 0x3fa78dbaad1df29e),
+ (0x3fa921fb569e20a0, 0x3feff621e37794e9, 0x3fa91f65f36711fd),
+ (0x3faab41b20e054f3, 0x3feff4dc549e4649, 0x3faab101d4af47e4),
+ (0x3fac463aa9c96aef, 0x3feff3830f9fe5ee, 0x3fac428cfdc5c35c),
+ (0x3fadd85a67cbe296, 0x3feff21614eca1d2, 0x3fadd406ed40d193),
+ (0x3faf6a7a3eff7872, 0x3feff09565793013, 0x3faf656e8f97f0f9),
+ (0x3fb07e4ceacc9a97, 0x3fefef01028ba740, 0x3fb07b6149c87151),
+ (0x3fb1475c76962b85, 0x3fefed58ed6a54be, 0x3fb14400e1aeeb44),
+ (0x3fb2106c98abe6cd, 0x3fefeb9d254b1740, 0x3fb20c96690fbe9f),
+ (0x3fb2d97c6bdba26e, 0x3fefe9cdad2f1061, 0x3fb2d5207f840524),
+ (0x3fb3a28c18e279a9, 0x3fefe7ea85e76da6, 0x3fb39d9ed203bab6),
+ (0x3fb46b9c42a3d8e4, 0x3fefe5f3af0a1548, 0x3fb4661187480b43),
+ (0x3fb534abcd8d6c89, 0x3fefe3e92c9764ff, 0x3fb52e7708fabcdf),
+ (0x3fb5fdbbf6d380b0, 0x3fefe1cafc99687d, 0x3fb5f6d017a62147),
+ (0x3fb6c6cbcc2f7248, 0x3fefdf9922e0f7ad, 0x3fb6bf1b46436fc7),
+ (0x3fb78fdb8cff3236, 0x3fefdd53a026e6fa, 0x3fb78758587024ce),
+ (0x3fb858eb79d953d8, 0x3fefdafa7513ab8e, 0x3fb84f8712f838d4),
+ (0x3fb921fb5df13728, 0x3fefd88da3b2cbcb, 0x3fb917a6c5cad0c3),
+ (0x3fb9eb0b15ad6e7f, 0x3fefd60d2df8efac, 0x3fb9dfb6d20cd89d),
+ (0x3fbab41b0d8e04c2, 0x3fefd3791414a510, 0x3fbaa7b72849583a),
+ (0x3fbb7d2b02ec004d, 0x3fefd0d1586ee673, 0x3fbb6fa70acd0c55),
+ (0x3fbc463a9c1cda98, 0x3fefce15fde7feac, 0x3fbc3785a525059d),
+ (0x3fbd0f4a92881eaf, 0x3fefcb4703aa4711, 0x3fbcff5334552718),
+ (0x3fbdd85a7ea6b644, 0x3fefc8646cd750e6, 0x3fbdc70ed631fba7),
+ (0x3fbea16a4da0e792, 0x3fefc56e3b81b243, 0x3fbe8eb7fcd470b2),
+ (0x3fbf6a7a2082960a, 0x3fefc26471042f37, 0x3fbf564e4de7cd23),
+ (0x3fc019c4f4362172, 0x3fefbf470f79208c, 0x3fc00ee89fc60798),
+ (0x3fc07e4cb4549583, 0x3fefbc1619c9367a, 0x3fc072a00d3f81a5),
+ (0x3fc0e2d4df1f9ef0, 0x3fefb8d18d51928e, 0x3fc0d64dbf2ea47f),
+ (0x3fc1475c06879885, 0x3fefb5797828e1da, 0x3fc139f00d3ac360),
+ (0x3fc1abe531137149, 0x3fefb20dc250d36d, 0x3fc19d89b968e759),
+ (0x3fc2106c256f2705, 0x3fefae8e92bf458c, 0x3fc20116570e32c3),
+ (0x3fc274f5eafb17bd, 0x3fefaafbbea74df5, 0x3fc2649aa381628e),
+ (0x3fc2d97c80aa0f5b, 0x3fefa7557efae43c, 0x3fc2c81070013fe8),
+ (0x3fc33e0462663cd9, 0x3fefa39bacdb1026, 0x3fc32b7bef4456d4),
+ (0x3fc3a28c534e14b9, 0x3fef9fce55ed894d, 0x3fc38edbaa59fe3f),
+ (0x3fc40714a18f49e2, 0x3fef9bed79763508, 0x3fc3f22fb1298233),
+ (0x3fc46b9c38ac6ee7, 0x3fef97f9249e436c, 0x3fc45576b5509b55),
+ (0x3fc4d02432e80a59, 0x3fef93f14ed442ec, 0x3fc4b8b1905fbd16),
+ (0x3fc534ac0298d5e2, 0x3fef8fd600323753, 0x3fc51bdf7942e899),
+ (0x3fc5993416dbe628, 0x3fef8ba736af1693, 0x3fc57f00a065f728),
+ (0x3fc5fdbbf1d87ff8, 0x3fef8764fa1887ba, 0x3fc5e2144c8984d0),
+ (0x3fc66243dc0ae9e8, 0x3fef830f4a092d65, 0x3fc6451a8808363c),
+ (0x3fc6c6cbc286e3f6, 0x3fef7ea629fb13d9, 0x3fc6a8130326d53f),
+ (0x3fc72b53b7c96348, 0x3fef7a299bd3df70, 0x3fc70afd9309c784),
+ (0x3fc78fdba21869ed, 0x3fef7599a37cdcb3, 0x3fc76dd9e15bdb4a),
+ (0x3fc7f4642a07f677, 0x3fef70f63bf58052, 0x3fc7d0a856c8ccd4),
+ (0x3fc858eb5b649af0, 0x3fef6c3f7f63a632, 0x3fc83366caea942f),
+ (0x3fc8bd736951ad00, 0x3fef6775566b1c57, 0x3fc896172a175d2d),
+ (0x3fc921fb399259ea, 0x3fef6297d144aa53, 0x3fc8f8b8223b223a),
+ (0x3fc986835ce1c644, 0x3fef5da6ebe94da7, 0x3fc95b4a04783401),
+ (0x3fc9eb0b24569abc, 0x3fef58a2b2008d0c, 0x3fc9bdcbe883cc5f),
+ (0x3fca4f9323461762, 0x3fef538b1f52fbe6, 0x3fca203e2200e122),
+ (0x3fcab41b09c5898b, 0x3fef4e603b080549, 0x3fca82a025ebcacb),
+ (0x3fcb18a2e6d4292f, 0x3fef492207941b17, 0x3fcae4f1c649efbf),
+ (0x3fcb7d2ae7b9bf9f, 0x3fef43d085cf0736, 0x3fcb4732f2b7a6f7),
+ (0x3fcbe1b2cb7b8481, 0x3fef3e6bbc6eba27, 0x3fcba9632f158ca6),
+ (0x3fcc463ab6f78496, 0x3fef38f3acd2bb22, 0x3fcc0b8262d9d9f9),
+ (0x3fccaac29faeedb5, 0x3fef33685aeb67ba, 0x3fcc6d90473e1ca0),
+ (0x3fcd0f4ab0d5a9e9, 0x3fef2dc9c7b77e6a, 0x3fcccf8cc9dfcdca),
+ (0x3fcd73d299801a67, 0x3fef2817fb345701, 0x3fcd31775f6e7021),
+ (0x3fcdd85a5f03c93d, 0x3fef2252f8ad886d, 0x3fcd934fd0c9eb90),
+ (0x3fce3ce2615c01b0, 0x3fef1c7abe28a12c, 0x3fcdf5163efb2fd2),
+ (0x3fcea16a34d49aca, 0x3fef168f557f8c38, 0x3fce56ca04ee54fd),
+ (0x3fcf05f2396af361, 0x3fef1090bcb17909, 0x3fceb86b43a82046),
+ (0x3fcf6a7a34c8a42b, 0x3fef0a7efae01f19, 0x3fcf19f9863cf10b),
+ (0x3fcfcf02154123ee, 0x3fef045a14e56969, 0x3fcf7b747f630b74),
+ (0x3fd019c51e84ab67, 0x3feefe22087e60d4, 0x3fcfdcdc522606ad),
+ (0x3fd04c08e989dfc7, 0x3feef7d6e70399a4, 0x3fd01f17f81c5502),
+ (0x3fd07e4ceb775576, 0x3feef178a4618400, 0x3fd04fb80a82fe62),
+ (0x3fd0b090f60ca615, 0x3feeeb074a3d9810, 0x3fd0804e15777f94),
+ (0x3fd0e2d4c54314ef, 0x3feee482e5663962, 0x3fd0b0d9b95007d9),
+ (0x3fd11518d2106a76, 0x3feeddeb6a30657d, 0x3fd0e15b4cec5711),
+ (0x3fd1475cc633879f, 0x3feed740e7e7b002, 0x3fd111d25f193a47),
+ (0x3fd179a0ce17673e, 0x3feed0835cc79369, 0x3fd1423efcc68ac8),
+ (0x3fd1abe4bace84c5, 0x3feec9b2d347a043, 0x3fd172a0dae268af),
+ (0x3fd1de28a262f5c2, 0x3feec2cf4cb15d4b, 0x3fd1a2f7f0d5a412),
+ (0x3fd2106cb1425d8c, 0x3feebbd8c71a89f1, 0x3fd1d3444b7da503),
+ (0x3fd242b0905f83c0, 0x3feeb4cf52e27d33, 0x3fd20385796d7260),
+ (0x3fd274f4947838ba, 0x3feeadb2e8897f5d, 0x3fd233bbae3e8ad0),
+ (0x3fd2a7386b632300, 0x3feea6839816a5b1, 0x3fd263e67d68a111),
+ (0x3fd2d97c9e0be433, 0x3fee9f41524c0663, 0x3fd294064c5a5940),
+ (0x3fd30bc07884186a, 0x3fee97ec359da93b, 0x3fd2c41a511fcae1),
+ (0x3fd33e046121eaf8, 0x3fee908437cd8074, 0x3fd2f422d00c7726),
+ (0x3fd3704855d0fd6b, 0x3fee89095dacc360, 0x3fd3241fa9798465),
+ (0x3fd3a28c5798cbdd, 0x3fee817baba342be, 0x3fd35410c0bfc504),
+ (0x3fd3d4d04553249a, 0x3fee79db2b58ee3d, 0x3fd383f5d8b1345d),
+ (0x3fd407145ea979a2, 0x3fee7227d7cc183e, 0x3fd3b3cf1062e88d),
+ (0x3fd43958386293b4, 0x3fee6a61c6350b11, 0x3fd3e39be4473723),
+ (0x3fd46b9c38cc0bc5, 0x3fee6288eb9aff42, 0x3fd4135c98341878),
+ (0x3fd49de029090fc7, 0x3fee5a9d555912a2, 0x3fd44310da8ddcf9),
+ (0x3fd4d024217e697d, 0x3fee529f047e7434, 0x3fd472b8a510e608),
+ (0x3fd50267f89f138c, 0x3fee4a8e04a2f573, 0x3fd4a253b2fc94cc),
+ (0x3fd534ac160681bb, 0x3fee426a4a0b5efa, 0x3fd4d1e249057326),
+ (0x3fd566eff496a5e0, 0x3fee3a33ef471d66, 0x3fd50163cbe183bf),
+ (0x3fd59933ebfe75ec, 0x3fee31eaeb28cfda, 0x3fd530d871303f3b),
+ (0x3fd5cb77fe42d304, 0x3fee298f425adc95, 0x3fd560401d7fa792),
+ (0x3fd5fdbbd011e31d, 0x3fee212109492bba, 0x3fd58f9a5d81663b),
+ (0x3fd62ffff34dbbed, 0x3fee18a02ca5e0f9, 0x3fd5bee79d6734e6),
+ (0x3fd66243bdc0c9c2, 0x3fee100cce7f06b6, 0x3fd5ee271fdac6fd),
+ (0x3fd69487ca38ebb6, 0x3fee0766d9c23a82, 0x3fd61d595943641b),
+ (0x3fd6c6cbc58a707e, 0x3fedfeae61f95db0, 0x3fd64c7dde58f888),
+ (0x3fd6f90fbf3b4b9a, 0x3fedf5e369de7ac8, 0x3fd67b94a09dba36),
+ (0x3fd72b53a84c8c13, 0x3feded05f987af46, 0x3fd6aa9d7500e60d),
+ (0x3fd75d97a7d7826a, 0x3fede4160f84614d, 0x3fd6d99863129ab7),
+ (0x3fd78fdba723c71a, 0x3feddb13b555c5d8, 0x3fd7088538928031),
+ (0x3fd7c21faf2481ab, 0x3fedd1feeeeb3f77, 0x3fd73763e0e5bb7e),
+ (0x3fd7f46380443ef6, 0x3fedc8d7cd74eb35, 0x3fd7663403ecedb8),
+ (0x3fd826a78fee86a2, 0x3fedbf9e41325ad2, 0x3fd794f5f21f812c),
+ (0x3fd858eb6fc6c00e, 0x3fedb652640d194e, 0x3fd7c3a927f6e7be),
+ (0x3fd88b2f6091a911, 0x3fedacf42fd7881c, 0x3fd7f24dc4deb30b),
+ (0x3fd8bd73747fa82a, 0x3feda383a6d8b415, 0x3fd820e3bcdb4516),
+ (0x3fd8efb76ad3bcc6, 0x3fed9a00db088521, 0x3fd84f6ab72e0455),
+ (0x3fd921fb52f99571, 0x3fed906bcf71ced7, 0x3fd87de2a57d3bef),
+ (0x3fd9543f4c295528, 0x3fed86c484089c2c, 0x3fd8ac4b87fa1376),
+ (0x3fd98683387c2ef2, 0x3fed7d0b047d2a2d, 0x3fd8daa526666236),
+ (0x3fd9b8c74c734cd7, 0x3fed733f4c983086, 0x3fd908ef9488c638),
+ (0x3fd9eb0b316e257b, 0x3fed6961734a4759, 0x3fd9372a66100418),
+ (0x3fda1d4f1c5b13c6, 0x3fed5f71745bef9a, 0x3fd96555af393979),
+ (0x3fda4f93134bbffd, 0x3fed556f54b18a49, 0x3fd99371591464cc),
+ (0x3fda81d7079868bb, 0x3fed4b5b1d5d78c0, 0x3fd9c17d39ba9f53),
+ (0x3fdab41b223bd70a, 0x3fed4134cc4c88b2, 0x3fd9ef795a3dd82c),
+ (0x3fdae65f0bd2deeb, 0x3fed36fc796c6d1f, 0x3fda1d654e53c0c4),
+ (0x3fdb18a2faa54a8e, 0x3fed2cb220194f43, 0x3fda4b412b549e3d),
+ (0x3fdb4ae6e27ee94a, 0x3fed2255c92cb15e, 0x3fda790cc9d54d8e),
+ (0x3fdb7d2afabcbd96, 0x3fed17e76f8b1bff, 0x3fdaa6c83ff29436),
+ (0x3fdbaf6edd63f9e6, 0x3fed0d672ed022eb, 0x3fdad47314b13a5e),
+ (0x3fdbe1b2ee09849b, 0x3fed02d4f8ac5a8e, 0x3fdb020d86625c4a),
+ (0x3fdc13f6d9f827b3, 0x3fecf830e504ee27, 0x3fdb2f972dd6dd44),
+ (0x3fdc463ac85c4fc4, 0x3feced7af23188c8, 0x3fdb5d101285fa87),
+ (0x3fdc787e99315d1d, 0x3fece2b32dae84df, 0x3fdb8a77fb79b6c9),
+ (0x3fdcaac2c391b562, 0x3fecd7d984771033, 0x3fdbb7cf38286324),
+ (0x3fdcdd06bf1547a8, 0x3fecccee1a978b83, 0x3fdbe515317a2767),
+ (0x3fdd0f4a93c25d1a, 0x3fecc1f0f53b85ed, 0x3fdc1249d2e7d3d7),
+ (0x3fdd418e7cb2c1ac, 0x3fecb6e20e487888, 0x3fdc3f6d35bf5483),
+ (0x3fdd73d292000a16, 0x3fecabc16721278b, 0x3fdc6c7f53ab8367),
+ (0x3fdda616817e6870, 0x3feca08f18d00ef9, 0x3fdc997fc72e0f8c),
+ (0x3fddd85a5a1d76bb, 0x3fec954b270990c5, 0x3fdcc66e8203a2b0),
+ (0x3fde0a9e6cc874c6, 0x3fec89f5868b6c72, 0x3fdcf34bb0b7c500),
+ (0x3fde3ce26dc62595, 0x3fec7e8e4f51729a, 0x3fdd2016f3f2781b),
+ (0x3fde6f264279a0bc, 0x3fec73158e8e71da, 0x3fdd4cd0187c011d),
+ (0x3fdea16a5680fb27, 0x3fec678b32bc65ff, 0x3fdd797762744bde),
+ (0x3fded3ae3d42764c, 0x3fec5bef5bdf2d0e, 0x3fdda60c55ccc8d6),
+ (0x3fdf05f24a4f648b, 0x3fec5041fdd6d9d1, 0x3fddd28f21277b30),
+ (0x3fdf383651e91662, 0x3fec448329efd6a1, 0x3fddfeff8240fbd5),
+ (0x3fdf6a7a42ac94c9, 0x3fec38b2eb87ae38, 0x3fde2b5d4e604dad),
+ (0x3fdf9cbe1d3b3429, 0x3fec2cd149d960c0, 0x3fde57a86acebf99),
+ (0x3fdfcf020d397ddb, 0x3fec20de41e8a738, 0x3fde83e0e2af7102),
+ (0x3fe000a313205931, 0x3fec14d9d64b5fbe, 0x3fdeb006abd63bf5),
+ (0x3fe019c4f9037b82, 0x3fec08c42ac58524, 0x3fdedc194338385b),
+ (0x3fe032e70860dbf0, 0x3febfc9d20441910, 0x3fdf08191a1b3b78),
+ (0x3fe04c0906e9f028, 0x3febf064da5e176a, 0x3fdf3405af2bebc0),
+ (0x3fe0652b03fdfd24, 0x3febe41b5936a8f3, 0x3fdf5fdf024485e7),
+ (0x3fe07e4d0b91a0db, 0x3febd7c09e80889b, 0x3fdf8ba50d2a0c48),
+ (0x3fe0976ef1f9c5ae, 0x3febcb54c769186a, 0x3fdfb75768e7fa9b),
+ (0x3fe0b090e7d18f51, 0x3febbed7c3a63454, 0x3fdfe2f64f20fede),
+ (0x3fe0c9b2e9a39b2d, 0x3febb2499c87d6e9, 0x3fe00740cf66097a),
+ (0x3fe0e2d4ddcaea14, 0x3feba5aa669df22e, 0x3fe01cfc88506502),
+ (0x3fe0fbf6e1078c8e, 0x3feb98fa1b3ff547, 0x3fe032ae5dc3499b),
+ (0x3fe11518d98a4a52, 0x3feb8c38cf44e150, 0x3fe048562c12ec1d),
+ (0x3fe12e3add8e0dcc, 0x3feb7f667f41f3ed, 0x3fe05df3f90aa7ce),
+ (0x3fe1475ce227685c, 0x3feb728338a5b7b8, 0x3fe07387ade96093),
+ (0x3fe1607ec66ef221, 0x3feb658f1462d09f, 0x3fe0891121335eab),
+ (0x3fe179a0be834cc7, 0x3feb5889ffa66df4, 0x3fe09e9072510f34),
+ (0x3fe192c2b8568d3a, 0x3feb4b740bbd2fb5, 0x3fe0b405848141dd),
+ (0x3fe1abe4b59244a1, 0x3feb3e4d3fd6bd5a, 0x3fe0c9704befbcad),
+ (0x3fe1c506b2c3bd2e, 0x3feb3115a5da6c4f, 0x3fe0ded0b8742978),
+ (0x3fe1de289173a977, 0x3feb23cd5613980e, 0x3fe0f426a30829d9),
+ (0x3fe1f74a8f60e6e9, 0x3feb167438110e6e, 0x3fe1097232ed0851),
+ (0x3fe2106ca03e5fd0, 0x3feb090a5a650d01, 0x3fe11eb350745afe),
+ (0x3fe2298e9db917c4, 0x3feafb8fd9ca55cd, 0x3fe133e9ce192ca0),
+ (0x3fe242b08f9ff90f, 0x3feaee04ba8026f6, 0x3fe14915a5706056),
+ (0x3fe25bd28f640cf6, 0x3feae068f72931d9, 0x3fe15e36ded7bb02),
+ (0x3fe274f47c0699ab, 0x3fead2bcaa0d2e11, 0x3fe1734d518c9922),
+ (0x3fe28e1685bcfd71, 0x3feac4ffc15749f9, 0x3fe1885918f9ac8e),
+ (0x3fe2a736fe1e1494, 0x3feab7333233d9b8, 0x3fe19d58c0a87f59),
+ (0x3fe2c05a9f4eb35a, 0x3feaa9546b006d3c, 0x3fe1b2502def806a),
+ (0x3fe2d97c6af42411, 0x3fea9b66344e259f, 0x3fe1c73b28d8e8d8),
+ (0x3fe2f29e6e1ad75e, 0x3fea8d6775437b8d, 0x3fe1dc1b5a8d2e11),
+ (0x3fe30bc06e6776fd, 0x3fea7f5856cdc155, 0x3fe1f0f0859072c6),
+ (0x3fe324e1d749c5d3, 0x3fea7139354a6417, 0x3fe205ba2249d8d8),
+ (0x3fe33e045ca06605, 0x3fea63092400435f, 0x3fe21a798c19a7e7),
+ (0x3fe357267800e833, 0x3fea54c9076262f5, 0x3fe22f2d7377bfc7),
+ (0x3fe370485e5be5d6, 0x3fea4678cad14810, 0x3fe243d5f7a46407),
+ (0x3fe3896a4d91d9d2, 0x3fea3818540e8f5e, 0x3fe258733edb7371),
+ (0x3fe3a28c599d4a38, 0x3fea29a7a0666245, 0x3fe26d054caf4fb0),
+ (0x3fe3bbae6b7bfd61, 0x3fea1b26c5d7e6f3, 0x3fe2818c01832d38),
+ (0x3fe3d4d03ed37a30, 0x3fea0c95f4fd994c, 0x3fe29607190147e2),
+ (0x3fe3edf268105227, 0x3fe9fdf4e0b7d0c8, 0x3fe2aa76ff6bb39c),
+ (0x3fe4071445cf2ff6, 0x3fe9ef43eff2b264, 0x3fe2bedb24e4da22),
+ (0x3fe420363dd59357, 0x3fe9e082f06f9ebe, 0x3fe2d333cf8d19db),
+ (0x3fe4395843ba2fd5, 0x3fe9d1b1f26b4f3c, 0x3fe2e780e8af8d40),
+ (0x3fe4527a412b9192, 0x3fe9c2d10c2c8325, 0x3fe2fbc251bb901b),
+ (0x3fe46b9c27a00cd7, 0x3fe9b3e04f99cbd5, 0x3fe30ff7f2910508),
+ (0x3fe484be309a8d17, 0x3fe9a4dfa3af458c, 0x3fe32421ecef726e),
+ (0x3fe49de033475607, 0x3fe995cf29f2517a, 0x3fe33840139184b1),
+ (0x3fe4b7021f53f06e, 0x3fe986aef5919af2, 0x3fe34c524d121d49),
+ (0x3fe4d0240f052501, 0x3fe9777f00244ce3, 0x3fe36058a217b268),
+ (0x3fe4e9463716a53c, 0x3fe9683f32f2ae77, 0x3fe3745330215f46),
+ (0x3fe502681e9a6016, 0x3fe958efe0cb9eeb, 0x3fe388418ac15fff),
+ (0x3fe51b8a16255181, 0x3fe94990e237aa2e, 0x3fe39c23e5b6b244),
+ (0x3fe534ac09d21709, 0x3fe93a224cd1d5d1, 0x3fe3affa24f6eadd),
+ (0x3fe54dcdf3b3627e, 0x3fe92aa42dcf60f8, 0x3fe3c3c437a1dace),
+ (0x3fe566effef25aad, 0x3fe91b16740dcf8f, 0x3fe3d782336d7ad4),
+ (0x3fe5801215d3e4d3, 0x3fe90b79366e5e53, 0x3fe3eb33faf99080),
+ (0x3fe59933ff695620, 0x3fe8fbcca2107806, 0x3fe3fed9559bf7b0),
+ (0x3fe5b255ee2800cd, 0x3fe8ec10a14c3bbf, 0x3fe412725ec52f70),
+ (0x3fe5cb77fdabb08b, 0x3fe8dc452c6aa905, 0x3fe425ff1fc9c896),
+ (0x3fe5e499ef7e111b, 0x3fe8cc6a746824f6, 0x3fe4397f5c023a11),
+ (0x3fe5fdbbe1bc4e34, 0x3fe8bc807027ea41, 0x3fe44cf31eda7ea0),
+ (0x3fe616ddfb59ae0d, 0x3fe8ac8710ace7bf, 0x3fe4605a7a5aaeff),
+ (0x3fe62fffdd1c090a, 0x3fe89c7e9c66cac4, 0x3fe473b51912497a),
+ (0x3fe64921cfa4dad5, 0x3fe88c66ef074f3a, 0x3fe48703271ce28c),
+ (0x3fe66243d696fbd4, 0x3fe87c401005fc0b, 0x3fe49a449b40f2d3),
+ (0x3fe67b65d9d2cf92, 0x3fe86c0a18cb21f8, 0x3fe4ad795715a3e0),
+ (0x3fe69487ba5a4421, 0x3fe85bc52776b9fc, 0x3fe4c0a1373040c0),
+ (0x3fe6ada9c7509d39, 0x3fe84b7112ce79ff, 0x3fe4d3bc6c09e271),
+ (0x3fe6c6cbcacfb7d4, 0x3fe83b0e07c9e659, 0x3fe4e6cac0c53e06),
+ (0x3fe6dfedb933bfad, 0x3fe82a9c1836e690, 0x3fe4f9cc20e5450e),
+ (0x3fe6f90fb67efcd3, 0x3fe81a1b36b01830, 0x3fe50cc09bf06f81),
+ (0x3fe71231b7343738, 0x3fe8098b74dea97c, 0x3fe51fa81d7d14ee),
+ (0x3fe72b53a85aa5f7, 0x3fe7f8ece98512ef, 0x3fe532828ba63ede),
+ (0x3fe74475af974694, 0x3fe7e83f85f965d1, 0x3fe5454ff702d29b),
+ (0x3fe75d9799aac134, 0x3fe7d783768ce953, 0x3fe558102da86994),
+ (0x3fe776b9903c81f2, 0x3fe7c6b8a9e499af, 0x3fe56ac34326d2d0),
+ (0x3fe78fdba082343f, 0x3fe7b5df2167453d, 0x3fe57d6935ab2542),
+ (0x3fe7a8fdabde74d9, 0x3fe7a4f6fbedea5e, 0x3fe59001e2ecf285),
+ (0x3fe7c21f8108af2e, 0x3fe794006540f1ec, 0x3fe5a28d1b2b3ab1),
+ (0x3fe7db4178eb2ca1, 0x3fe782fb2be4619a, 0x3fe5b50b14a054d2),
+ (0x3fe7f4637bfce457, 0x3fe771e76a206b4b, 0x3fe5c77bb26e7244),
+ (0x3fe80d856a5e5235, 0x3fe760c5402e27c0, 0x3fe5d9ded1dab0d8),
+ (0x3fe826a77aed6885, 0x3fe74f94932c2aa4, 0x3fe5ec348fa6b9f0),
+ (0x3fe83fc98cbf9b15, 0x3fe73e55841a0699, 0x3fe5fe7cc8635d47),
+ (0x3fe858eb8885faf1, 0x3fe72d082dab83d0, 0x3fe610b75fe5f52f),
+ (0x3fe8720d606e4a1c, 0x3fe71baca44228b1, 0x3fe622e441189d58),
+ (0x3fe88b2f5ac151d8, 0x3fe70a42c20978bf, 0x3fe63503939a8c88),
+ (0x3fe8a4516df43f11, 0x3fe6f8ca982975b6, 0x3fe64715452c2eb4),
+ (0x3fe8bd735c5fca22, 0x3fe6e7445c4d6347, 0x3fe659191e5ec1a0),
+ (0x3fe8d69563ed711a, 0x3fe6d5afee23005e, 0x3fe66b0f407fe153),
+ (0x3fe8efb7598d0461, 0x3fe6c40d769b68c4, 0x3fe67cf781aec585),
+ (0x3fe908d9720f2113, 0x3fe6b25cdb7a4ca4, 0x3fe68ed1fc7319a2),
+ (0x3fe921fb5b6a3d21, 0x3fe6a09e61712f13, 0x3fe6a09e6b8d4885),
+]
+
+/*
+ The huge reduction tables. R[j] = (l1, l2, l3), such that l1+l2+l3
+ is less than 2pi, and 2^(50j + 25) is approximately l1+l2+l3
+ (mod 2pi).
+
+ The stepping of 50 was chosen because it is a nice, round number,
+ close to 52. [GB91] make reference to a difficult-to-reduce
+ number, Xhard, which is in the range (-2^27, 2^27). By using an
+ offset of 25, we can ensure that huge_reduce returns results in
+ about that range. This allows us to reuse calculations in [GB91]
+ showing we have probably stored enough bits of pi (and of these
+ numbers).
+
+ TODO: There is always the chance that, for some x = x1*2^j +
+ x2*2^k, there is catastrophic cancellation which makes the
+ remainder sum x1*r[50j+25] + x2*r[50k+25] imprecise. That needs
+ to be checked; it shouldn't be too hard.
+ */
+const R : (uint64, uint64, uint64)[20] = [
+ (0x4011fb222e13e839, 0xbcbeecfb7d19df11, 0x3955f9708d867b5b), /* 2^25 mod 2pi */
+ (0x3ffee6639887604d, 0x3c88b437ad3f55e0, 0x39218522303457a8), /* 2^75 mod 2pi */
+ (0x3fc589bfb31f1687, 0x3c31994c1edb7977, 0xb8dd15afd80892a0), /* 2^125 mod 2pi */
+ (0x400663e27d2e0b47, 0xbca00a74acd21bf1, 0x39257592b34b8c25), /* 2^175 mod 2pi */
+ (0x40177ff13d560e79, 0xbc8403f354e865f6, 0x38e6e69984c4338e), /* 2^225 mod 2pi */
+ (0x4015a4a4671a3606, 0x3ca6fc3b1a2bddd5, 0xb9270923530d0279), /* 2^275 mod 2pi */
+ (0x3ff3bf65f73a0474, 0x3c67deda97eb4131, 0x38fa32118f8f578f), /* 2^325 mod 2pi */
+ (0x3ffa8e506685f311, 0x3c698a6391d9d31b, 0x38db58d212d28d0a), /* 2^375 mod 2pi */
+ (0x400d38d7ecc58385, 0x3c98b076242f0ed3, 0x38e04e83f1274a16), /* 2^425 mod 2pi */
+ (0x4017023cb671ed43, 0xbcbeb418af32f189, 0xb944d3aea8efaa5f), /* 2^475 mod 2pi */
+ (0x401655d13f672fe9, 0x3ca8e56fc8ad533e, 0x393d464a045591bf), /* 2^525 mod 2pi */
+ (0x3feec8dd916fa3b6, 0x3c82c834374ed2af, 0x39225586c285a0d8), /* 2^575 mod 2pi */
+ (0x400ed2900b47ba63, 0x3c697a0d5776cc4d, 0x390c831068ee85b1), /* 2^625 mod 2pi */
+ (0x4008563b8e99caac, 0xbc5ee03ee870ab9d, 0x38fb980dd9756063), /* 2^675 mod 2pi */
+ (0x4012d961cf7cd57d, 0xbcbdde0a1d27628e, 0xb95a8840599f8246), /* 2^725 mod 2pi */
+ (0x4016919aac4a18f4, 0xbc93552937a28b88, 0xb9145e70b28c0cbb), /* 2^775 mod 2pi */
+ (0x400c37d195196372, 0xbca948065398479d, 0xb94cadd4f6e80c1b), /* 2^825 mod 2pi */
+ (0x400d77a34a63ebae, 0x3ca8ca5d7ccc2874, 0xb91611b1e4b65369), /* 2^875 mod 2pi */
+ (0x3ff4cb08f62cb38b, 0xbc90f42df8fc967a, 0xb8ed7d71202d2f45), /* 2^925 mod 2pi */
+ (0x401848860f8742d6, 0x3c90337738c287b4, 0xb92f24fc614ec2f7), /* 2^975 mod 2pi */
+]
+
+const sin32 = {x : flt32
+ var r = sin64((x : flt64))
+ -> (r : flt32)
+}
+
+const sin64 = {x : flt64
+ var s
+ (s, _) = w64(x, true, false)
+ -> s
+}
+
+const cos32 = {x : flt32
+ var r : flt64 = cos64((x : flt64))
+ -> (r : flt32)
+}
+
+const cos64 = {x : flt64
+ var c
+ (_, c) = w64(x, false, true)
+ -> c
+}
+
+const sincos32 = {x : flt32
+ var r1 : flt64
+ var r2 : flt64
+ (r1, r2) = sincos64((x : flt64))
+ -> ((r1 : flt32), (r2 : flt32))
+}
+
+const sincos64 = {x : flt64
+ -> w64(x, true, true)
+}
+
+/* Calculate sin and/or cos */
+const w64 = {x : flt64, want_sin : bool, want_cos : bool
+ var sin_ret : flt64 = 0.0
+ var cos_ret : flt64 = 0.0
+
+ var e : int64
+ (_, e, _) = std.flt64explode(x)
+
+ if e == 1024
+ -> (std.flt64nan(), std.flt64nan())
+ ;;
+
+ var N : int64
+ var x1 : flt64, x2 : flt64, x3 : flt64
+ (N, x1, x2, x3) = reduce(x)
+
+ /* Handle multiples of pi/2 */
+ var swap_sin_cos : bool = false
+ var then_negate_sin : bool = false
+ var then_negate_cos : bool = false
+ N = ((N % 4) + 4) % 4
+ match N
+ | 1:
+ /* sin(x + pi/2) = cos(x), cos(x + pi/2) = -sin(x) */
+ swap_sin_cos = true
+ then_negate_cos = true
+ | 2:
+ /* sin(x + pi) = -sin(x), cos(x + pi) = -cos(x) */
+ then_negate_cos = true
+ then_negate_sin = true
+ | 3:
+ /* sin(x + 3pi/2) = -cos(x), cos(x + 3pi/2) = sin(x) */
+ swap_sin_cos = true
+ then_negate_sin = true
+ | _:
+ ;;
+
+ if x1 < 0.0
+ x1 = -x1
+ x2 = -x2
+ x3 = -x3
+ N = -N
+ ;;
+
+ /* Figure out where in the C table x lies */
+ var j = (rn(x1 * std.flt64frombits(oneohtwofour_over_pi)) : uint64)
+ if j < 0
+ j = 0
+ elif j > 256
+ j = 256
+ ;;
+
+ var xi, sin_xi, cos_xi, sin_delta, cos_delta
+ (xi, sin_xi, cos_xi) = C[j]
+
+ /*
+ Compute x - xi with compensated summation. Because xi
+ and delta both lie in the same interval of width (pi/4)/256,
+ which is less than 1/256, we can use that |delta1| <
+ 2^-8: we need a polynomial approximation of at least
+ degree 7.
+
+ This also gives that |delta2| < 2^(-60), vanishing quickly
+ in the polynomial approximations.
+ */
+ /* TODO: inline this, use double-compensated */
+ var delta1, delta2
+ (delta1, delta2, _) = triple_compensate_sum([-std.flt64frombits(xi), x1, x2, x3][:])
+
+ /*
+ sin(delta); the degree 2 coefficient is near 0, so delta_2
+ only shows up in deg 1
+ */
+ sin_delta = horner_polyu(delta1, sin_coeff[:]) + delta2 * std.flt64frombits(sin_coeff[1])
+
+ /*
+ cos(delta); delta_2 shows up in deg 1 and 2; the term
+ we care about is a1*d2 + 2*a2*d1*d2
+ */
+ cos_delta = horner_polyu(delta1, cos_coeff[:])
+ cos_delta += delta2 * fma64(delta1, 2.0*std.flt64frombits(cos_coeff[2]), std.flt64frombits(cos_coeff[1]))
+
+ var q : flt64[4]
+ if (want_sin && !swap_sin_cos) || (want_cos && swap_sin_cos)
+ (q[0], q[1]) = two_by_two(std.flt64frombits(sin_xi), cos_delta)
+ (q[2], q[3]) = two_by_two(std.flt64frombits(cos_xi), sin_delta)
+ std.sort(q[:], fltabscmp)
+
+ /* TODO: use double-compensation instead */
+ (sin_ret, _, _) = triple_compensate_sum(q[:])
+ ;;
+ if (want_cos && !swap_sin_cos) || (want_sin && swap_sin_cos)
+ (q[0], q[1]) = two_by_two(std.flt64frombits(cos_xi), cos_delta)
+ (q[2], q[3]) = two_by_two(std.flt64frombits(sin_xi), sin_delta)
+ q[2] = -q[2]
+ q[3] = -q[3]
+ std.sort(q[:], fltabscmp)
+
+ /* TODO: use double-compensation instead */
+ (cos_ret, _, _) = triple_compensate_sum(q[:])
+ ;;
+
+ if swap_sin_cos
+ (sin_ret, cos_ret) = (cos_ret, sin_ret)
+ ;;
+
+ if then_negate_sin
+ sin_ret = -sin_ret
+ ;;
+
+ if then_negate_cos
+ cos_ret = -cos_ret
+ ;;
+
+ -> (sin_ret, cos_ret)
+}
+
+/* Reduce x to N*(pi/4) + x', with x' in [-pi/4, pi/4] */
+const reduce = {x : flt64
+ var N : int64 = 0
+ var Nf : flt64
+
+ /*
+ We actually only care about N's parity. If x is very
+ large, the ultimate N that we end up using might not be
+ representable (either as an int64 or flt64), so we instead
+ just keep track of the parity exactly.
+ */
+
+ /*
+ If we want to store pi in a form to properly reduce
+ 2^1000 or so, it turns out that there's some complication
+ with the final digits: they trail off beyond subnormality
+ and can't be represented, even though they are the digits
+ we need for the reduction. Therefore, for extremely high
+ values of x, we pre-compute the reduction and return it
+ here.
+
+ We get: x1 about in range 2^25, and that (x1 + x2 + x3)
+ approximates x (mod 2pi) very well. This is good enough
+ to ensure that N = x/pi is representable (int64 and
+ flt64). We do need to worry about catastrophic cancellation,
+ however.
+ */
+ var x1, x2, x3
+ var q : flt64[12]
+ (x1, x2, x3) = huge_reduce_2pi(x)
+
+ var pi_o_4_0 : flt64 = std.flt64frombits(pi_over_4[0])
+ var pi_o_4_1 : flt64 = std.flt64frombits(pi_over_4[1])
+ var pi_o_4_2 : flt64 = std.flt64frombits(pi_over_4[2])
+ var pi_o_4_3 : flt64 = std.flt64frombits(pi_over_4[3])
+
+ while true
+ N = rn(x1 * std.flt64frombits(four_over_pi))
+ Nf = (-N : flt64)
+ (q[0], q[1], q[2]) = (x1, x2, x3)
+ (q[3], q[ 4]) = two_by_two(Nf, pi_o_4_0)
+ (q[5], q[ 6]) = two_by_two(Nf, pi_o_4_1)
+ (q[7], q[ 8]) = two_by_two(Nf, pi_o_4_2)
+ (q[9], q[10]) = two_by_two(Nf, pi_o_4_3)
+ (x1, x2, x3) = triple_compensate_sum(q[:])
+
+ if le_33(x1, x2, x3, pi_o_4_0, pi_o_4_1, pi_o_4_2) && \
+ le_33(-pi_o_4_0, -pi_o_4_1, -pi_o_4_2, x1, x2, x3)
+ break
+ ;;
+ ;;
+
+ -> (N, x1, x2, x3)
+}
+
+
+const huge_reduce_2pi = {x : flt64
+ var e : int64
+ var b : uint64 = std.flt64bits(x)
+ (_, e, _) = std.flt64explode(x)
+
+ if e < 25
+ -> (x, 0.0, 0.0)
+ ;;
+
+ /*
+ Since the stepping of R is 50, and x has 53 significant
+ bits, we get a splitting of x into two components. We
+ want
+
+ x = [ xa * 2^(50j + 25) ] + [ xb * 2^(50(j-1) + 25) ]
+
+ and {ai}, {bi} such that
+
+ a1 + a2 + a3 === ( 2^(50j + 25) ) mod 2pi
+
+ b1 + b2 + b3 === ( 2^(50(j-1) + 25) ) mod 2pi
+
+ If j is small enough, we can slack off a bit. We really
+ just want xa(a1+a2+a3) + xb(b1+b2+b3) === x (mod 2pi)
+ with a heck of a lot of precision.
+ */
+ var j : uint64 = (e - 25 : uint64) / 50
+ var xa : flt64, xb : flt64, xc : flt64
+ var a1 : flt64, a2 : flt64, a3 : flt64
+ var b1 : flt64, b2 : flt64, b3 : flt64
+ var c1 : flt64, c2 : flt64, c3 : flt64
+ var u1 : uint64, u2 : uint64, u3 : uint64
+ if j == 0
+ (u1, u2, u3) = R[0]
+ a1 = std.flt64frombits(u1)
+ a2 = std.flt64frombits(u2)
+ a3 = std.flt64frombits(u3)
+ var xhi = std.flt64frombits(b & ((~0) << (52 + (25 - e : uint64))))
+ xa = scale2(xhi, -25)
+
+ (b1, b2, b3) = (x - xhi, 0.0, 0.0)
+ xb = 1.0
+ else
+ (u1, u2, u3) = R[j]
+ a1 = std.flt64frombits(u1)
+ a2 = std.flt64frombits(u2)
+ a3 = std.flt64frombits(u3)
+ var e1 : int64 = 50*(j : int64) + 25
+ var xhi = std.flt64frombits(b & ((~0) << (52 + (e1 - e : uint64))))
+ xa = scale2(xhi, -e1)
+
+ /* j > 0 in this branch */
+ (u1, u2, u3) = R[j - 1]
+ b1 = std.flt64frombits(u1)
+ b2 = std.flt64frombits(u2)
+ b3 = std.flt64frombits(u3)
+ var e2 : int64 = 50*(j - 1 : int64) + 25
+ var xmi = std.flt64frombits(b & ((~0) << (52 + (e2 - e : uint64))))
+ xb = scale2(xmi, -e2)
+
+ if j == 1
+ (c1, c2, c3) = (0.0, 0.0, 0.0)
+ xc = 0.0
+ else
+ (u1, u2, u3) = R[j - 2]
+ b1 = std.flt64frombits(u1)
+ b2 = std.flt64frombits(u2)
+ b3 = std.flt64frombits(u3)
+ var e3 : int64 = 50*(j - 2 : int64) + 25
+ var xlo = std.flt64frombits(b & ((~0) << (52 + (e3 - e : uint64))))
+ xc = scale2(xlo, -e3)
+ ;;
+ ;;
+
+
+ /*
+ Now we need to combine all this. Even worse, when we
+ multiply the two together, we need to keep full precision
+ (at least for the high-ish bits), so we need some help.
+
+ TODO: The c-type calculations can probably be optimized,
+ since xc is so small.
+ */
+ var q : flt64[18]
+ (q[ 0], q[ 1]) = two_by_two(xa, a1)
+ (q[ 2], q[ 3]) = two_by_two(xa, a2)
+ (q[ 4], q[ 5]) = two_by_two(xa, a3)
+ (q[ 6], q[ 7]) = two_by_two(xb, b1)
+ (q[ 8], q[ 9]) = two_by_two(xb, b2)
+ (q[10], q[11]) = two_by_two(xb, b3)
+ (q[12], q[13]) = two_by_two(xc, c1)
+ (q[14], q[15]) = two_by_two(xc, c2)
+ (q[16], q[17]) = two_by_two(xc, c3)
+
+ -> triple_compensate_sum(q[:])
+}
+
+/* TODO: move to sum-impl.myr, figure out a good API to allow optional sorting */
+const triple_compensate_sum = {q : flt64[:]
+ /* TODO: verify, with GAPPA or something, that this is correct. */
+ std.sort(q, fltabscmp)
+ var s1 : flt64, s2 : flt64, s3
+ var t1 : flt64, t2 : flt64, t3 : flt64, t4 : flt64, t5 : flt64, t6
+ s1 = q[0]
+ s2 = 0.0
+ s3 = 0.0
+ for qq : q[1:]
+ (t5, t6) = fast2sum(s3, qq)
+ (t3, t4) = fast2sum(s2, t5)
+ (t1, t2) = fast2sum(s1, t3)
+ s1 = t1
+ (s2, s3) = fast2sum(t2, t4 + t6)
+ ;;
+
+ -> (s1, s2, s3)
+}
+
+const fltabscmp = {x : flt64, y : flt64
+ var xb = std.flt64bits(x) & ~(1 << 63)
+ var yb = std.flt64bits(y) & ~(1 << 63)
+ if xb == yb
+ -> `std.Equal
+ elif xb > yb
+ -> `std.Before
+ else
+ -> `std.After
+ ;;
+}
+
+/*
+ Perform high-prec multiplication: x * y = z1 + z2.
+ */
+const two_by_two = {x : flt64, y : flt64
+ var xh : flt64 = std.flt64frombits(std.flt64bits(x) & twentysix_bits_mask)
+ var xl : flt64 = x - xh
+ var yh : flt64 = std.flt64frombits(std.flt64bits(y) & twentysix_bits_mask)
+ var yl : flt64 = y - yh
+
+ /* Multiply out */
+ var a1 : flt64 = xh * yh
+ var a2 : flt64 = xh * yl
+ var a3 : flt64 = yl * yh
+ var a4 : flt64 = xl * yl
+
+ /* By-hand compensated summation */
+ /* TODO: convert to normal compensated summation, move to sum-impl.myr */
+ var yy, u, t, v, z, s, c
+ if a2 < a3
+ (a3, a2) = (a2, a3)
+ ;;
+
+ s = a1
+ c = 0.0
+
+ /* a2 */
+ (s, c) = fast2sum(s, a2)
+
+ /* a3 */
+ (yy, u) = fast2sum(c, a3)
+ (t, v) = fast2sum(s, yy)
+ z = u + v
+ (s, c) = fast2sum(t, z)
+
+ /* a4 */
+ (yy, u) = fast2sum(c, a4)
+ (t, v) = fast2sum(s, yy)
+ z = u + v
+ (s, c) = fast2sum(t, z)
+
+ -> (s, c)
+}
+
+/*
+ Return true iff (a1 + a2 + a3) <= (b1 + b2 + b3)
+ */
+const le_33 = { a1, a2, a3, b1, b2, b3
+ if a1 < b1
+ -> true
+ elif a1 > b1
+ -> false
+ ;;
+
+ if a2 < b2
+ -> true
+ elif a2 > b2
+ -> false
+ ;;
+
+ if a3 > b3
+ -> false
+ ;;
+
+ -> true
+}
+
+/* Return (s, t) such that s + t = a + b, with s = rn(a + b). */
+const fast2sum = {a : flt64, b : flt64
+ var s : flt64 = a + b
+ var z : flt64 = s - a
+ var t : flt64 = b - z
+ -> (s, t)
+
+}