diff options
author | S. Gilles <sgilles@math.umd.edu> | 2019-06-09 13:47:22 -0400 |
---|---|---|
committer | S. Gilles <sgilles@math.umd.edu> | 2019-06-09 13:53:43 -0400 |
commit | e09c6b53f3b1928d3752c53c813025dbcbb976e0 (patch) | |
tree | c87d2a23356b47954f15d8625d814228eb31f977 | |
parent | d52fd3b32109f12e529a7f5e380c5ee8f06aefcf (diff) | |
download | mc-e09c6b53f3b1928d3752c53c813025dbcbb976e0.tar.gz |
Use fma instead of rote multiplication in powr's final calculation.
Since the exp() we use is sometimes wrong by 1 ulp, the final
multiplication has a possibility of increasing that to 2 ulps. Using fma
instead makes such a thing much less likely.
This makes some of our off-by-0s off-by-1s, and vice versa, but on the
whole it is a definite improvement.
-rw-r--r-- | lib/math/powr-impl.myr | 6 | ||||
-rw-r--r-- | lib/math/test/powr-impl.myr | 24 |
2 files changed, 17 insertions, 13 deletions
diff --git a/lib/math/powr-impl.myr b/lib/math/powr-impl.myr index 32ee678..dadd23b 100644 --- a/lib/math/powr-impl.myr +++ b/lib/math/powr-impl.myr @@ -1,6 +1,7 @@ use std use "fpmath" +use "impls" use "log-impl" use "log-overkill" use "util" @@ -34,6 +35,7 @@ type fltdesc(@f, @u, @i) = struct emin : @i sgnmask : @u log_overkill : (x : @f -> (@f, @f)) + fma : (x : @f, y : @f, z : @f -> @f) split_mul : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f)) exp_zero_border : @u ;; @@ -51,6 +53,7 @@ const desc32 : fltdesc(flt32, uint32, int32) = [ .emin = -126, .sgnmask = 1 << 31, .log_overkill = logoverkill32, + .fma = fma32, .split_mul = split_mul32, .exp_zero_border = 0xc2cff1b4, /* minimal y such that e^y > 0 */ ] @@ -68,6 +71,7 @@ const desc64 : fltdesc(flt64, uint64, int64) = [ .emin = -1022, .sgnmask = 1 << 63, .log_overkill = logoverkill64, + .fma = fma64, .split_mul = hl_mult, .exp_zero_border = 0xc0874910d52d3052, /* minimal y such that e^y > 0 */ ] @@ -198,5 +202,5 @@ generic powrgen = {x : @f, y : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,s -> z_hi ;; - -> z_hi * (1.0 + final_exp_lo) + -> d.fma(z_hi, final_exp_lo, z_hi) } diff --git a/lib/math/test/powr-impl.myr b/lib/math/test/powr-impl.myr index 643af42..fb54d84 100644 --- a/lib/math/test/powr-impl.myr +++ b/lib/math/test/powr-impl.myr @@ -50,6 +50,13 @@ const powr02 = {c (0x000342cdeeb18fc9, 0xbe645d513ed4670d, 0x3ff0001c3d5eaa62), (0x3e086c8c9160ccfe, 0xc027b2f7021e0508, 0x567134b75886e1ce), (0x3d799a9014c5c710, 0xc0294dc8ea46772e, 0x5f068df47efc8583), + (0x3fca93f7f9ca25b6, 0x3ff5847df0338da2, 0x3fbee98550a085a0), + (0x3fabb5869cd3c59e, 0x3ff36f822e577f69, 0x3f9da0409e2f391c), + (0x3f77f7f7131769cb, 0x3feaf3f718dd7ebe, 0x3f8af6517cbfdc36), + (0x3fcac3d4060de9e2, 0x4005f5e897c9aeb8, 0x3f8be74e7fa8bdd6), + (0x406f0b978c302c5e, 0x4035705781be3d35, 0x4a97d0db24be1855), + (0x40644d87e8676e6c, 0x404b0add51b6a1db, 0x58c21ad2028c4bcc), + (0x4196f9c67efe9a1f, 0x3fbb8bebec6ccd29, 0x401ceae1ef1736e2), /* x in (-0.1, 0.1), y in (-50, 50) */ (0x85c8e7c348119f30, 0x895b661cd3f6bf12, 0xfff8000000000000), @@ -105,9 +112,7 @@ const powr02 = {c (0x41094a4327327bff, 0xc011131c033fdfd5, 0x3b3879e23ee63805), (0x40f8a138d93dfa6f, 0xc001413283b35d5f, 0x3db1bbb215bf42ae), (0x414539311cf511ff, 0xbfbca848c2d205ed, 0x3fc84fc70f28d73e), - (0x417c6bc9e89d9c1d, 0xc01992c87dc70f2c, 0x36032a3faeb94526), (0xfff3bc381986a30c, 0xbfdca95e011cebe9, 0xfff8000000000000), - (0x41c7ac7f3f65ea65, 0xc01e7dbcfb5c724c, 0x31d8c5031b0424e3), (0x412e44a5346a1be6, 0xbffdcb72cdad7356, 0x3d9dfbadec90e3fa), (0x418e5e64d3edf4c6, 0xc01674baa108856c, 0x36d602089075174d), (0x416b58e89ad71a84, 0xbfca78a8fcb854b3, 0x3fa0f412b7918686), @@ -115,7 +120,6 @@ const powr02 = {c /* x and y both in (0, 10) */ (0x7ff693c2af30864a, 0x3fa16af316e2d88c, 0xfff8000000000000), (0x3fdde3d0f357d227, 0x3fd4e93bd57a2981, 0x3fe8f3d387c3cfe2), - (0x3fa351c48c3746ce, 0x3fe729ab4b99e801, 0x3fb7e116428f44c0), (0x3fea1f57254b2d12, 0x3ff33aaf58690aa3, 0x3fe912f746775394), (0x3fd19a840c634304, 0x3fba064281d93290, 0x3fec1099957576a6), (0x3fce944eca01df35, 0x3f746940bfb8cacf, 0x3fefc5c3319cd7dd), @@ -140,12 +144,10 @@ const powr02 = {c (0x3f6d1ed0f9c04a9f, 0x3f62ffb9e40680c7, 0x3fef958dbae35a0f), (0xfffe3ddc1a7f09a7, 0x3f845a6294f3a890, 0xfff8000000000000), (0x3ff041e289ac5b8c, 0x3f57490a9fd58f95, 0x3ff00017c7d7ad4b), - (0x3fc9ffe250089ea1, 0x3ff88bf79a9dc33c, 0x3fb63170e54074b8), (0x4020bc3c2d25bb94, 0xfff903dc5ef7b5da, 0xfff8000000000000), (0x4003d467c525071c, 0x3f98c1fe89a5ecc4, 0x3ff05ae362b9a84a), (0x3fc238b70f4e7216, 0x3f8a75e2fd6e64f4, 0x3fef343ee42df045), (0x3fe001d1da500969, 0x400ff32df1b36945, 0x3fb0191d9c03801f), - (0x3f55ee3142fec6bf, 0x401cdc101b6b2276, 0x3ba18abf782d7bc4), (0x3ffb954dbc226673, 0x400408855a1fc270, 0x400f49e30f335764), (0x400bb9f936dad207, 0xfffb58709426aaec, 0xfff8000000000000), (0x3f7bbe6db78c8014, 0xfff26b1c8b0746a0, 0xfff8000000000000), @@ -200,13 +202,6 @@ const powr03 = {c const powr04 = {c var inputs : (uint64, uint64, uint64, uint64)[:] = [ (0x3f8627bbf0b2534e, 0x3fab532501422efb, 0x3fe921e86671e519, 0x3fe921e86671e518), - (0x3fca93f7f9ca25b6, 0x3ff5847df0338da2, 0x3fbee98550a085a0, 0x3fbee98550a085a1), - (0x3fabb5869cd3c59e, 0x3ff36f822e577f69, 0x3f9da0409e2f391c, 0x3f9da0409e2f391b), - (0x3f77f7f7131769cb, 0x3feaf3f718dd7ebe, 0x3f8af6517cbfdc36, 0x3f8af6517cbfdc35), - (0x3fcac3d4060de9e2, 0x4005f5e897c9aeb8, 0x3f8be74e7fa8bdd6, 0x3f8be74e7fa8bdd5), - (0x406f0b978c302c5e, 0x4035705781be3d35, 0x4a97d0db24be1855, 0x4a97d0db24be1854), - (0x40644d87e8676e6c, 0x404b0add51b6a1db, 0x58c21ad2028c4bcc, 0x58c21ad2028c4bcd), - (0x4196f9c67efe9a1f, 0x3fbb8bebec6ccd29, 0x401ceae1ef1736e2, 0x401ceae1ef1736e1), (0x41c84ac138a030ef, 0x3f91da7f2b3e4605, 0x3ff6e1b47e9ed782, 0x3ff6e1b47e9ed781), (0x41ca9d0efec9e036, 0x3fa8f2f672d68769, 0x4005d70b6fe1084b, 0x4005d70b6fe1084a), (0x40f949e1394ba90c, 0x3fb52c7ac7e9fb25, 0x4004cad8a0151dff, 0x4004cad8a0151e00), @@ -214,6 +209,11 @@ const powr04 = {c (0x41242aa370d444b6, 0xbff214c0dfc7a867, 0x3e91c53dfd314590, 0x3e91c53dfd31458f), (0x418f00a1df7a23d0, 0xc033fc3dd88f7505, 0x1f83a0afed046038, 0x1f83a0afed046039), (0x4169769b9e71e521, 0xc00ddf0ecaa33de6, 0x3a6889acb36be574, 0x3a6889acb36be573), + (0x417c6bc9e89d9c1d, 0xc01992c87dc70f2c, 0x36032a3faeb94526, 0x36032a3faeb94525), + (0x41c7ac7f3f65ea65, 0xc01e7dbcfb5c724c, 0x31d8c5031b0424e3, 0x31d8c5031b0424e2), + (0x3fa351c48c3746ce, 0x3fe729ab4b99e801, 0x3fb7e116428f44c0, 0x3fb7e116428f44c1), + (0x3fc9ffe250089ea1, 0x3ff88bf79a9dc33c, 0x3fb63170e54074b8, 0x3fb63170e54074b9), + (0x3f55ee3142fec6bf, 0x401cdc101b6b2276, 0x3ba18abf782d7bc4, 0x3ba18abf782d7bc5), ][:] for (x, y, z_perfect, z_accepted) : inputs |