diff options
author | S. Gilles <sgilles@math.umd.edu> | 2017-10-29 00:51:25 -0400 |
---|---|---|
committer | Ori Bernstein <ori@eigenstate.org> | 2017-10-29 19:46:03 -0700 |
commit | dde43f18e3b9f699c188c74064cf785917b41e22 (patch) | |
tree | 7f3c9863634a948a3a5047eb7e5deea6331d48a7 /lib | |
parent | 4ecfdbc4819481740991e7f477a32ed7396c7125 (diff) | |
download | mc-dde43f18e3b9f699c188c74064cf785917b41e22.tar.gz |
Implement graphemewidth
Diffstat (limited to 'lib')
-rw-r--r-- | lib/std/chartype.myr | 470 |
-rw-r--r-- | lib/std/fmt.myr | 10 |
-rw-r--r-- | lib/std/test/chartype.myr | 18 |
-rw-r--r-- | lib/std/test/fmt.myr | 4 |
-rw-r--r-- | lib/std/test/utf.myr | bin | 0 -> 1305 bytes |
-rw-r--r-- | lib/std/utf.myr | 29 |
6 files changed, 522 insertions, 9 deletions
diff --git a/lib/std/chartype.myr b/lib/std/chartype.myr index dd31e20..bf01d9a 100644 --- a/lib/std/chartype.myr +++ b/lib/std/chartype.myr @@ -26,6 +26,8 @@ pkg std = const totitle : (c : char -> char) generic charval : (c : char, base : int -> @a::(integral,numeric)) + + const cellwidth : (c : char -> int) ;; extern const put : (fmt : byte[:], args : ... -> size) @@ -1061,6 +1063,445 @@ const rtotitle1 = [ 0x01f3, 499 /* dz Dz */ ] +/* + * Generated by + * + * uniset cat:Me,Mn,Cf | \ + * sed -r 's/^([0-9a-f]+)[.][.]([0-9a-f]+)$/\t0x\1, 0x\2,/' |\ + * sed -r 's/^([0-9a-f]+)$/\t0x\1, 0x\1,/' + */ +const width0 = [ + 0xad, 0xad, + 0x300, 0x36f, + 0x483, 0x489, + 0x591, 0x5bd, + 0x5bf, 0x5bf, + 0x5c1, 0x5c2, + 0x5c4, 0x5c5, + 0x5c7, 0x5c7, + 0x600, 0x605, + 0x610, 0x61a, + 0x61c, 0x61c, + 0x64b, 0x65f, + 0x670, 0x670, + 0x6d6, 0x6dd, + 0x6df, 0x6e4, + 0x6e7, 0x6e8, + 0x6ea, 0x6ed, + 0x70f, 0x70f, + 0x711, 0x711, + 0x730, 0x74a, + 0x7a6, 0x7b0, + 0x7eb, 0x7f3, + 0x816, 0x819, + 0x81b, 0x823, + 0x825, 0x827, + 0x829, 0x82d, + 0x859, 0x85b, + 0x8d4, 0x902, + 0x93a, 0x93a, + 0x93c, 0x93c, + 0x941, 0x948, + 0x94d, 0x94d, + 0x951, 0x957, + 0x962, 0x963, + 0x981, 0x981, + 0x9bc, 0x9bc, + 0x9c1, 0x9c4, + 0x9cd, 0x9cd, + 0x9e2, 0x9e3, + 0xa01, 0xa02, + 0xa3c, 0xa3c, + 0xa41, 0xa42, + 0xa47, 0xa48, + 0xa4b, 0xa4d, + 0xa51, 0xa51, + 0xa70, 0xa71, + 0xa75, 0xa75, + 0xa81, 0xa82, + 0xabc, 0xabc, + 0xac1, 0xac5, + 0xac7, 0xac8, + 0xacd, 0xacd, + 0xae2, 0xae3, + 0xafa, 0xaff, + 0xb01, 0xb01, + 0xb3c, 0xb3c, + 0xb3f, 0xb3f, + 0xb41, 0xb44, + 0xb4d, 0xb4d, + 0xb56, 0xb56, + 0xb62, 0xb63, + 0xb82, 0xb82, + 0xbc0, 0xbc0, + 0xbcd, 0xbcd, + 0xc00, 0xc00, + 0xc3e, 0xc40, + 0xc46, 0xc48, + 0xc4a, 0xc4d, + 0xc55, 0xc56, + 0xc62, 0xc63, + 0xc81, 0xc81, + 0xcbc, 0xcbc, + 0xcbf, 0xcbf, + 0xcc6, 0xcc6, + 0xccc, 0xccd, + 0xce2, 0xce3, + 0xd00, 0xd01, + 0xd3b, 0xd3c, + 0xd41, 0xd44, + 0xd4d, 0xd4d, + 0xd62, 0xd63, + 0xdca, 0xdca, + 0xdd2, 0xdd4, + 0xdd6, 0xdd6, + 0xe31, 0xe31, + 0xe34, 0xe3a, + 
0xe47, 0xe4e, + 0xeb1, 0xeb1, + 0xeb4, 0xeb9, + 0xebb, 0xebc, + 0xec8, 0xecd, + 0xf18, 0xf19, + 0xf35, 0xf35, + 0xf37, 0xf37, + 0xf39, 0xf39, + 0xf71, 0xf7e, + 0xf80, 0xf84, + 0xf86, 0xf87, + 0xf8d, 0xf97, + 0xf99, 0xfbc, + 0xfc6, 0xfc6, + 0x102d, 0x1030, + 0x1032, 0x1037, + 0x1039, 0x103a, + 0x103d, 0x103e, + 0x1058, 0x1059, + 0x105e, 0x1060, + 0x1071, 0x1074, + 0x1082, 0x1082, + 0x1085, 0x1086, + 0x108d, 0x108d, + 0x109d, 0x109d, + 0x135d, 0x135f, + 0x1712, 0x1714, + 0x1732, 0x1734, + 0x1752, 0x1753, + 0x1772, 0x1773, + 0x17b4, 0x17b5, + 0x17b7, 0x17bd, + 0x17c6, 0x17c6, + 0x17c9, 0x17d3, + 0x17dd, 0x17dd, + 0x180b, 0x180e, + 0x1885, 0x1886, + 0x18a9, 0x18a9, + 0x1920, 0x1922, + 0x1927, 0x1928, + 0x1932, 0x1932, + 0x1939, 0x193b, + 0x1a17, 0x1a18, + 0x1a1b, 0x1a1b, + 0x1a56, 0x1a56, + 0x1a58, 0x1a5e, + 0x1a60, 0x1a60, + 0x1a62, 0x1a62, + 0x1a65, 0x1a6c, + 0x1a73, 0x1a7c, + 0x1a7f, 0x1a7f, + 0x1ab0, 0x1abe, + 0x1b00, 0x1b03, + 0x1b34, 0x1b34, + 0x1b36, 0x1b3a, + 0x1b3c, 0x1b3c, + 0x1b42, 0x1b42, + 0x1b6b, 0x1b73, + 0x1b80, 0x1b81, + 0x1ba2, 0x1ba5, + 0x1ba8, 0x1ba9, + 0x1bab, 0x1bad, + 0x1be6, 0x1be6, + 0x1be8, 0x1be9, + 0x1bed, 0x1bed, + 0x1bef, 0x1bf1, + 0x1c2c, 0x1c33, + 0x1c36, 0x1c37, + 0x1cd0, 0x1cd2, + 0x1cd4, 0x1ce0, + 0x1ce2, 0x1ce8, + 0x1ced, 0x1ced, + 0x1cf4, 0x1cf4, + 0x1cf8, 0x1cf9, + 0x1dc0, 0x1df9, + 0x1dfb, 0x1dff, + 0x200b, 0x200f, + 0x202a, 0x202e, + 0x2060, 0x2064, + 0x2066, 0x206f, + 0x20d0, 0x20f0, + 0x2cef, 0x2cf1, + 0x2d7f, 0x2d7f, + 0x2de0, 0x2dff, + 0x302a, 0x302d, + 0x3099, 0x309a, + 0xa66f, 0xa672, + 0xa674, 0xa67d, + 0xa69e, 0xa69f, + 0xa6f0, 0xa6f1, + 0xa802, 0xa802, + 0xa806, 0xa806, + 0xa80b, 0xa80b, + 0xa825, 0xa826, + 0xa8c4, 0xa8c5, + 0xa8e0, 0xa8f1, + 0xa926, 0xa92d, + 0xa947, 0xa951, + 0xa980, 0xa982, + 0xa9b3, 0xa9b3, + 0xa9b6, 0xa9b9, + 0xa9bc, 0xa9bc, + 0xa9e5, 0xa9e5, + 0xaa29, 0xaa2e, + 0xaa31, 0xaa32, + 0xaa35, 0xaa36, + 0xaa43, 0xaa43, + 0xaa4c, 0xaa4c, + 0xaa7c, 0xaa7c, + 0xaab0, 0xaab0, + 0xaab2, 0xaab4, + 0xaab7, 
0xaab8, + 0xaabe, 0xaabf, + 0xaac1, 0xaac1, + 0xaaec, 0xaaed, + 0xaaf6, 0xaaf6, + 0xabe5, 0xabe5, + 0xabe8, 0xabe8, + 0xabed, 0xabed, + 0xfb1e, 0xfb1e, + 0xfe00, 0xfe0f, + 0xfe20, 0xfe2f, + 0xfeff, 0xfeff, + 0xfff9, 0xfffb, + 0x101fd, 0x101fd, + 0x102e0, 0x102e0, + 0x10376, 0x1037a, + 0x10a01, 0x10a03, + 0x10a05, 0x10a06, + 0x10a0c, 0x10a0f, + 0x10a38, 0x10a3a, + 0x10a3f, 0x10a3f, + 0x10ae5, 0x10ae6, + 0x11001, 0x11001, + 0x11038, 0x11046, + 0x1107f, 0x11081, + 0x110b3, 0x110b6, + 0x110b9, 0x110ba, + 0x110bd, 0x110bd, + 0x11100, 0x11102, + 0x11127, 0x1112b, + 0x1112d, 0x11134, + 0x11173, 0x11173, + 0x11180, 0x11181, + 0x111b6, 0x111be, + 0x111ca, 0x111cc, + 0x1122f, 0x11231, + 0x11234, 0x11234, + 0x11236, 0x11237, + 0x1123e, 0x1123e, + 0x112df, 0x112df, + 0x112e3, 0x112ea, + 0x11300, 0x11301, + 0x1133c, 0x1133c, + 0x11340, 0x11340, + 0x11366, 0x1136c, + 0x11370, 0x11374, + 0x11438, 0x1143f, + 0x11442, 0x11444, + 0x11446, 0x11446, + 0x114b3, 0x114b8, + 0x114ba, 0x114ba, + 0x114bf, 0x114c0, + 0x114c2, 0x114c3, + 0x115b2, 0x115b5, + 0x115bc, 0x115bd, + 0x115bf, 0x115c0, + 0x115dc, 0x115dd, + 0x11633, 0x1163a, + 0x1163d, 0x1163d, + 0x1163f, 0x11640, + 0x116ab, 0x116ab, + 0x116ad, 0x116ad, + 0x116b0, 0x116b5, + 0x116b7, 0x116b7, + 0x1171d, 0x1171f, + 0x11722, 0x11725, + 0x11727, 0x1172b, + 0x11a01, 0x11a06, + 0x11a09, 0x11a0a, + 0x11a33, 0x11a38, + 0x11a3b, 0x11a3e, + 0x11a47, 0x11a47, + 0x11a51, 0x11a56, + 0x11a59, 0x11a5b, + 0x11a8a, 0x11a96, + 0x11a98, 0x11a99, + 0x11c30, 0x11c36, + 0x11c38, 0x11c3d, + 0x11c3f, 0x11c3f, + 0x11c92, 0x11ca7, + 0x11caa, 0x11cb0, + 0x11cb2, 0x11cb3, + 0x11cb5, 0x11cb6, + 0x11d31, 0x11d36, + 0x11d3a, 0x11d3a, + 0x11d3c, 0x11d3d, + 0x11d3f, 0x11d45, + 0x11d47, 0x11d47, + 0x16af0, 0x16af4, + 0x16b30, 0x16b36, + 0x16f8f, 0x16f92, + 0x1bc9d, 0x1bc9e, + 0x1bca0, 0x1bca3, + 0x1d167, 0x1d169, + 0x1d173, 0x1d182, + 0x1d185, 0x1d18b, + 0x1d1aa, 0x1d1ad, + 0x1d242, 0x1d244, + 0x1da00, 0x1da36, + 0x1da3b, 0x1da6c, + 0x1da75, 0x1da75, + 0x1da84, 
0x1da84, + 0x1da9b, 0x1da9f, + 0x1daa1, 0x1daaf, + 0x1e000, 0x1e006, + 0x1e008, 0x1e018, + 0x1e01b, 0x1e021, + 0x1e023, 0x1e024, + 0x1e026, 0x1e02a, + 0x1e8d0, 0x1e8d6, + 0x1e944, 0x1e94a, + 0xe0001, 0xe0001, + 0xe0020, 0xe007f, + 0xe0100, 0xe01ef, +] + +/* + * Generated by + * + * uniset eaw:W,F - cat:Me,Mn,Cf | \ + * sed -r 's/^([0-9a-f]+)[.][.]([0-9a-f]+)$/\t0x\1, 0x\2,/' | \ + * sed -r 's/^([0-9a-f]+)$/\t0x\1, 0x\1,/' + */ +const width2 = [ + 0x1100, 0x115f, + 0x231a, 0x231b, + 0x2329, 0x232a, + 0x23e9, 0x23ec, + 0x23f0, 0x23f0, + 0x23f3, 0x23f3, + 0x25fd, 0x25fe, + 0x2614, 0x2615, + 0x2648, 0x2653, + 0x267f, 0x267f, + 0x2693, 0x2693, + 0x26a1, 0x26a1, + 0x26aa, 0x26ab, + 0x26bd, 0x26be, + 0x26c4, 0x26c5, + 0x26ce, 0x26ce, + 0x26d4, 0x26d4, + 0x26ea, 0x26ea, + 0x26f2, 0x26f3, + 0x26f5, 0x26f5, + 0x26fa, 0x26fa, + 0x26fd, 0x26fd, + 0x2705, 0x2705, + 0x270a, 0x270b, + 0x2728, 0x2728, + 0x274c, 0x274c, + 0x274e, 0x274e, + 0x2753, 0x2755, + 0x2757, 0x2757, + 0x2795, 0x2797, + 0x27b0, 0x27b0, + 0x27bf, 0x27bf, + 0x2b1b, 0x2b1c, + 0x2b50, 0x2b50, + 0x2b55, 0x2b55, + 0x2e80, 0x2e99, + 0x2e9b, 0x2ef3, + 0x2f00, 0x2fd5, + 0x2ff0, 0x2ffb, + 0x3000, 0x3029, + 0x302e, 0x303e, + 0x3041, 0x3096, + 0x309b, 0x30ff, + 0x3105, 0x312e, + 0x3131, 0x318e, + 0x3190, 0x31ba, + 0x31c0, 0x31e3, + 0x31f0, 0x321e, + 0x3220, 0x3247, + 0x3250, 0x32fe, + 0x3300, 0x4dbf, + 0x4e00, 0xa48c, + 0xa490, 0xa4c6, + 0xa960, 0xa97c, + 0xac00, 0xd7a3, + 0xf900, 0xfaff, + 0xfe10, 0xfe19, + 0xfe30, 0xfe52, + 0xfe54, 0xfe66, + 0xfe68, 0xfe6b, + 0xff01, 0xff60, + 0xffe0, 0xffe6, + 0x16fe0, 0x16fe1, + 0x17000, 0x187ec, + 0x18800, 0x18af2, + 0x1b000, 0x1b11e, + 0x1b170, 0x1b2fb, + 0x1f004, 0x1f004, + 0x1f0cf, 0x1f0cf, + 0x1f18e, 0x1f18e, + 0x1f191, 0x1f19a, + 0x1f200, 0x1f202, + 0x1f210, 0x1f23b, + 0x1f240, 0x1f248, + 0x1f250, 0x1f251, + 0x1f260, 0x1f265, + 0x1f300, 0x1f320, + 0x1f32d, 0x1f335, + 0x1f337, 0x1f37c, + 0x1f37e, 0x1f393, + 0x1f3a0, 0x1f3ca, + 0x1f3cf, 0x1f3d3, + 0x1f3e0, 0x1f3f0, + 0x1f3f4, 
0x1f3f4, + 0x1f3f8, 0x1f43e, + 0x1f440, 0x1f440, + 0x1f442, 0x1f4fc, + 0x1f4ff, 0x1f53d, + 0x1f54b, 0x1f54e, + 0x1f550, 0x1f567, + 0x1f57a, 0x1f57a, + 0x1f595, 0x1f596, + 0x1f5a4, 0x1f5a4, + 0x1f5fb, 0x1f64f, + 0x1f680, 0x1f6c5, + 0x1f6cc, 0x1f6cc, + 0x1f6d0, 0x1f6d2, + 0x1f6eb, 0x1f6ec, + 0x1f6f4, 0x1f6f8, + 0x1f910, 0x1f93e, + 0x1f940, 0x1f94c, + 0x1f950, 0x1f96b, + 0x1f980, 0x1f997, + 0x1f9c0, 0x1f9c0, + 0x1f9d0, 0x1f9e6, + 0x20000, 0x2fffd, + 0x30000, 0x3fffd, +] + const findc = {c, t, n, nelt, ret var p, m @@ -1245,3 +1686,32 @@ generic charval = {c, base -> @a::(numeric,integral) ;; -> v } + +const cellwidth = { c : char + var l + /* + * Follows the general idea described at + * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c + */ + if c == 0 + -> 0 + ;; + + if c == -1 || c < 0x20 || (c >= 0x7f && c <= 0x9f) + -> -1 + ;; + + if findc(c, width0[:], width0.len/2, 2, &l) + if (c >= l[0] && c <= l[1]) + -> 0 + ;; + ;; + + if findc(c, width2[:], width2.len/2, 2, &l) + if (c >= l[0] && c <= l[1]) + -> 2 + ;; + ;; + + -> 1 +} diff --git a/lib/std/fmt.myr b/lib/std/fmt.myr index 08b814d..1f542d3 100644 --- a/lib/std/fmt.myr +++ b/lib/std/fmt.myr @@ -553,7 +553,7 @@ const strfmt = {sb, str, params ;; ;; else - for i = 0; i < w - graphemewidth(str); i++ + for i = 0; i < w - strcellwidth(str); i++ sbputc(sb, p) ;; sbputs(sb, str) @@ -564,14 +564,6 @@ const isprint = {b -> b >= (' ' : byte) && b < ('~' : byte) } -/* -Hah. like we're going to put in the work to actually -count graphemes. -*/ -const graphemewidth = {str - -> str.len -} - const digitchars = [ '0','1','2','3','4', '5','6','7','8','9', diff --git a/lib/std/test/chartype.myr b/lib/std/test/chartype.myr index d0df1d2..30426e6 100644 --- a/lib/std/test/chartype.myr +++ b/lib/std/test/chartype.myr @@ -20,4 +20,22 @@ const main = { std.assert(!std.isalnum('!'), "! 
should not be isalnum\n") std.assert(!std.isspace('@'), "@ should not be isspace\n") std.assert(!std.isblank('@'), "@ should not be isblank\n") + + std.assert(std.cellwidth(std.Badchar) == -1, "Badchar should have width -1\n") + std.assert(std.cellwidth((0 : char)) == 0, "U+0000 should have width 0\n") + std.assert(std.cellwidth('') == -1, "^E should have width -1\n") + std.assert(std.cellwidth('a') == 1, "'a' should have width 1\n") + std.assert(std.cellwidth('ê') == 1, "'ê' should have width 1\n") + std.assert(std.cellwidth('̀') == 0, "U+300 should have width 0\n") + std.assert(std.cellwidth('ϼ') == 1, "ϼ should have width 1\n") + std.assert(std.cellwidth('Ї') == 1, "Ї should have width 1\n") + std.assert(std.cellwidth('⊲') == 1, "⊲ should have width 1\n") + std.assert(std.cellwidth(' ') == 2, "U+3000 should have width 2\n") + std.assert(std.cellwidth('ギ') == 2, "ギ should have width 2\n") + std.assert(std.cellwidth('匌') == 2, "匌 should have width 2\n") + std.assert(std.cellwidth('') == 0, "U+00AD should have width 0\n") + std.assert(std.cellwidth('ࠠ') == 0, "U+820 should have width 0\n") + std.assert(std.cellwidth('־') == 1, "U+5BE should have width 1\n") + std.assert(std.cellwidth('ֿ') == 0, "U+5BF should have width 0\n") + std.assert(std.cellwidth('׀') == 1, "U+5C0 should have width 1\n") } diff --git a/lib/std/test/fmt.myr b/lib/std/test/fmt.myr index 5d0cfa8..c1a17ac 100644 --- a/lib/std/test/fmt.myr +++ b/lib/std/test/fmt.myr @@ -56,6 +56,10 @@ const builtins = { check("\\n\\r\\xff", "{e}", "\n\r\xff") check("{}barbaz", "{{}}bar{}", "baz") check("{barbaz}", "{{bar{}}}", "baz") + check(" Tlön", "{w=10}", "Tlön") + check(" háček", "{w=10}", "háček") + check("_____即是多多即", "{w=15,p=_}", "即是多多即") + check(" τῶν ῾Ελλήνων", "{w=15}", "τῶν ῾Ελλήνων") check("abcd", "{}", "abcd") check("123", "{}", 123) check("7b", "{x}", 123) diff --git a/lib/std/test/utf.myr b/lib/std/test/utf.myr Binary files differnew file mode 100644 index 0000000..4ac25d7 --- /dev/null +++ 
b/lib/std/test/utf.myr diff --git a/lib/std/utf.myr b/lib/std/utf.myr index 6c655a3..439254c 100644 --- a/lib/std/utf.myr +++ b/lib/std/utf.myr @@ -1,3 +1,5 @@ +use "extremum" +use "chartype" use "die" use "types" @@ -12,6 +14,7 @@ pkg std = const decode : (buf : byte[:] -> char) const strstep : (str : byte[:] -> (char, byte[:])) + const strcellwidth : (str : byte[:] -> size) ;; const charlen = {c @@ -101,3 +104,29 @@ const strstep = {str -> ((chr : char), str[len:]) } + +const strcellwidth = {str + var s : byte[:] = str + var c : char = Badchar + var n : size = 0 + + while s.len > 0 + (c, s) = strstep(s) + if c == Badchar + /* Something will probably be printed as U+FFFD */ + n++ + elif c < 0x20 + /* Control characters take 0 cells */ + elif c < 0x7f + /* Bog standard ASCII takes 1 cell */ + n++ + elif c == 0x7f + /* DEL is like a control character */ + else + /* It's not ASCII, so ask chartype what to do */ + n += (abs(cellwidth(c)) : size) + ;; + ;; + + -> n +} |