diff options
author | Ori Bernstein <ori@eigenstate.org> | 2015-08-26 12:20:58 -0700 |
---|---|---|
committer | Ori Bernstein <ori@eigenstate.org> | 2015-08-26 12:20:58 -0700 |
commit | 2bc852bda98762d3bc01548bf972e3f1b137fbfb (patch) | |
tree | 74831deed3c9057c5fe0cbb8790d220e855bc792 /lib/std/utf.myr | |
parent | 3de952510eb2a23350d24ed926f19c0cf72a12f2 (diff) | |
download | mc-2bc852bda98762d3bc01548bf972e3f1b137fbfb.tar.gz |
Move Myrddin libs to lib/ subdirectory.
Diffstat (limited to 'lib/std/utf.myr')
-rw-r--r-- | lib/std/utf.myr | 103 |
1 files changed, 103 insertions, 0 deletions
/*
 * lib/std/utf.myr: UTF-8 encoding and decoding of single characters.
 */
use "die.use"
use "types.use"

pkg std =
	/* returned by striter (and so decode) when no character can be decoded */
	const Badchar	: char = -1 castto(char)
	/* longest encoding that encode produces, in bytes */
	const Maxcharlen : size = 4
	/* largest character value that charlen, encode and striter accept */
	const Maxcharval : char = 0x10FFFF

	const charlen	: (chr : char -> size)
	const encode	: (buf : byte[:], chr : char -> size)
	const decode	: (buf : byte[:] -> char)
	const striter	: (str : byte[:] -> (char, byte[:]))
;;

/*
 * Returns the number of bytes needed to encode `c` as UTF-8, or -1 if
 * `c` is above Maxcharval and therefore cannot be encoded.
 */
const charlen = {c
	if c < 0x80
		-> 1
	elif c < 0x800
		-> 2
	elif c < 0x10000
		-> 3
	elif c <= Maxcharval
		-> 4
	else
		-> -1
	;;
}

/*
 * Encodes `c` as UTF-8 into the start of `buf`.
 *
 * Returns the number of bytes written, or -1 if charlen rejects `c` or
 * `buf` is too short to hold the encoding. Nothing is written on failure.
 */
const encode = {buf, c
	var len
	var mark
	var i

	len = charlen(c)
	/*
	 * NOTE(review): if size is unsigned, the -1 from charlen is caught by
	 * the `buf.len < len` comparison rather than `len < 0` -- confirm
	 * against types.use.
	 */
	if len < 0 || buf.len < len
		-> -1
	;;

	/*
	 * mark is the length prefix of the lead byte: nothing for a single
	 * byte, otherwise the top `len` bits set (0xc0, 0xe0, 0xf0).
	 */
	if len == 1
		mark = 0
	else
		mark = (((1 << (8 - len)) - 1) ^ 0xff) castto(char)
	;;

	/* fill the continuation bytes back to front, 6 payload bits each */
	for i = len - 1; i > 0; i--
		buf[i] = (c & 0x3f | 0x80) castto(byte)
		c >>= 6
	;;

	/* whatever is left of c is the payload of the lead byte */
	buf[0] = (c | mark) castto(byte)
	-> len
}

/*
 * Decodes the first character of `buf` using striter and discards the
 * remainder of the string. Returns Badchar when striter does.
 */
const decode = {buf
	var c
	var b

	(c, b) = striter(buf)
	-> c
}

/*
 * Splits the first UTF-8 encoded character off the front of `str` and
 * returns it together with the rest of the string.
 *
 * On failure the character is Badchar. For an empty string, the string is
 * returned unchanged. For malformed input exactly one byte is skipped, so
 * that the caller can try to resynchronize on the following byte. Input is
 * treated as malformed when:
 *   - the first byte is not a valid lead byte,
 *   - the string ends before the sequence is complete,
 *   - a trailing byte is not a continuation byte (0b10xx_xxxx), or
 *   - the decoded value is above Maxcharval.
 * Overlong encodings and surrogate values are not rejected.
 */
const striter = {str
	var len
	var mask
	var chr
	var i
	var c
	var tmp
	var ret

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;

	c = str[0]
	if c & 0x80 == 0	/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0	/* 0b1111_0xxx */
		len = 4
	else
		/* not a lead byte: skip one byte forward so we can try
		   resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	if len > str.len
		/* truncated sequence: again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* keep only the payload bits of the lead byte */
	mask = (1 << (8 - len)) - 1
	chr = (c castto(uint32)) & mask
	for i = 1; i < len; i++
		tmp = str[i] castto(uint32)
		/* a non-continuation byte here may start a valid character of
		   its own, so do not consume it: resync one byte forward */
		if tmp & 0xc0 != 0x80
			-> (Badchar, str[1:])
		;;
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	/* four byte sequences can carry values that encode refuses */
	ret = chr castto(char)
	if ret > Maxcharval
		-> (Badchar, str[1:])
	;;

	-> (ret, str[len:])
}