summaryrefslogtreecommitdiff
path: root/lib/std/utf.myr
blob: 03cc8bb81c15036510c462db2f139329f223436a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
use "die.use"
use "types.use"

pkg std =
	/* sentinel char (-1 cast to char); striter yields it for empty
	   or undecodable input */
	const Badchar	: char = -1 castto(char)
	/* largest byte count that charlen reports for a single char */
	const Maxcharlen : size = 4
	/* presumably the largest valid char value.
	   NOTE(review): charlen and encode in this file accept values up
	   to 0x1FFFFF and never consult this constant -- confirm intent */
	const Maxcharval : char = 0x10FFFF

	/* number of bytes needed to encode chr as UTF-8, or -1 when chr
	   is too large to encode */
	const charlen	: (chr : char -> size)
	/* writes the UTF-8 encoding of chr to the start of buf; returns the
	   number of bytes written, or -1 if chr is unencodable or buf is
	   too short */
	const encode	: (buf : byte[:], chr : char -> size)
	/* decodes and returns the first char of buf (Badchar on failure) */
	const decode	: (buf : byte[:] -> char)
	/* decodes the first char of str; returns it together with the
	   remainder of str that follows it */
	const striter	: (str : byte[:] -> (char, byte[:]))
;;

/*
 Reports how many bytes the UTF-8 encoding of `c` occupies (1 to 4).
 Values of 0x200000 and above do not fit in a four byte sequence,
 and yield -1 instead.
 */
const charlen = {c
	/* thresholds are checked from the largest down, so the first
	   guard that matches decides the length */
	if c >= 0x200000
		-> -1
	elif c >= 0x10000
		-> 4
	elif c >= 0x800
		-> 3
	elif c >= 0x80
		-> 2
	else
		-> 1
	;;
}

/*
 Stores the UTF-8 encoding of `c` at the front of `buf`.

 Returns the number of bytes stored. If charlen() rejects `c`, or
 `buf` has fewer bytes than the encoding needs, nothing is written
 and -1 is returned.
 */
const encode = {buf, c
	var nbytes
	var lead
	var pos

	nbytes = charlen(c)
	if nbytes < 0 || buf.len < nbytes
		-> -1
	;;

	/* continuation bytes are filled in back to front; each one
	   carries the low six bits of what is left of c, tagged 0b10 */
	for pos = nbytes - 1; pos > 0; pos--
		buf[pos] = (c & 0x3f | 0x80) castto(byte)
		c >>= 6
	;;

	/* the first byte of a multibyte sequence starts with `nbytes`
	   one bits; a single byte sequence has no prefix at all */
	if nbytes != 1
		lead = (((1 << (8 - nbytes)) - 1) ^ 0xff) castto(char)
	else
		lead = 0
	;;

	buf[0] = (c | lead) castto(byte)
	-> nbytes
}

/*
 Decodes the character at the start of `buf` by delegating to
 striter(), and returns just the character; the remainder of the
 buffer that striter() also produces is dropped.
 */
const decode = {buf
	var first
	var rest

	(first, rest) = striter(buf)
	-> first
}

/*
 Decodes the first UTF-8 encoded character of `str`.

 Returns a tuple of the decoded character and the slice of `str`
 that follows it. On failure the character is Badchar:
   - empty input returns (Badchar, str) unchanged;
   - an invalid lead byte, a sequence cut short by the end of `str`,
     or a continuation byte that is not of the form 0b10xx_xxxx
     returns (Badchar, str[1:]), skipping a single byte so that the
     caller can try to resynchronize with the character stream.
 */
const striter = {str
	var len
	var mask
	var chr
	var i
	var c
	var tmp

	if str.len == 0
		/* empty string: no resync needed */
		-> (Badchar, str)
	;;

	/* the lead byte determines the length of the sequence */
	c = str[0]
	if c & 0x80 == 0	/* 0b0xxx_xxxx */
		len = 1
	elif c & 0xe0 == 0xc0	/* 0b110x_xxxx */
		len = 2
	elif c & 0xf0 == 0xe0 	/* 0b1110_xxxx */
		len = 3
	elif c & 0xf8 == 0xf0 	/* 0b1111_0xxx */
		len = 4
	else
		/* skip one char forward so we can try
		   resyncing the character stream */
		-> (Badchar, str[1:])
	;;

	if len > str.len
		/* truncated sequence: again, we want to try to resync */
		-> (Badchar, str[1:])
	;;

	/* keep only the payload bits of the lead byte; the bit just
	   above the payload is always zero, so the mask may cover it */
	mask = (1 << (8 - len)) - 1
	chr = (c castto(uint32)) & mask
	for i = 1; i < len; i++
		tmp = str[i] castto(uint32)
		if tmp & 0xc0 != 0x80
			/* not a continuation byte (0b10xx_xxxx): the sequence
			   is malformed. Skip only the lead byte, so that this
			   byte is looked at again as a possible lead byte. */
			-> (Badchar, str[1:])
		;;
		chr = (chr << 6) | (tmp & 0x3f)
	;;

	-> (chr castto(char), str[len:])
}