summary refs log tree commit diff
diff options
context:
space:
mode:
author Ori Bernstein <ori@eigenstate.org> 2014-10-13 05:22:25 -0400
committer Ori Bernstein <ori@eigenstate.org> 2014-10-13 05:22:25 -0400
commit 5e60759629a43ed5b5aeb5c831cadc516bc3c980 (patch)
tree 11ee8931e7ac68bda4c1e63f9eb1ed6c805e6c12
parent 655d23e95323b6e3b99c7b35bf807c277475721b (diff)
download libregex-5e60759629a43ed5b5aeb5c831cadc516bc3c980.tar.gz
Fix up unicode boundary generation.
Using boundary characters doesn't guarantee a match -- while the values are lexicographically in order, individual bytes may not be quite right...
-rw-r--r-- compile.myr 37
1 files changed, 25 insertions, 12 deletions
diff --git a/compile.myr b/compile.myr
index d536f25..4e376e4 100644
--- a/compile.myr
+++ b/compile.myr
@@ -120,28 +120,26 @@ const gen = {re, t
}
const genranges = {re, sl
- const charbounds = [
- 0, /* len = 0: bug if used for hi */
- 0x80, /* len = 1 */
- 0x800, /* len = 2 */
- 0x10000, /* len = 3 */
- 0x200000, /* len = 4 */
- -1
- ]
var lbuf : byte[4], hbuf : byte[4], boundbuf : byte[4]
- var lsz, hsz, bsz
+ var lsz, hsz, bsz, i
var rt : rangetrie#
- var i
/* generate a trie of ranges */
rt = std.zalloc()
for r in sl
+ /*
+ encode:
+ lo => bounds[loidx] - 1
+ bounds[loidx] => bounds[loidx + 1] - 1
+ ...
+ bounds[hiidx - 1] => hi
+ */
lsz = std.encode(lbuf[:], r[0])
hsz = std.encode(hbuf[:], r[1])
for i = lsz; i < hsz; i++
- bsz = std.encode(boundbuf[:], charbounds[i] - 1)
+ bsz = bound(boundbuf[:], i, 0xff)
rtinsert(rt, lbuf[:lsz], boundbuf[:bsz])
- lsz = std.encode(lbuf[:], charbounds[i])
+ lsz = bound(lbuf[:], i + 1, 0x00)
;;
rtinsert(rt, lbuf[:lsz], hbuf[:hsz])
;;
@@ -153,6 +151,21 @@ const genranges = {re, sl
-> re.proglen
}
+const bound = {buf, len, fill /* for len >= 1: writes a len-byte boundary sequence into buf, with payload bits taken from fill, and returns len */
+ var i, s /* i: index of the trailing byte being written; s: len narrowed to byte for use as a shift count */
+
+ if len == 1 /* single-byte sequences get a fixed value */
+ buf[0] = 0x7f /* NOTE(review): fill is ignored in this branch, so fill = 0x00 still yields 0x7f -- confirm no caller needs a 1-byte low bound */
+ else
+ s = len castto(byte)
+ buf[0] = (0xff << (8 - s)) | (fill >> (s + 1)) /* lead byte: marker bits from the left shift, low bits from fill; presumably evaluated at byte width, so len 2 with fill 0xff gives 0xdf -- TODO confirm */
+ for i = 1; i < len; i++ /* fill in the remaining len - 1 bytes */
+ buf[i] = 0x80 | (fill >> 2) /* 0x80 when fill is 0x00, 0xbf when fill is 0xff */
+ ;;
+ ;;
+ -> len /* byte count; genranges uses it as the slice length of the buffer */
+}
+
type rangetrie = struct
ranges : (byte, byte)[:]
link : rangetrie#[:]