summaryrefslogtreecommitdiff
path: root/libregex/compile.myr
diff options
context:
space:
mode:
Diffstat (limited to 'libregex/compile.myr')
-rw-r--r--libregex/compile.myr848
1 files changed, 0 insertions, 848 deletions
diff --git a/libregex/compile.myr b/libregex/compile.myr
deleted file mode 100644
index 28d7ce8..0000000
--- a/libregex/compile.myr
+++ /dev/null
@@ -1,848 +0,0 @@
-use std
-
-use "types.use"
-use "ranges.use"
-
-pkg regex =
- const parse : (re : byte[:] -> std.result(ast#, status))
- const compile : (re : byte[:] -> std.result(regex#, status))
- const dbgcompile : (re : byte[:] -> std.result(regex#, status))
- const free : (re : regex# -> void)
- const failmsg : (st : status -> byte[:])
-;;
-
-type parseresult = union
- `Some ast#
- `None
- `Fail status
-;;
-
-/* Compiles a pattern into a regex */
-const compile = {pat
- -> regexcompile(std.mk([.pat = pat, .nmatch = 1]), 0)
-}
-
-const parse = {pat
- var re
-
- re = std.mk([.pat = pat, .nmatch = 1])
- match regexparse(re)
- | `None: -> `std.Fail `Incomplete
- | `Fail f: -> `std.Fail f
- | `Some t:
- if re.pat.len > 0
- -> `std.Fail `Incomplete
- else
- -> `std.Ok t
- ;;
- ;;
-}
-
-/* Compiles a pattern into a debug regex. This can be verbose. */
-const dbgcompile = {pat
- var re
-
- re = std.mk([.pat = pat, .nmatch = 1, .debug = true])
- -> regexcompile(re, 0)
-}
-
-/* compiles a pattern into an allocated regex */
-const regexcompile = {re, id
- match regexparse(re)
- | `None: -> `std.Fail (`Incomplete)
- | `Fail f: -> `std.Fail f
- | `Some t:
- /*
- we can stop early if we get
- an incorrectly encoded char
- */
- if re.pat.len > 0
- astfree(t)
- -> `std.Fail (`Incomplete)
- ;;
- dump(re, t, 0)
- append(re, `Ilbra 0)
- gen(re, t)
- append(re, `Irbra 0)
- append(re, `Imatch id)
- idump(re)
- astfree(t)
- -> `std.Ok re
- ;;
- -> `std.Fail (`Noimpl)
-}
-
-const free = {re
- /* all the threads should be dead,
- so we shouldn't have to free any*/
- std.slfree(re.prog)
- std.free(re)
-}
-
-
-/* generates bytecode from an AST */
-const gen = {re, t
- match t#
- |`Alt (a, b): genalt(re, a, b)
- |`Cat (a, b): gen(re, a); gen(re, b)
- /* repetition */
- |`Star a: genstar(re, a, false)
- |`Rstar a: genstar(re, a, true)
- |`Plus a: gen(re, a); genstar(re, a, false)
- |`Rplus a: gen(re, a); genstar(re, a, true)
- |`Quest a: genquest(re, a)
-
- /* end matches */
- |`Chr c: genchar(re, c)
- |`Ranges sl: genranges(re, sl)
-
- /* meta */
- |`Bol: append(re, `Ibol)
- |`Eol: append(re, `Ibol)
- |`Bow: append(re, `Ibow)
- |`Eow: append(re, `Ieow)
- |`Cap (m, a):
- append(re, `Ilbra m)
- gen(re, a)
- append(re, `Irbra m)
- ;;
- -> re.proglen
-}
-
-const genranges = {re, sl
- var lbuf : byte[4], hbuf : byte[4], boundbuf : byte[4]
- var lsz, hsz, bsz, i
- var rt : rangetrie#
-
- /* generate a trie of ranges */
- rt = std.zalloc()
- for r in sl
- /*
- encode:
- lo => bounds[loidx] - 1
- bounds[loidx] => bounds[loidx + 1] - 1
- ...
- bounds[hiidx - 1] => hi
- */
- lsz = std.encode(lbuf[:], r[0])
- hsz = std.encode(hbuf[:], r[1])
- for i = lsz; i < hsz; i++
- bsz = bound(boundbuf[:], i, 0xff)
- rtinsert(rt, lbuf[:lsz], boundbuf[:bsz])
- lsz = bound(lbuf[:], i + 1, 0x00)
- ;;
- rtinsert(rt, lbuf[:lsz], hbuf[:hsz])
- ;;
- if re.debug
- rtdump(rt, 0)
- ;;
- rangegen(re, rt, rt.ranges, rt.link, rangeprogsize(rt) + re.proglen)
- rtfree(rt)
- -> re.proglen
-}
-
-const bound = {buf, len, fill
- var i, s
-
- if len == 1
- buf[0] = 0x7f
- else
- s = len castto(byte)
- buf[0] = (0xff << (8 - s)) | (fill >> (s + 1))
- for i = 1; i < len; i++
- buf[i] = 0x80 | (fill >> 2)
- ;;
- ;;
- -> len
-}
-
-type rangetrie = struct
- ranges : (byte, byte)[:]
- link : rangetrie#[:]
- end : bool
-;;
-
-const rtdump = {rt, ind
- var i
- var l, h
-
- indent(ind)
- std.put("Range (end = {}) {{\n", rt.end)
- for i = 0; i < rt.ranges.len; i++
- indent(ind + 1)
- (l, h) = rt.ranges[i]
- std.put("0x{x}-0x{x}: \n", l, h)
- rtdump(rt.link[i], ind + 1)
- ;;
- indent(ind)
- std.put("}\n")
-}
-
-const indent = {ind
- var i
- for i = 0; i < ind; i++
- std.put("\t")
- ;;
-}
-
-const rtinsert = {rt, lo, hi
- var a, b
- var n
-
- std.assert(lo.len == hi.len, "range sizes differ")
- if lo.len == 0
- rt.end = true
- ->
- ;;
-
- n = rt.ranges.len
- if n == 0
- rt.ranges = std.slpush(rt.ranges, (lo[0], hi[0]))
- rt.link = std.slpush(rt.link, std.zalloc())
- else
- /*
- this is a safe way to compare because we know that ranges
- should always be coming in ordered. This means that equal
- values will be added one after the other.
- */
- (a, b) = rt.ranges[n - 1]
- if a != lo[0] || b != hi[0]
- rt.ranges = std.slpush(rt.ranges, (lo[0], hi[0]))
- rt.link = std.slpush(rt.link, std.zalloc())
- ;;
- ;;
-
- rtinsert(rt.link[rt.link.len - 1], lo[1:], hi[1:])
-}
-
-const rtfree = {rt
- for l in rt.link
- rtfree(l)
- ;;
- std.slfree(rt.link)
- std.slfree(rt.ranges)
- std.free(rt)
-}
-
-const rangegen = {re, rt, ranges, links, end
- var alt, l0, l1, l2
- var a, b
- var n
-
- n = ranges.len
- if n == 0
- -> re.proglen
- elif n == 1
- (a, b) = ranges[0]
- append(re, `Irange (a, b))
- if links[0].end
- if links[0].ranges.len > 0
- append(re, `Ifork (re.prog.len + 1, end))
- else
- append(re, `Ijmp end)
- ;;
- ;;
- rangegen(re, links[0], links[0].ranges, links[0].link, end)
- else
- alt = re.proglen
- l0 = append(re, `Ifork (-1, -1))
- l1 = rangegen(re, rt, ranges[0:n/2], links[0:n/2], end)
- l2 = rangegen(re, rt, ranges[n/2:n], links[n/2:n], end)
- re.prog[alt] = `Ifork (l0, l1)
- ;;
- -> re.proglen
-}
-
-const rangeprogsize = {rt
- var sz
-
- if rt.ranges.len == 0
- sz = 0
- else
- sz = 2*rt.ranges.len - 1
- for l in rt.link
- sz += rangeprogsize(l)
- ;;
- ;;
- if rt.end
- sz += 1
- ;;
- -> sz
-}
-
-/* calculates the forward jump distance for a utf8 character range */
-const jmpdist = {n
- var d
- var i
-
- d = n - 1
- for i = n - 1; i > 0; i--
- d += i
- ;;
- -> d
-}
-
-/* generates an alternation */
-const genalt = {re, l, r
- var alt
- var jmp
- var l0
- var l1
- var l2
-
- alt = re.proglen
- l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
- gen(re, l)
- jmp = re.proglen
- l1 = append(re, `Ijmp -1) /* needs to be replaced */
- l2 = gen(re, r)
-
- re.prog[alt] = `Ifork(l0, l1)
- re.prog[jmp] = `Ijmp l2
- -> re.proglen
-}
-
-/* generates a repetition operator */
-const genstar = {re, rep, reluct
- var alt
- var jmp
- var l0
- var l1
- var l2
-
- l0 = re.proglen
- alt = re.proglen
- l1 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
- jmp = gen(re, rep)
- l2 = append(re, `Ijmp -1)
-
-
- /* reluctant matches should prefer jumping to the end. */
- if reluct
- re.prog[alt] = `Ifork (l2, l1)
- else
- re.prog[alt] = `Ifork (l1, l2)
- ;;
- re.prog[jmp] = `Ijmp l0
- -> re.proglen
-}
-
-/* generates a question mark operator */
-const genquest = {re, q
- var alt
- var l0
- var l1
-
- alt = re.proglen
- l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
- l1 = gen(re, q)
- re.prog[alt] = `Ifork (l0, l1)
- -> re.proglen
-}
-
-/* generates a single char match */
-const genchar = {re, c
- var b : byte[4]
- var n
- var i
-
- n = std.encode(b[:], c)
- std.assert(n > 0 && n < 4, "non-utf character in regex\n")
- for i = 0; i < n; i++
- append(re, `Ibyte b[i])
- ;;
- -> re.proglen
-}
-
-/* appends an instructon to an re program */
-const append = {re, insn
- if re.proglen == re.prog.len
- re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen))
- ;;
- re.prog[re.proglen] = insn
- re.proglen++
- -> re.proglen
-}
-
-/* instruction dump */
-const idump = {re
- var i
-
- if !re.debug
- ->
- ;;
- for i = 0; i < re.proglen; i++
- std.put("{}:\t", i)
- match re.prog[i]
- /* Char matching. Consume exactly one byte from the string. */
- | `Ibyte b: std.put("`Ibyte {} ({})\n", b, b castto(char))
- | `Irange (start, end):
- std.put("`Irange ({},{})", start, end)
- if std.isalnum(start castto(char)) && std.isalnum(end castto(char))
- std.put("\t/* {}-{} */", start castto(char), end castto(char))
- ;;
- std.put("\n")
- /* capture groups */
- | `Ilbra m: std.put("`Ilbra {}\n", m)
- | `Irbra m: std.put("`Irbra {}\n", m)
- /* anchors */
- | `Ibol: std.put("`Ibol\n")
- | `Ieol: std.put("`Ieol\n")
- | `Ibow: std.put("`Ibow\n")
- | `Ieow: std.put("`Ieow\n")
- /* control flow */
- | `Ifork (lip, rip): std.put("`Ifork ({},{})\n", lip, rip)
- | `Ijmp ip: std.put("`Ijmp {}\n", ip)
- | `Imatch id: std.put("`Imatch {}\n", id)
- ;;
- ;;
-}
-
-/* AST dump */
-const dump = {re, t, indent
- var i
-
- if !re.debug
- ->
- ;;
- for i = 0; i < indent; i++
- std.put(" ")
- ;;
- match t#
- | `Alt (a, b):
- std.put("Alt\n")
- dump(re, a, indent + 1)
- dump(re, b, indent + 1)
- | `Cat (a, b):
- std.put("Cat\n")
- dump(re, a, indent + 1)
- dump(re, b, indent + 1)
- /* repetition */
- | `Star a:
- std.put("Star\n")
- dump(re, a, indent + 1)
- | `Rstar a:
- std.put("Rstar\n")
- dump(re, a, indent + 1)
- | `Plus a:
- std.put("Plus\n")
- dump(re, a, indent + 1)
- | `Rplus a:
- std.put("Rplus\n")
- dump(re, a, indent + 1)
- | `Quest a:
- std.put("Quest\n")
- dump(re, a, indent + 1)
- | `Bol:
- std.put("Bol\n")
- | `Eol:
- std.put("Eol\n")
- | `Bow:
- std.put("Bow\n")
- | `Eow:
- std.put("Eow\n")
- /* end matches */
- | `Chr c:
- std.put("Char {}\n", c)
- | `Ranges rl:
- std.put("Ranges")
- for r in rl
- for i = 0; i < indent + 1; i++
- std.put(" ")
- ;;
- std.put("\t({}-{})\n", r[0], r[1])
- ;;
-
- /* meta */
- | `Cap (m, a):
- std.put("Cap {}\n", m)
- dump(re, a, indent + 1)
- ;;
-}
-
-/* parses an expression */
-const regexparse = {re
- match altexpr(re)
- | `Some t:
- if re.pat.len == 0
- -> `Some t
- else
- astfree(t)
- -> `Fail `Incomplete
- ;;
- | `None:
- -> `None
- | `Fail st:
- -> `Fail st
- ;;
-}
-
-const altexpr = {re
- var ret
-
- match catexpr(re)
- | `Some t:
- ret = t
- if matchc(re, '|')
- match altexpr(re)
- | `Some rhs:
- ret = mk(`Alt (ret, rhs))
- | `None:
- astfree(ret)
- -> `Fail (`Incomplete)
- | `Fail f:
- -> `Fail f
- ;;
- ;;
- | other:
- -> other
- ;;
- -> `Some ret
-}
-
-const catexpr = {re
- var ret
-
- match repexpr(re)
- | `Some t:
- ret = t
- match catexpr(re)
- | `Some rhs:
- ret = mk(`Cat (t, rhs))
- | `Fail f: -> `Fail f
- | `None: /* nothing */
- ;;
- | other:
- -> other
- ;;
- -> `Some ret
-}
-
-const repexpr = {re
- var ret
-
- match baseexpr(re)
- | `Some t:
- if matchc(re, '*')
- if matchc(re, '?')
- ret = mk(`Rstar t)
- else
- ret = mk(`Star t)
- ;;
- elif matchc(re, '+')
- if matchc(re, '?')
- ret = mk(`Rplus t)
- else
- ret = mk(`Plus t)
- ;;
- elif matchc(re, '?')
- ret = mk(`Quest t)
- else
- ret = t
- ;;
- | other:
- -> other
- ;;
- -> `Some ret
-}
-
-const baseexpr = {re
- var ret, m
-
- if re.pat.len == 0
- -> `None
- ;;
- match peekc(re)
- /* lower prec operators */
- | '|': -> `None
- | ')': -> `None
- | '*': -> `Fail `Badrep
- | '+': -> `Fail `Badrep
- | '?': -> `Fail `Badrep
- | '[': -> chrclass(re)
- | '.': getc(re); ret = mk(`Ranges std.slpush([][:], [0, std.Maxcharval]))
- | '^': getc(re); ret = mk(`Bol)
- | '$': getc(re); ret = mk(`Eol)
- | '(':
- m = re.nmatch++
- getc(re)
- match altexpr(re)
- | `Some s:
- if matchc(re, ')')
- -> `Some mk(`Cap (m, s))
- else
- -> `Fail `Unbalanced
- ;;
- | `None: -> `Fail `Emptyparen
- | `Fail st: -> `Fail st
- ;;
- | '\\':
- getc(re) /* consume the slash */
- if re.pat.len == 0
- -> `Fail `Incomplete
- ;;
- -> escaped(re)
- | c:
- getc(re)
- ret = mk(`Chr c)
- ;;
- -> `Some ret
-}
-
-const escaped = {re
- var ret
-
- match getc(re)
- /* character classes */
- | 'd': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciidigit[:]))
- | 'x': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciixdigit[:]))
- | 's': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciispace[:]))
- | 'w': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciiword[:]))
- | 'h': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciiblank[:]))
-
- /* negated character classes */
- | 'W': ret = `Some mk(`Ranges negate(_ranges.tabasciiword[:]))
- | 'S': ret = `Some mk(`Ranges negate(_ranges.tabasciispace[:]))
- | 'D': ret = `Some mk(`Ranges negate(_ranges.tabasciidigit[:]))
- | 'X': ret = `Some mk(`Ranges negate(_ranges.tabasciixdigit[:]))
- | 'H': ret = `Some mk(`Ranges negate(_ranges.tabasciiblank[:]))
-
- /* unicode character classes */
- | 'p': ret = unicodeclass(re, false)
- | 'P': ret = unicodeclass(re, true)
-
- /* operators that need an escape */
- | '<': ret = `Some mk(`Bow)
- | '>': ret = `Some mk(`Eow)
-
- /* escaped metachars */
- | '^': ret = `Some mk(`Chr '^')
- | '$': ret = `Some mk(`Chr '$')
- | '.': ret = `Some mk(`Chr '.')
- | '+': ret = `Some mk(`Chr '+')
- | '?': ret = `Some mk(`Chr '?')
- | chr: ret = `Fail `Badescape
- ;;
- -> ret
-}
-
-const unicodeclass = {re, neg
- var c, s
- var tab
- var t
- var n
-
- if re.pat.len == 0
- -> `Fail (`Incomplete)
- ;;
- n = 0
- s = re.pat
- /* either a single char pattern, or {pat} */
- match getc(re)
- | '{':
- s = s[1:]
- while re.pat.len > 0
- c = getc(re)
- if c == '}'
- break
- ;;
- n += std.charlen(c)
- ;;
- | r:
- n += std.charlen(r)
- ;;
- s = s[:n]
- /* letters */
- if std.sleq(s, "L") || std.sleq(s, "Letter")
- tab = _ranges.tabalpha[:]
- elif std.sleq(s, "Lu") || std.sleq(s, "Uppercase_Letter")
- tab = _ranges.tabupper[:]
- elif std.sleq(s, "Ll") || std.sleq(s, "Lowercase_Letter")
- tab = _ranges.tablower[:]
- elif std.sleq(s, "Lt") || std.sleq(s, "Titlecase_Letter")
- tab = _ranges.tablower[:]
- /* numbers (incomplete) */
- elif std.sleq(s, "N") || std.sleq(s, "Number")
- tab = _ranges.tabdigit[:]
- elif std.sleq(s, "Z") || std.sleq(s, "Separator")
- tab = _ranges.tabspace[:]
- elif std.sleq(s, "Zs") || std.sleq(s, "Space_Separator")
- tab = _ranges.tabblank[:]
- else
- -> `Fail (`Badrange)
- ;;
- if !neg
- t = mk(`Ranges std.sldup(tab))
- else
- t = mk(`Ranges negate(tab))
- ;;
- -> `Some t
-}
-
-const chrclass = {re
- var rl, m, n
- var neg
- var t
-
- /* we know we saw '[' on entry */
- matchc(re, '[')
- neg = false
- if matchc(re, '^')
- neg = true
- ;;
- rl = rangematch(re, [][:])
- while peekc(re) != ']' && re.pat.len > 0
- rl = rangematch(re, rl)
- ;;
- if !matchc(re, ']')
- std.slfree(rl)
- -> `Fail `Unbalanced
- ;;
-
- std.sort(rl, {a, b;
- if a[0] < b[0]
- -> `std.Before
- elif a[0] == b[0]
- -> `std.Equal
- else
- -> `std.After
- ;;})
- m = merge(rl)
- std.slfree(rl)
- if neg
- n = negate(m)
- std.slfree(m)
- t = mk(`Ranges n)
- else
- t = mk(`Ranges m)
- ;;
- -> `Some t
-}
-
-const rangematch = {re, sl
- var lo
- var hi
-
- lo = getc(re)
- if matchc(re, '-')
- hi = getc(re)
- if lo <= hi
- -> std.slpush(sl, [lo, hi])
- else
- -> std.slpush(sl, [hi, lo])
- ;;
- else
- -> std.slpush(sl, [lo, lo])
- ;;
-}
-
-const negate = {rng
- var start, end, next
- var neg
-
- neg = [][:]
- start = 0
- next = 0 /* if we have no ranges */
- for r in rng
- (end, next) = (r[0], r[1])
- neg = std.slpush(neg, [start, end - 1])
- start = next + 1
- ;;
- neg = std.slpush(neg, [next + 1, std.Maxcharval])
- -> neg
-}
-
-/* rl is a sorted list of ranges */
-const merge = {rl
- var lo, hi
- var ret
-
- if rl.len == 0
- -> [][:]
- ;;
- ret = [][:]
- lo = rl[0][0]
- hi = rl[0][1]
- for r in rl[1:]
- /* if it overlaps or abuts, merge */
- if r[0] <= hi + 1
- hi = r[1]
- else
- ret = std.slpush(ret, [lo, hi])
- lo = r[0]
- hi = r[1]
- ;;
- ;;
- -> std.slpush(ret, [lo, hi])
-}
-
-
-const matchc = {re, c
- var str
- var chr
-
- (chr, str) = std.striter(re.pat)
- if chr != c
- -> false
- ;;
- re.pat = str
- -> true
-}
-
-const getc = {re
- var c
-
- (c, re.pat) = std.striter(re.pat)
- -> c
-}
-
-const peekc = {re
- var c
-
- (c, _) = std.striter(re.pat)
- -> c
-}
-
-const mk = {v
- var t
-
- t = std.alloc()
- t# = v
- -> t
-}
-
-const astfree = {t
- match t#
- | `Alt (a, b): astfree(a); astfree(b)
- | `Cat (a, b): astfree(a); astfree(b)
- /* repetition */
- | `Star a: astfree(a)
- | `Rstar a: astfree(a)
- | `Plus a: astfree(a)
- | `Rplus a: astfree(a)
- | `Quest a: astfree(a)
-
- /* end matches */
- | `Chr c:
- | `Ranges rl: std.slfree(rl)
-
- /* meta */
- | `Cap (m, a): astfree(a)
- | _: /* other types have no suballocations */
- ;;
- std.free(t)
-}
-
-const failmsg = {st
- match st
- | `Noimpl: -> "no implementation"
- | `Incomplete: -> "regex ended before input fully parsed"
- | `Unbalanced: -> "unbalanced bracket"
- | `Emptyparen: -> "empty parentheses"
- | `Badrep: -> "invalid repetition"
- | `Badrange: -> "invalid range"
- | `Badescape: -> "invalid escape code"
-
- ;;
-}
-