summaryrefslogtreecommitdiff
path: root/lib/regex
diff options
context:
space:
mode:
authorOri Bernstein <ori@markovcorp.com>2017-12-28 14:59:28 -0800
committerOri Bernstein <ori@markovcorp.com>2017-12-28 14:59:28 -0800
commit75ddf8edac3ea43fee7b7bdafe9e7799fb3571aa (patch)
treed60c3bfd10e3de66cde173e666ba4e5f81a777b3 /lib/regex
parent929f3dcbc45901d30b40f06826443424825c251e (diff)
downloadmc-75ddf8edac3ea43fee7b7bdafe9e7799fb3571aa.tar.gz
Performance increase.
Actually using integers over unions speeds things up a lot.
Diffstat (limited to 'lib/regex')
-rw-r--r--lib/regex/compile.myr23
-rw-r--r--lib/regex/interp.myr169
-rw-r--r--lib/regex/types.myr18
3 files changed, 125 insertions, 85 deletions
diff --git a/lib/regex/compile.myr b/lib/regex/compile.myr
index 6a82cc6..3069d27 100644
--- a/lib/regex/compile.myr
+++ b/lib/regex/compile.myr
@@ -73,11 +73,34 @@ const regexcompile = {re, id
append(re, `Imatch id, t)
idump(re)
astfree(t)
+ re.code = convert(re.prog)
-> `std.Ok re
;;
-> `std.Err (`Noimpl)
}
+const convert = {prog
+ var bc
+
+ bc = [][:]
+ for p : prog
+ match p
+ | `Ibyte b: std.slpush(&bc, OpByte | ((b : recode) << 4))
+ | `Irange (a, b): std.slpush(&bc, OpRange | ((a : recode) << 4) | ((b : recode) << 16))
+ | `Ilbra m: std.slpush(&bc, OpLbra | ((m : recode) << 4))
+ | `Irbra m: std.slpush(&bc, OpRbra | ((m : recode) << 4))
+ | `Ibol: std.slpush(&bc, OpBol)
+ | `Ieol: std.slpush(&bc, OpEol)
+ | `Ibow: std.slpush(&bc, OpBow)
+ | `Ieow: std.slpush(&bc, OpEow)
+ | `Ijmp d: std.slpush(&bc, OpJmp | ((d : recode) << 4))
+ | `Imatch m: std.slpush(&bc, OpMatch | ((m : recode) << 4))
+ | `Ifork (a, b): std.slpush(&bc, OpFork | ((a : recode) << 4) | ((b : recode) << 34))
+ ;;
+ ;;
+ -> bc
+}
+
const free = {re
/* all the threads should be dead,
so we shouldn't have to free any*/
diff --git a/lib/regex/interp.myr b/lib/regex/interp.myr
index 87528a2..895fe7e 100644
--- a/lib/regex/interp.myr
+++ b/lib/regex/interp.myr
@@ -252,10 +252,6 @@ const run = {re, str, idx, wholestr
thr = re.runq
re.runq = thr.next
- if re.trace
- trace(re, thr, "\nrunning tid={}, ip={}, s[{}]={}\n", \
- thr.tid, thr.ip, re.strp, std.decode(re.str[re.strp:]))
- ;;
ip = thr.ip
consumed = step(re, thr, -1)
while !consumed
@@ -263,13 +259,12 @@ const run = {re, str, idx, wholestr
;;
if std.bshas(states, thr.ip)
- die(re, thr, "there can be only one")
+ die(re, thr)
;;
if thr.dead
thrfree(re, thr)
elif thr.matched
- trace(re, thr, "new bestmatch\n")
if bestmatch != Zthr
thrfree(re, bestmatch)
;;
@@ -311,117 +306,112 @@ const run = {re, str, idx, wholestr
consumed, false otherwise.
*/
const step = {re, thr, curip
- var str
- var nthr
+ var str, nthr, inst
str = re.str
- match re.prog[thr.ip]
+ inst = re.code[thr.ip]
+ if re.trace
+ itrace(re, thr, re.prog[thr.ip])
+ ;;
+ if re.debug
+ std.bsput(re.traces[thr.tid], thr.ip)
+ ;;
+ match inst & 0xf
/* Char matching. Consume exactly one byte from the string. */
- | `Ibyte b:
- trace(re, thr, "\t{}:\tByte {} ({})\n", thr.ip, b, (b : char))
- if !within(re, str)
- die(re, thr, "end of string")
- elif b != str[re.strp]
- die(re, thr, "not right char")
+ | OpRange:
+ var lo = (inst >> 4 : byte)
+ var hi = (inst >> 16 : byte)
+
+ if !within(re, str) || lo > str[re.strp] || hi < str[re.strp]
+ die(re, thr)
else
- hit(re, thr)
thr.ip++
- trace(re, thr, "\t\tmatched {} with {}\n", b, str[re.strp])
;;
- | `Irange (start, end):
- trace(re, thr, "\t{}:\tRange ({}, {}) /* {} - {} */\n", thr.ip, start, end, (start : char), (end : char))
- if !within(re, str) || start > str[re.strp] || end < str[re.strp]
- die(re, thr, "bad range")
+ | OpByte:
+ var b = (inst >> 4 : byte)
+ if !within(re, str)
+ die(re, thr)
+ elif b != str[re.strp]
+ die(re, thr)
else
- hit(re, thr)
thr.ip++
;;
+ | OpFork:
+ var lip = ((inst >> 4) & 0x3fffffff : std.size)
+ var rip = ((inst >> 34) & 0x3fffffff : std.size)
+ if rip != curip
+ nthr = mkthread(re, rip)
+ nthr.next = re.runq
+ nthr.mgroup = thr.mgroup
+ re.runq = nthr
+ ;;
+ if re.debug
+ std.slpush(&re.traces, std.bsdup(re.traces[thr.tid]))
+ ;;
+ thr.ip = lip
+ -> false
/*
Non-consuming. All of these return false, and expect step to be
called again until exactly one byte is consumed from the string.
*/
- | `Ibol:
- trace(re, thr, "\t{}:\tBol\n", thr.ip)
+ | OpJmp:
+ var ip = (inst >> 4 : std.size)
+ thr.ip = ip
+ -> false
+ | OpMatch:
+ var id = (inst >> 4 : std.size)
+ re.lastthr = thr.tid
+ finish(re, thr)
+ -> true
+ | OpLbra:
+ var m = (inst >> 4 : std.size)
+ thr.mgroup[m][0] = re.strp
+ thr.ip++
+ -> false
+ | OpRbra:
+ var m = (inst >> 4 : std.size)
+ thr.mgroup[m][1] = re.strp
+ thr.ip++
+ -> false
+ | OpBol:
if re.strp == 0 || str[re.strp - 1] == ('\n' : byte)
- hit(re, thr)
thr.ip++
-> false
else
- die(re, thr, "not beginning of line")
+ die(re, thr)
;;
- | `Ieol:
- trace(re, thr, "\t{}:\tEol\n", thr.ip)
+ | OpEol:
if re.strp == str.len || str[re.strp] == ('\n' : byte)
- hit(re, thr)
thr.ip++
-> false
else
- die(re, thr, "not end of line")
+ die(re, thr)
;;
/* check for word characters */
- | `Ibow:
- trace(re, thr, "\t{}:\tBow\n", thr.ip)
+ | OpBow:
if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
- hit(re, thr)
thr.ip++
-> false
else
- die(re, thr, "not beginning of word")
+ die(re, thr)
;;
- | `Ieow:
- trace(re, thr, "\t{}:\tEow\n", thr.ip)
+ | OpEow:
if re.strp == str.len && iswordchar(prevchar(str, re.strp))
- hit(re, thr)
thr.ip++
-> false
elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp))
- hit(re, thr)
thr.ip++
-> false
else
- die(re, thr, "not end of word")
- ;;
- | `Ilbra m:
- trace(re, thr, "\t{}:\tLbra {}\n", thr.ip, m)
- thr.mgroup[m][0] = re.strp
- hit(re, thr)
- thr.ip++
- -> false
- | `Irbra m:
- trace(re, thr, "\t{}:\tRbra {}\n", thr.ip, m)
- thr.mgroup[m][1] = re.strp
- hit(re, thr)
- thr.ip++
- -> false
- | `Ifork (lip, rip):
- trace(re, thr, "\t{}:\tFork ({}, {})\n", thr.ip, lip, rip)
- hit(re, thr)
- if rip != curip
- nthr = mkthread(re, rip)
- nthr.next = re.runq
- nthr.mgroup = thr.mgroup
- re.runq = nthr
- ;;
- if re.debug
- std.slpush(&re.traces, std.bsdup(re.traces[thr.tid]))
+ die(re, thr)
;;
- thr.ip = lip
- -> false
- | `Ijmp ip:
- trace(re, thr, "\t{}:\tJmp {}\n", thr.ip, ip)
- hit(re, thr)
- thr.ip = ip
- -> false
- | `Imatch id:
- trace(re, thr, "\t{}:\tMatch\n", thr.ip)
- re.lastthr = thr.tid
- finish(re, thr)
- -> true
+ | _:
+ std.die("bad match")
;;
-> true
}
-const die = {re, thr, msg
+const die = {re, thr
/*
we can have die called on a thread
multiple times, eg, if it has a bad
@@ -429,7 +419,6 @@ const die = {re, thr, msg
thread is in. We should only decrement
the number of threads for that once.
*/
- trace(re, thr, "\t\tdie {}: {}\n", thr.tid, msg)
if !thr.dead
re.nthr--
;;
@@ -439,7 +428,6 @@ const die = {re, thr, msg
}
const finish = {re, thr
- trace(re, thr, "finish {}\n", thr.tid)
thr.matched = true
re.nthr--
}
@@ -478,6 +466,25 @@ const within = {re, str
-> re.strp < str.len
}
+const itrace = {re, thr, inst
+ match inst
+ | `Ibyte b: trace(re, thr, "\t{}: Byte ({})\n", thr.ip, b)
+ | `Irange (lo, hi): trace(re, thr, "\t{}: Range {}, {}\n", thr.ip, lo, hi)
+ | `Ilbra m: trace(re, thr, "\t{}: Lbra {}\n", thr.ip, m)
+ | `Irbra m: trace(re, thr, "\t{}: Rbra {}\n", thr.ip, m)
+ /* anchors */
+ | `Ibol: trace(re, thr, "\t{}: Bol\n", thr.ip)
+ | `Ieol: trace(re, thr, "\t{}: Eol\n", thr.ip)
+ | `Ibow: trace(re, thr, "\t{}: Bow\n", thr.ip)
+ | `Ieow: trace(re, thr, "\t{}: Eow\n", thr.ip)
+
+ /* control flow */
+ | `Ifork (l, r): trace(re, thr, "\t{}: Fork {}, {}\n", thr.ip, l, r)
+ | `Ijmp ip: trace(re, thr, "\t{}: Jmp {}\n", thr.ip, ip)
+ | `Imatch m: trace(re, thr, "\t{}: Match {}\n", thr.ip, m)
+ ;;
+}
+
const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) = {re, thr, msg, args
var ap
@@ -493,12 +500,6 @@ const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void)
;;
}
-const hit = {re, thr
- if re.debug
- std.bsput(re.traces[thr.tid], thr.ip)
- ;;
-}
-
/* must be called with i >= 1 */
const prevchar = {s, i
std.assert(i != 0, "prevchar must be called with i >= 1\n")
diff --git a/lib/regex/types.myr b/lib/regex/types.myr
index 75db777..4d65031 100644
--- a/lib/regex/types.myr
+++ b/lib/regex/types.myr
@@ -27,6 +27,7 @@ pkg regex =
nfree : std.size
proglen : std.size
prog : reinst[:]
+ code : recode[:]
nthr : std.size
str : byte[:]
strp : std.size
@@ -70,10 +71,25 @@ pkg regex =
dead : bool /* thread died */
matched : bool /* thread matched */
- mgroup : std.size[2][32] /* match starts */
+ mgroup : std.size[2][16] /* match starts */
;;
+ pkglocal type recode = uint64
+
+ /* can have at most up to 0xf ops */
+ const OpByte : recode = 0x0
+ const OpRange : recode = 0x1
+ const OpLbra : recode = 0x2
+ const OpRbra : recode = 0x3
+ const OpBol : recode = 0x4
+ const OpEol : recode = 0x5
+ const OpBow : recode = 0x6
+ const OpEow : recode = 0x7
+ const OpFork : recode = 0x8
+ const OpJmp : recode = 0x9
+ const OpMatch : recode = 0xa
+
pkglocal type reinst = union
/* direct consumers */
`Ibyte byte