diff options
author | Ori Bernstein <ori@markovcorp.com> | 2017-12-28 14:59:28 -0800 |
---|---|---|
committer | Ori Bernstein <ori@markovcorp.com> | 2017-12-28 14:59:28 -0800 |
commit | 75ddf8edac3ea43fee7b7bdafe9e7799fb3571aa (patch) | |
tree | d60c3bfd10e3de66cde173e666ba4e5f81a777b3 | |
parent | 929f3dcbc45901d30b40f06826443424825c251e (diff) | |
download | mc-75ddf8edac3ea43fee7b7bdafe9e7799fb3571aa.tar.gz |
Performance increase.
Actually using integers over unions speeds things up a lot.
-rw-r--r-- | lib/regex/compile.myr | 23 | ||||
-rw-r--r-- | lib/regex/interp.myr | 169 | ||||
-rw-r--r-- | lib/regex/types.myr | 18 |
3 files changed, 125 insertions, 85 deletions
diff --git a/lib/regex/compile.myr b/lib/regex/compile.myr index 6a82cc6..3069d27 100644 --- a/lib/regex/compile.myr +++ b/lib/regex/compile.myr @@ -73,11 +73,34 @@ const regexcompile = {re, id append(re, `Imatch id, t) idump(re) astfree(t) + re.code = convert(re.prog) -> `std.Ok re ;; -> `std.Err (`Noimpl) } +const convert = {prog + var bc + + bc = [][:] + for p : prog + match p + | `Ibyte b: std.slpush(&bc, OpByte | ((b : recode) << 4)) + | `Irange (a, b): std.slpush(&bc, OpRange | ((a : recode) << 4) | ((b : recode) << 16)) + | `Ilbra m: std.slpush(&bc, OpLbra | ((m : recode) << 4)) + | `Irbra m: std.slpush(&bc, OpRbra | ((m : recode) << 4)) + | `Ibol: std.slpush(&bc, OpBol) + | `Ieol: std.slpush(&bc, OpEol) + | `Ibow: std.slpush(&bc, OpBow) + | `Ieow: std.slpush(&bc, OpEow) + | `Ijmp d: std.slpush(&bc, OpJmp | ((d : recode) << 4)) + | `Imatch m: std.slpush(&bc, OpMatch | ((m : recode) << 4)) + | `Ifork (a, b): std.slpush(&bc, OpFork | ((a : recode) << 4) | ((b : recode) << 34)) + ;; + ;; + -> bc +} + const free = {re /* all the threads should be dead, so we shouldn't have to free any*/ diff --git a/lib/regex/interp.myr b/lib/regex/interp.myr index 87528a2..895fe7e 100644 --- a/lib/regex/interp.myr +++ b/lib/regex/interp.myr @@ -252,10 +252,6 @@ const run = {re, str, idx, wholestr thr = re.runq re.runq = thr.next - if re.trace - trace(re, thr, "\nrunning tid={}, ip={}, s[{}]={}\n", \ - thr.tid, thr.ip, re.strp, std.decode(re.str[re.strp:])) - ;; ip = thr.ip consumed = step(re, thr, -1) while !consumed @@ -263,13 +259,12 @@ const run = {re, str, idx, wholestr ;; if std.bshas(states, thr.ip) - die(re, thr, "there can be only one") + die(re, thr) ;; if thr.dead thrfree(re, thr) elif thr.matched - trace(re, thr, "new bestmatch\n") if bestmatch != Zthr thrfree(re, bestmatch) ;; @@ -311,117 +306,112 @@ const run = {re, str, idx, wholestr consumed, false otherwise. */ const step = {re, thr, curip - var str - var nthr + var str, nthr, inst str = re.str - match re.prog[thr.ip] + inst = re.code[thr.ip] + if re.trace + itrace(re, thr, re.prog[thr.ip]) + ;; + if re.debug + std.bsput(re.traces[thr.tid], thr.ip) + ;; + match inst & 0xf /* Char matching. Consume exactly one byte from the string. */ - | `Ibyte b: - trace(re, thr, "\t{}:\tByte {} ({})\n", thr.ip, b, (b : char)) - if !within(re, str) - die(re, thr, "end of string") - elif b != str[re.strp] - die(re, thr, "not right char") + | OpRange: + var lo = (inst >> 4 : byte) + var hi = (inst >> 16 : byte) + + if !within(re, str) || lo > str[re.strp] || hi < str[re.strp] + die(re, thr) else - hit(re, thr) thr.ip++ - trace(re, thr, "\t\tmatched {} with {}\n", b, str[re.strp]) ;; - | `Irange (start, end): - trace(re, thr, "\t{}:\tRange ({}, {}) /* {} - {} */\n", thr.ip, start, end, (start : char), (end : char)) - if !within(re, str) || start > str[re.strp] || end < str[re.strp] - die(re, thr, "bad range") + | OpByte: + var b = (inst >> 4 : byte) + if !within(re, str) + die(re, thr) + elif b != str[re.strp] + die(re, thr) else - hit(re, thr) thr.ip++ ;; + | OpFork: + var lip = ((inst >> 4) & 0x3fffffff : std.size) + var rip = ((inst >> 34) & 0x3fffffff : std.size) + if rip != curip + nthr = mkthread(re, rip) + nthr.next = re.runq + nthr.mgroup = thr.mgroup + re.runq = nthr + ;; + if re.debug + std.slpush(&re.traces, std.bsdup(re.traces[thr.tid])) + ;; + thr.ip = lip + -> false /* Non-consuming. All of these return false, and expect step to be called again until exactly one byte is consumed from the string. */ - | `Ibol: - trace(re, thr, "\t{}:\tBol\n", thr.ip) + | OpJmp: + var ip = (inst >> 4 : std.size) + thr.ip = ip + -> false + | OpMatch: + var id = (inst >> 4 : std.size) + re.lastthr = thr.tid + finish(re, thr) + -> true + | OpLbra: + var m = (inst >> 4 : std.size) + thr.mgroup[m][0] = re.strp + thr.ip++ + -> false + | OpRbra: + var m = (inst >> 4 : std.size) + thr.mgroup[m][1] = re.strp + thr.ip++ + -> false + | OpBol: if re.strp == 0 || str[re.strp - 1] == ('\n' : byte) - hit(re, thr) thr.ip++ -> false else - die(re, thr, "not beginning of line") + die(re, thr) ;; - | `Ieol: - trace(re, thr, "\t{}:\tEol\n", thr.ip) + | OpEol: if re.strp == str.len || str[re.strp] == ('\n' : byte) - hit(re, thr) thr.ip++ -> false else - die(re, thr, "not end of line") + die(re, thr) ;; /* check for word characters */ - | `Ibow: - trace(re, thr, "\t{}:\tBow\n", thr.ip) + | OpBow: if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp))) - hit(re, thr) thr.ip++ -> false else - die(re, thr, "not beginning of word") + die(re, thr) ;; - | `Ieow: - trace(re, thr, "\t{}:\tEow\n", thr.ip) + | OpEow: if re.strp == str.len && iswordchar(prevchar(str, re.strp)) - hit(re, thr) thr.ip++ -> false elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp)) - hit(re, thr) thr.ip++ -> false else - die(re, thr, "not end of word") - ;; - | `Ilbra m: - trace(re, thr, "\t{}:\tLbra {}\n", thr.ip, m) - thr.mgroup[m][0] = re.strp - hit(re, thr) - thr.ip++ - -> false - | `Irbra m: - trace(re, thr, "\t{}:\tRbra {}\n", thr.ip, m) - thr.mgroup[m][1] = re.strp - hit(re, thr) - thr.ip++ - -> false - | `Ifork (lip, rip): - trace(re, thr, "\t{}:\tFork ({}, {})\n", thr.ip, lip, rip) - hit(re, thr) - if rip != curip - nthr = mkthread(re, rip) - nthr.next = re.runq - nthr.mgroup = thr.mgroup - re.runq = nthr - ;; - if re.debug - std.slpush(&re.traces, std.bsdup(re.traces[thr.tid])) + die(re, thr) ;; - thr.ip = lip - -> false - | `Ijmp ip: - trace(re, thr, "\t{}:\tJmp {}\n", thr.ip, ip) - hit(re, thr) - thr.ip = ip - -> false - | `Imatch id: - trace(re, thr, "\t{}:\tMatch\n", thr.ip) - re.lastthr = thr.tid - finish(re, thr) - -> true + | _: + std.die("bad match") ;; -> true } -const die = {re, thr, msg +const die = {re, thr /* we can have die called on a thread multiple times, eg, if it has a bad @@ -429,7 +419,6 @@ const die = {re, thr, msg thread is in. We should only decrement the number of threads for that once. */ - trace(re, thr, "\t\tdie {}: {}\n", thr.tid, msg) if !thr.dead re.nthr-- ;; @@ -439,7 +428,6 @@ const die = {re, thr, msg } const finish = {re, thr - trace(re, thr, "finish {}\n", thr.tid) thr.matched = true re.nthr-- } @@ -478,6 +466,25 @@ const within = {re, str -> re.strp < str.len } +const itrace = {re, thr, inst + match inst + | `Ibyte b: trace(re, thr, "\t{}: Byte ({})\n", thr.ip, b) + | `Irange (lo, hi): trace(re, thr, "\t{}: Range {}, {}\n", thr.ip, lo, hi) + | `Ilbra m: trace(re, thr, "\t{}: Lbra {}\n", thr.ip, m) + | `Irbra m: trace(re, thr, "\t{}: Rbra {}\n", thr.ip, m) + /* anchors */ + | `Ibol: trace(re, thr, "\t{}: Bol\n", thr.ip) + | `Ieol: trace(re, thr, "\t{}: Eol\n", thr.ip) + | `Ibow: trace(re, thr, "\t{}: Bow\n", thr.ip) + | `Ieow: trace(re, thr, "\t{}: Eow\n", thr.ip) + + /* control flow */ + | `Ifork (l, r): trace(re, thr, "\t{}: Fork {}, {}\n", thr.ip, l, r) + | `Ijmp ip: trace(re, thr, "\t{}: Jmp {}\n", thr.ip, ip) + | `Imatch m: trace(re, thr, "\t{}: Match {}\n", thr.ip, m) + ;; +} + const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) = {re, thr, msg, args var ap @@ -493,12 +500,6 @@ const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) ;; } -const hit = {re, thr - if re.debug - std.bsput(re.traces[thr.tid], thr.ip) - ;; -} - /* must be called with i >= 1 */ const prevchar = {s, i std.assert(i != 0, "prevchar must be called with i >= 1\n") diff --git a/lib/regex/types.myr b/lib/regex/types.myr index 75db777..4d65031 100644 --- a/lib/regex/types.myr +++ b/lib/regex/types.myr @@ -27,6 +27,7 @@ pkg regex = nfree : std.size proglen : std.size prog : reinst[:] + code : recode[:] nthr : std.size str : byte[:] strp : std.size @@ -70,10 +71,25 @@ pkg regex = dead : bool /* thread died */ matched : bool /* thread matched */ - mgroup : std.size[2][32] /* match starts */ + mgroup : std.size[2][16] /* match starts */ ;; + pkglocal type recode = uint64 + + /* can have at most up to 0xf ops */ + const OpByte : recode = 0x0 + const OpRange : recode = 0x1 + const OpLbra : recode = 0x2 + const OpRbra : recode = 0x3 + const OpBol : recode = 0x4 + const OpEol : recode = 0x5 + const OpBow : recode = 0x6 + const OpEow : recode = 0x7 + const OpFork : recode = 0x8 + const OpJmp : recode = 0x9 + const OpMatch : recode = 0xa + pkglocal type reinst = union /* direct consumers */ `Ibyte byte |