summaryrefslogtreecommitdiff
path: root/lib/regex
diff options
context:
space:
mode:
authorOri Bernstein <ori@eigenstate.org>2016-05-19 22:41:05 -0700
committerOri Bernstein <ori@eigenstate.org>2016-05-19 22:41:05 -0700
commitc3a7db11052192c3ae599604e0c3922a28a7f771 (patch)
treeca274cbb208662159690f613dc4a8438a6a0c4b8 /lib/regex
parent009cb9bf1139d5ddda4e464734f53eeb4ec10d74 (diff)
downloadmc-c3a7db11052192c3ae599604e0c3922a28a7f771.tar.gz
Add tracing of steps when in debug mode.
Diffstat (limited to 'lib/regex')
-rw-r--r--lib/regex/compile.myr80
-rw-r--r--lib/regex/interp.myr35
-rw-r--r--lib/regex/redump.myr32
-rw-r--r--lib/regex/test/basic.myr10
-rw-r--r--lib/regex/types.myr52
5 files changed, 142 insertions, 67 deletions
diff --git a/lib/regex/compile.myr b/lib/regex/compile.myr
index 7c4c56b..023d7ae 100644
--- a/lib/regex/compile.myr
+++ b/lib/regex/compile.myr
@@ -85,6 +85,10 @@ const free = {re
if re.debug
std.htfree(re.astloc)
std.slfree(re.pcidx)
+ for bs in re.traces
+ std.bsfree(bs)
+ ;;
+ std.slfree(re.traces)
;;
std.free(re)
}
@@ -93,13 +97,13 @@ const free = {re
/* generates bytecode from an AST */
const gen = {re, t
match t#
- |`Alt (a, b): genalt(re, a, b)
+ |`Alt (a, b): genalt(re, a, b, t)
|`Cat (a, b): gen(re, a); gen(re, b)
/* repetition */
- |`Star a: genstar(re, a, false)
- |`Rstar a: genstar(re, a, true)
- |`Plus a: gen(re, a); genstar(re, a, false)
- |`Rplus a: gen(re, a); genstar(re, a, true)
+ |`Star a: genstar(re, a, false, t)
+ |`Rstar a: genstar(re, a, true, t)
+ |`Plus a: gen(re, a); genstar(re, a, false, t)
+ |`Rplus a: gen(re, a); genstar(re, a, true, t)
|`Quest a: genquest(re, a)
/* end matches */
@@ -290,7 +294,7 @@ const jmpdist = {n
}
/* generates an alternation */
-const genalt = {re, l, r
+const genalt = {re, l, r, t
var alt
var jmp
var l0
@@ -298,10 +302,10 @@ const genalt = {re, l, r
var l2
alt = re.proglen
- l0 = append(re, `Ifork (-1, -1), l) /* needs to be replaced */
+ l0 = append(re, `Ifork (-1, -1), t) /* needs to be replaced */
gen(re, l)
jmp = re.proglen
- l1 = append(re, `Ijmp -1, r) /* needs to be replaced */
+ l1 = append(re, `Ijmp -1, t) /* needs to be replaced */
l2 = gen(re, r)
re.prog[alt] = `Ifork(l0, l1)
@@ -310,7 +314,7 @@ const genalt = {re, l, r
}
/* generates a repetition operator */
-const genstar = {re, rep, reluct
+const genstar = {re, rep, reluct, t
var alt
var jmp
var l0
@@ -319,9 +323,9 @@ const genstar = {re, rep, reluct
l0 = re.proglen
alt = re.proglen
- l1 = append(re, `Ifork (-1, -1), rep) /* needs to be replaced */
+ l1 = append(re, `Ifork (-1, -1), t) /* needs to be replaced */
jmp = gen(re, rep)
- l2 = append(re, `Ijmp -1, rep)
+ l2 = append(re, `Ijmp -1, t)
/* reluctant matches should prefer jumping to the end. */
@@ -419,6 +423,7 @@ const dump = {re, t, indent
for var i = 0; i < indent; i++
std.put(" ")
;;
+ std.put("{}/", std.htgetv(re.astloc, t, -1))
match t#
| `Alt (a, b):
std.put("Alt\n")
@@ -454,7 +459,7 @@ const dump = {re, t, indent
std.put("Eow\n")
/* end matches */
| `Chr c:
- std.put("Char {}\n", c)
+ std.put("Chr {}\n", c)
| `Ranges rl:
std.put("Ranges")
for r in rl
@@ -495,8 +500,8 @@ const altexpr = {re
match catexpr(re)
| `Some t:
ret = t
+ idx = re.idx
if matchc(re, '|')
- idx = re.idx
match altexpr(re)
| `Some rhs:
ret = mk(re, `Alt (ret, rhs), idx)
@@ -517,9 +522,9 @@ const catexpr = {re
var ret
var idx
+ idx = re.idx
match repexpr(re)
| `Some t:
- idx = re.idx
ret = t
match catexpr(re)
| `Some rhs:
@@ -569,6 +574,7 @@ const baseexpr = {re
if re.pat.len == re.idx
-> `None
;;
+ idx = re.idx
match peekc(re)
/* lower prec operators */
| '|': -> `None
@@ -581,10 +587,9 @@ const baseexpr = {re
| '$': getc(re); ret = mk(re, `Eol, re.idx)
| '.':
getc(re);
- ret = mk(re, `Ranges std.sldup([[0, std.Maxcharval]][:]), re.idx)
+ ret = mk(re, `Ranges std.sldup([[0, std.Maxcharval]][:]), idx)
| '(':
m = re.nmatch++
- idx = re.idx
getc(re)
match altexpr(re)
| `Some s:
@@ -604,44 +609,45 @@ const baseexpr = {re
-> escaped(re)
| c:
getc(re)
- ret = mk(re, `Chr c, re.idx)
+ ret = mk(re, `Chr c, idx)
;;
-> `Some ret
}
const escaped = {re
- var ret
+ var ret, idx
+ idx = re.idx
match getc(re)
/* character classes */
- | 'd': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciidigit[:]), re.idx)
- | 'x': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciixdigit[:]), re.idx)
- | 's': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciispace[:]), re.idx)
- | 'w': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiword[:]), re.idx)
- | 'h': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiblank[:]), re.idx)
+ | 'd': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciidigit[:]), idx)
+ | 'x': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciixdigit[:]), idx)
+ | 's': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciispace[:]), idx)
+ | 'w': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiword[:]), idx)
+ | 'h': ret = `Some mk(re, `Ranges std.sldup(_ranges.tabasciiblank[:]), idx)
/* negated character classes */
- | 'W': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiword[:]), re.idx)
- | 'S': ret = `Some mk(re, `Ranges negate(_ranges.tabasciispace[:]), re.idx)
- | 'D': ret = `Some mk(re, `Ranges negate(_ranges.tabasciidigit[:]), re.idx)
- | 'X': ret = `Some mk(re, `Ranges negate(_ranges.tabasciixdigit[:]), re.idx)
- | 'H': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiblank[:]), re.idx)
+ | 'W': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiword[:]), idx)
+ | 'S': ret = `Some mk(re, `Ranges negate(_ranges.tabasciispace[:]), idx)
+ | 'D': ret = `Some mk(re, `Ranges negate(_ranges.tabasciidigit[:]), idx)
+ | 'X': ret = `Some mk(re, `Ranges negate(_ranges.tabasciixdigit[:]), idx)
+ | 'H': ret = `Some mk(re, `Ranges negate(_ranges.tabasciiblank[:]), idx)
/* unicode character classes */
| 'p': ret = unicodeclass(re, false)
| 'P': ret = unicodeclass(re, true)
/* operators that need an escape */
- | '<': ret = `Some mk(re, `Bow, re.idx)
- | '>': ret = `Some mk(re, `Eow, re.idx)
+ | '<': ret = `Some mk(re, `Bow, idx)
+ | '>': ret = `Some mk(re, `Eow, idx)
/* escaped metachars */
- | '^': ret = `Some mk(re, `Chr '^', re.idx)
- | '$': ret = `Some mk(re, `Chr '$', re.idx)
- | '.': ret = `Some mk(re, `Chr '.', re.idx)
- | '+': ret = `Some mk(re, `Chr '+', re.idx)
- | '?': ret = `Some mk(re, `Chr '?', re.idx)
- | '*': ret = `Some mk(re, `Chr '*', re.idx)
+ | '^': ret = `Some mk(re, `Chr '^', idx)
+ | '$': ret = `Some mk(re, `Chr '$', idx)
+ | '.': ret = `Some mk(re, `Chr '.', idx)
+ | '+': ret = `Some mk(re, `Chr '+', idx)
+ | '?': ret = `Some mk(re, `Chr '?', idx)
+ | '*': ret = `Some mk(re, `Chr '*', idx)
| chr: ret = `Fail `Badescape chr
;;
-> ret
@@ -709,8 +715,8 @@ const chrclass = {re
var t
/* we know we saw '[' on entry */
- matchc(re, '[')
idx = re.idx
+ matchc(re, '[')
neg = false
if matchc(re, '^')
neg = true
diff --git a/lib/regex/interp.myr b/lib/regex/interp.myr
index 829384b..a203520 100644
--- a/lib/regex/interp.myr
+++ b/lib/regex/interp.myr
@@ -60,6 +60,7 @@ const cleanup = {re
next = thr.next
thrfree(re, thr)
;;
+ re.nexttid = 0
}
const matchfree = {m
@@ -92,6 +93,15 @@ const run = {re, wholestr
bestmatch = Zthr
states = std.mkbs()
re.runq = mkthread(re, 0)
+ if re.debug
+ /* The last run could have left things here, since we need this info after the run */
+ for bs in re.traces
+ std.bsfree(bs)
+ ;;
+ std.slfree(re.traces)
+ re.traces = [][:]
+ std.slpush(&re.traces, std.mkbs())
+ ;;
re.runq.mstart = std.slalloc(re.nmatch)
re.runq.mend = std.slalloc(re.nmatch)
for var i = 0; i < re.nmatch; i++
@@ -174,6 +184,7 @@ const step = {re, thr, curip
elif b != str[re.strp]
die(re, thr, "not right char")
else
+ hit(re, thr)
thr.ip++
trace(re, thr, "\t\tmatched {} with {}\n", b, str[re.strp])
;;
@@ -182,6 +193,7 @@ const step = {re, thr, curip
if !within(re, str) || start > str[re.strp] || end < str[re.strp]
die(re, thr, "bad range")
else
+ hit(re, thr)
thr.ip++
;;
/*
@@ -191,6 +203,7 @@ const step = {re, thr, curip
| `Ibol:
trace(re, thr, "\t{}:\tBol\n", thr.ip)
if re.strp == 0 || str[re.strp - 1] == ('\n' : byte)
+ hit(re, thr)
thr.ip++
-> false
else
@@ -199,6 +212,7 @@ const step = {re, thr, curip
| `Ieol:
trace(re, thr, "\t{}:\tEol\n", thr.ip)
if re.strp == str.len || str[re.strp] == ('\n' : byte)
+ hit(re, thr)
thr.ip++
-> false
else
@@ -208,6 +222,7 @@ const step = {re, thr, curip
| `Ibow:
trace(re, thr, "\t{}:\tBow\n", thr.ip)
if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
+ hit(re, thr)
thr.ip++
-> false
else
@@ -216,9 +231,11 @@ const step = {re, thr, curip
| `Ieow:
trace(re, thr, "\t{}:\tEow\n", thr.ip)
if re.strp == str.len && iswordchar(prevchar(str, re.strp))
+ hit(re, thr)
thr.ip++
-> false
elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp))
+ hit(re, thr)
thr.ip++
-> false
else
@@ -228,26 +245,34 @@ const step = {re, thr, curip
trace(re, thr, "\t{}:\tLbra {}\n", thr.ip, m)
trace(re, thr, "\t\tmatch start = {}\n", re.strp)
thr.mstart[m] = re.strp
+ hit(re, thr)
thr.ip++
-> false
| `Irbra m:
trace(re, thr, "\t{}:\tRbra {}\n", thr.ip, m)
thr.mend[m] = re.strp
+ hit(re, thr)
thr.ip++
-> false
| `Ifork (lip, rip):
trace(re, thr, "\t{}:\tFork ({}, {})\n", thr.ip, lip, rip)
mstart = std.sldup(thr.mstart)
mend = std.sldup(thr.mend)
+ hit(re, thr)
fork(re, thr, rip, curip, mstart, mend)
+ if re.debug
+ std.slpush(&re.traces, std.bsdup(re.traces[thr.tid]))
+ ;;
thr.ip = lip
-> false
| `Ijmp ip:
trace(re, thr, "\t{}:\tJmp {}\n", thr.ip, ip)
+ hit(re, thr)
thr.ip = ip
-> false
| `Imatch id:
trace(re, thr, "\t{}:\tMatch\n", thr.ip)
+ re.lastthr = thr.tid
finish(re, thr)
-> true
;;
@@ -280,6 +305,7 @@ const die = {re, thr, msg
re.nthr--
;;
re.lastip = thr.ip
+ re.lastthr = thr.tid
thr.dead = true
}
@@ -289,7 +315,6 @@ const finish = {re, thr
re.nthr--
}
-var nexttid = 0
const mkthread = {re, ip
var thr : rethread#
@@ -298,7 +323,7 @@ const mkthread = {re, ip
thr.next = Zthr
thr.ip = ip
- thr.tid = nexttid++
+ thr.tid = re.nexttid++
thr.dead = false
thr.matched = false
@@ -336,6 +361,12 @@ const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void)
;;
}
+const hit = {re, thr
+ if re.debug
+ std.bsput(re.traces[thr.tid], thr.ip)
+ ;;
+}
+
/* must be called with i >= 1 */
const prevchar = {s, i
std.assert(i != 0, "prevchar must be called with i >= 1\n")
diff --git a/lib/regex/redump.myr b/lib/regex/redump.myr
index ddc9b33..3f843ea 100644
--- a/lib/regex/redump.myr
+++ b/lib/regex/redump.myr
@@ -76,10 +76,13 @@ const show = {re, ln, mg
for var i = 1; i < rl.len; i++
std.put("\tgroup {}: {}\n", i, rl[i])
;;
+ std.put("coverage:\n")
+ std.put("\t{}\n", re.pat)
+ showcoverage(re)
| `std.None:
- std.put("Match failed:\n")
+ std.put("Match failed at {}:\n", re.lastip)
std.put("\t{}\n", re.pat)
- caret(re, re.pcidx[re.lastip] - 1)
+ caret(re, re.pcidx[re.lastip])
std.put("\t{}\n", ln)
caret(re, re.strp - 1)
;;
@@ -93,3 +96,28 @@ const caret = {re, idx
std.put("^\n")
}
+const showcoverage = {re
+ var hit
+ var idx
+
+ hit = std.slzalloc(re.pat.len)
+ for var ip = 0; ip < re.proglen; ip++
+ if !std.bshas(re.traces[re.lastthr], ip)
+ continue
+ ;;
+ idx = re.pcidx[ip]
+ if idx >= 0 && idx < hit.len
+ hit[idx] = true
+ ;;
+ ;;
+
+ std.put("\t")
+ for h in hit
+ if h
+ std.put("^")
+ else
+ std.put(" ")
+ ;;
+ ;;
+ std.put("\n")
+}
diff --git a/lib/regex/test/basic.myr b/lib/regex/test/basic.myr
index 35109c9..506b7c0 100644
--- a/lib/regex/test/basic.myr
+++ b/lib/regex/test/basic.myr
@@ -19,21 +19,29 @@ const main = {
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
][:], "")
+ std.put("hi\n")
testmatch(".*bc", "Abc", `std.Some [][:])
- testmatch("(a*)*", "a", `std.Some ["a"][:])
+ std.put("1\n")
+ dbgmatch("(a*)*", "a", `std.Some ["a"][:])
+ std.put("2\n")
testmatch("(aa|aab?)*", s, `std.Some ["aa"][:])
+ std.put("3\n")
/* greedy matches */
testmatch("(<.*>).*", "<a foo> blah <bar>", `std.Some [
"<a foo> blah <bar>",
][:])
+ std.put("3\n")
testmatch("(<.+>).*", "<a foo> blah <bar>", `std.Some [
"<a foo> blah <bar>",
][:])
+ std.put("4\n")
/* reluctant matches */
testmatch("(<.*?>).*", "<a foo> blah <bar>", `std.Some [
"<a foo>",
][:])
+ std.put("5\n")
testmatch("(<.+?>).*", "<a foo> blah <bar>", `std.Some [
"<a foo>",
][:])
+ std.put("6\n")
}
diff --git a/lib/regex/types.myr b/lib/regex/types.myr
index a1e0208..b1ed4e3 100644
--- a/lib/regex/types.myr
+++ b/lib/regex/types.myr
@@ -11,31 +11,6 @@ pkg regex =
`Badescape char
;;
- type ast = union
- /* basic string building */
- `Alt (ast#, ast#)
- `Cat (ast#, ast#)
-
- /* repetition */
- `Star ast#
- `Rstar ast#
- `Plus ast#
- `Rplus ast#
- `Quest ast#
-
- /* end matches */
- `Chr char
- `Ranges char[2][:]
-
- /* meta */
- `Cap (std.size, ast#) /* id, ast */
- `Bol /* beginning of line */
- `Eol /* end of line */
- `Bow /* beginning of word */
- `Eow /* end of word */
- ;;
-
-
type regex = struct
/* compile state */
debug : bool
@@ -53,11 +28,38 @@ pkg regex =
nthr : std.size
str : byte[:]
strp : std.size
+ nexttid : std.size
/* debug state */
astloc : std.htab(ast#, std.size)#
+ traces : std.bitset#[:]
pcidx : std.size[:]
lastip : std.size
+ lastthr : std.size
+ ;;
+
+ type ast = union
+ /* basic string building */
+ `Alt (ast#, ast#)
+ `Cat (ast#, ast#)
+
+ /* repetition */
+ `Star ast#
+ `Rstar ast#
+ `Plus ast#
+ `Rplus ast#
+ `Quest ast#
+
+ /* end matches */
+ `Chr char
+ `Ranges char[2][:]
+
+ /* meta */
+ `Cap (std.size, ast#) /* id, ast */
+ `Bol /* beginning of line */
+ `Eol /* end of line */
+ `Bow /* beginning of word */
+ `Eow /* end of word */
;;
pkglocal type rethread = struct