summaryrefslogtreecommitdiff
path: root/lib/regex
diff options
context:
space:
mode:
author Ori Bernstein <ori@eigenstate.org> 2016-06-08 12:09:30 -0400
committer Ori Bernstein <ori@eigenstate.org> 2016-06-08 12:09:52 -0400
commit cefdbe00dfad4086e2f3ba7cd0007d729e77e137 (patch)
tree e1b2e5673096469abf289db87156aca19a21b295 /lib/regex
parent 830f28c844022a71f6c7ad1caf1bcfb7ca9397dc (diff)
download mc-cefdbe00dfad4086e2f3ba7cd0007d729e77e137.tar.gz
Add matching that returns indexes.
Diffstat (limited to 'lib/regex')
-rw-r--r-- lib/regex/interp.myr 75
-rw-r--r-- lib/regex/test/bld.sub 9
-rw-r--r-- lib/regex/test/idxmatch.myr 48
-rw-r--r-- lib/regex/test/testmatch.myr 55
4 files changed, 172 insertions, 15 deletions
diff --git a/lib/regex/interp.myr b/lib/regex/interp.myr
index 71a117f..abb907c 100644
--- a/lib/regex/interp.myr
+++ b/lib/regex/interp.myr
@@ -3,12 +3,20 @@ use std
use "types"
pkg regex =
+ /* regex execution */
const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
const search : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+
+ /* regex execution returning indexes */
+ const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
+ const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
+
+ /* substitution */
const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)
+
const matchfree : (pat : byte[:][:] -> void)
;;
@@ -16,26 +24,30 @@ pkg regex =
const Zthr = (0 : rethread#)
const exec = {re, str
- var thr
- var m
+ var thr, m
- re.str = str
- re.strp = 0
- thr = run(re, true)
+ thr = run(re, str, 0, true)
m = getmatches(re, thr)
cleanup(re)
-> m
}
+/*
+ * Index-returning counterpart of exec: runs re over str (offset 0,
+ * wholestr = true) and reports the result as (start, end) pairs built
+ * by getidxmatches, or `std.None when no thread matched.
+ */
+const iexec = {re, str
+	var matched
+	var idxs
+
+	matched = run(re, str, 0, true)
+	idxs = getidxmatches(re, matched)
+	/* release the per-run matcher state before handing back the result */
+	cleanup(re)
+	-> idxs
+}
+
const search = {re, str
var thr
var m
m = `std.None
for var i = 0; i < str.len; i++
- re.str = str[i:]
- re.strp = 0
- thr = run(re, false)
+ thr = run(re, str[i:], 0, false)
m = getmatches(re, thr)
match m
| `std.Some _: break
@@ -46,6 +58,23 @@ const search = {re, str
-> m
}
+/*
+ * Index-returning counterpart of search: tries the regex at each
+ * starting offset of str in turn and stops at the first offset that
+ * yields a match. Returns `std.Some of (start, end) pairs as produced
+ * by getidxmatches, or `std.None if no offset matches.
+ *
+ * The returned pairs are offsets into str itself. Entries reported as
+ * (-1, -1) by getidxmatches are passed through unchanged.
+ */
+const isearch = {re, str
+	var thr
+	var m
+	var lo, hi
+
+	m = `std.None
+	for var i = 0; i < str.len; i++
+		thr = run(re, str[i:], 0, false)
+		m = getidxmatches(re, thr)
+		/*
+		 * Clean up after every attempt, including the successful
+		 * one, in the same order iexec uses (getidxmatches, then
+		 * cleanup). Previously the break skipped this call.
+		 */
+		cleanup(re)
+		match m
+		| `std.Some idxs:
+			/*
+			 * run() only saw str[i:], so the recorded offsets are
+			 * relative to that slice; shift them back so that they
+			 * index into str.
+			 */
+			for var j = 0; j < idxs.len; j++
+				(lo, hi) = idxs[j]
+				if lo != -1 && hi != -1
+					idxs[j] = (lo + i, hi + i)
+				;;
+			;;
+			break
+		| `std.None:	/* nothing */
+		;;
+	;;
+	-> m
+}
+
const sub = {re, str, subst
var sb
@@ -65,9 +94,7 @@ const sbsub = {sb, re, str, subst
-> false
;;
- re.str = str
- re.strp = 0
- thr = run(re, true)
+ thr = run(re, str, 0, true)
if thr == Zthr
m = false
else
@@ -95,9 +122,7 @@ const sbsuball = {sb, re, str, subst
i = 0
while i < str.len
- re.str = str[i:]
- re.strp = 0
- thr = run(re, false)
+ thr = run(re, str[i:], 0, false)
if thr == Zthr
std.sbputb(sb, str[i])
i++
@@ -164,15 +189,35 @@ const getmatches = {re, thr
-> `std.Some ret
}
+/*
+ * Converts a matching thread into a freshly allocated slice of
+ * (start, end) pairs with re.nmatch entries. Returns `std.None when
+ * thr is Zthr. An entry whose start or end is -1 is reported as
+ * (-1, -1). The thread is passed to thrfree before returning.
+ */
+const getidxmatches = {re, thr
+	var idxs
+
+	if thr == Zthr
+		-> `std.None
+	;;
+
+	idxs = std.slalloc(re.nmatch)
+	for var g = 0; g < re.nmatch; g++
+		if thr.mstart[g] == -1 || thr.mend[g] == -1
+			idxs[g] = (-1, -1)
+		else
+			idxs[g] = (thr.mstart[g], thr.mend[g])
+		;;
+	;;
+	thrfree(re, thr)
+	-> `std.Some idxs
+}
/* returns a matching thread, or Zthr if no threads matched */
-const run = {re, wholestr
+const run = {re, str, idx, wholestr
var bestmatch
var consumed
var states
var thr
var ip
+ re.str = str
+ re.strp = 0
+
bestmatch = Zthr
states = std.mkbs()
re.runq = mkthread(re, 0)
diff --git a/lib/regex/test/bld.sub b/lib/regex/test/bld.sub
index be1a868..faa6a57 100644
--- a/lib/regex/test/bld.sub
+++ b/lib/regex/test/bld.sub
@@ -5,6 +5,15 @@ test basic =
lib @/lib/sys:sys
lib @/lib/regex:regex
;;
+
+# index-returning match test: idxmatch.myr plus the shared testmatch.myr helpers
+test idxmatch =
+ idxmatch.myr
+ testmatch.myr
+ lib @/lib/std:std
+ lib @/lib/sys:sys
+ lib @/lib/regex:regex
+;;
+
test boundaries =
boundaries.myr
testmatch.myr
diff --git a/lib/regex/test/idxmatch.myr b/lib/regex/test/idxmatch.myr
new file mode 100644
index 0000000..5473026
--- /dev/null
+++ b/lib/regex/test/idxmatch.myr
@@ -0,0 +1,48 @@
+use std
+
+use "testmatch"
+
+/*
+ * Checks the index-returning match API through testidxmatch. Each case
+ * gives a pattern, an input, and the expected (start, end) pairs; judging
+ * by the expectations below, the first pair covers the whole match and
+ * any further pairs correspond to the pattern's groups.
+ */
+const main = {
+ var s : byte[:]
+
+ /* long input: twelve runs of 'a' joined with an empty separator (the expectation below assumes 408 bytes in total) */
+ s = std.strjoin([
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ ][:], "")
+
+ /* basic cases: no group, a repeated group, and a repeated alternation over the long input */
+ testidxmatch(".*bc", "Abc", `std.Some [(0, 3)][:])
+ testidxmatch("(a*)*", "a", `std.Some [(0,1), (0, 1)][:])
+ testidxmatch("(aa|aab?)*", s, `std.Some [(0, 408), (406, 408)][:])
+ /* greedy matches */
+ testidxmatch("(<.*>).*", "<a foo> blah <bar>", `std.Some [
+ (0, 18),
+ (0, 18),
+ ][:])
+ testidxmatch("(<.+>).*", "<a foo> blah <bar>", `std.Some [
+ (0, 18),
+ (0, 18),
+ ][:])
+ /* reluctant matches */
+ testidxmatch("(<.*?>).*", "<a foo> blah <bar>", `std.Some [
+ (0, 18),
+ (0, 7),
+ ][:])
+ testidxmatch("(<.+?>).*", "<a foo> blah <bar>", `std.Some [
+ (0, 18),
+ (0, 7),
+ ][:])
+ /* group that does not start at offset 0 */
+ testidxmatch(".*(<b.+?>).*", "<a foo> blah <bar>", `std.Some [
+ (0, 18),
+ (13, 18),
+ ][:])
+}
diff --git a/lib/regex/test/testmatch.myr b/lib/regex/test/testmatch.myr
index a3f8f15..78f8402 100644
--- a/lib/regex/test/testmatch.myr
+++ b/lib/regex/test/testmatch.myr
@@ -8,6 +8,12 @@ pkg =
expected : std.option(byte[:][:]) \
-> void)
+ const testidxmatch : (\
+ pat : byte[:], \
+ text : byte[:], \
+ expected : std.option((std.size, std.size)[:]) \
+ -> void)
+
const testsearch : ( \
pat : byte[:], \
text : byte[:], \
@@ -39,6 +45,10 @@ const testmatch = {pat, text, expected
run(regex.compile(pat), pat, text, expected, false)
}
+/*
+ * Compiles pat and checks, via runidx in non-search mode, that matching
+ * it against text yields the expected (start, end) pairs.
+ */
+const testidxmatch = {pat, text, expected
+	var compiled
+
+	compiled = regex.compile(pat)
+	runidx(compiled, pat, text, expected, false)
+}
+
const testsearch = {pat, text, expected
run(regex.compile(pat), pat, text, expected, true)
}
@@ -84,6 +94,51 @@ const subst = {regex, pat, text, sub, expected, all
;;
}
+/*
+ * Runs one index-match test case.
+ *
+ *	regex:		result of regex.compile for pat; unwrapped with std.try
+ *	pat, text:	the pattern source and the input, used for matching
+ *			and in failure messages
+ *	expected:	the expected (start, end) pairs, or `std.None when
+ *			no match is expected
+ *	search:		true to use regex.isearch, false to use regex.iexec
+ *
+ * Any disagreement between the actual and expected result is reported
+ * and the test is aborted with std.fatal. The compiled regex is freed
+ * on the success path.
+ */
+const runidx = {regex, pat, text, expected : std.option((std.size, std.size)[:]), search
+	var re, r
+	var lo, elo, hi, ehi
+
+	re = std.try(regex)
+	if search
+		r = regex.isearch(re, text)
+	else
+		r = regex.iexec(re, text)
+	;;
+	match r
+	| `std.Some res:
+		match expected
+		| `std.None:
+			/*
+			 * Print the unexpected matches first: std.fatal ends the
+			 * program, so anything placed after it never runs.
+			 */
+			std.put("expected no match, got:\n")
+			for var i = 0; i < res.len; i++
+				std.put("\t{}: {}\n", i, res[i])
+			;;
+			std.fatal("failed matching {} over {}\n", pat, text)
+		| `std.Some exp:
+			if res.len != exp.len
+				std.put("mismatch: expected {} matches, got {}\n", exp.len, res.len)
+				std.fatal("failed matching {} over {}\n", pat, text)
+			;;
+			/* compare each pair component-wise */
+			for var i = 0; i < exp.len; i++
+				(elo, ehi) = exp[i]
+				(lo, hi) = res[i]
+				if lo != elo || hi != ehi
+					std.put("mismatch on {}: expected {}, got {}\n", i, exp[i], res[i])
+					std.fatal("failed matching {} over {}\n", pat, text)
+				;;
+			;;
+		;;
+	| `std.None:
+		match expected
+		| `std.None:	/* : expected failure */
+		| `std.Some matches:
+			std.put("expected matches:\n")
+			for var i = 0; i < matches.len; i++
+				std.put("\t{}: {}\n", i, matches[i])
+			;;
+			std.fatal("no match found\n")
+		;;
+	;;
+	regex.free(re)
+}
const run = {regex, pat, text, expected, search
var i, re, r