diff options
author | Ori Bernstein <ori@eigenstate.org> | 2016-06-08 12:09:30 -0400 |
---|---|---|
committer | Ori Bernstein <ori@eigenstate.org> | 2016-06-08 12:09:52 -0400 |
commit | cefdbe00dfad4086e2f3ba7cd0007d729e77e137 (patch) | |
tree | e1b2e5673096469abf289db87156aca19a21b295 /lib/regex | |
parent | 830f28c844022a71f6c7ad1caf1bcfb7ca9397dc (diff) | |
download | mc-cefdbe00dfad4086e2f3ba7cd0007d729e77e137.tar.gz |
Add matching that returns indexes.
Diffstat (limited to 'lib/regex')
-rw-r--r-- | lib/regex/interp.myr | 75 | ||||
-rw-r--r-- | lib/regex/test/bld.sub | 9 | ||||
-rw-r--r-- | lib/regex/test/idxmatch.myr | 48 | ||||
-rw-r--r-- | lib/regex/test/testmatch.myr | 55 |
4 files changed, 172 insertions, 15 deletions
diff --git a/lib/regex/interp.myr b/lib/regex/interp.myr index 71a117f..abb907c 100644 --- a/lib/regex/interp.myr +++ b/lib/regex/interp.myr @@ -3,12 +3,20 @@ use std use "types" pkg regex = + /* regex execution */ const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:])) const search : (re : regex#, str : byte[:] -> std.option(byte[:][:])) + + /* regex execution returning indexes */ + const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) + const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) + + /* substitution */ const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:])) const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool) const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:]) const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void) + const matchfree : (pat : byte[:][:] -> void) ;; @@ -16,26 +24,30 @@ pkg regex = const Zthr = (0 : rethread#) const exec = {re, str - var thr - var m + var thr, m - re.str = str - re.strp = 0 - thr = run(re, true) + thr = run(re, str, 0, true) m = getmatches(re, thr) cleanup(re) -> m } +const iexec = {re, str + var thr, m + + thr = run(re, str, 0, true) + m = getidxmatches(re, thr) + cleanup(re) + -> m +} + const search = {re, str var thr var m m = `std.None for var i = 0; i < str.len; i++ - re.str = str[i:] - re.strp = 0 - thr = run(re, false) + thr = run(re, str[i:], 0, false) m = getmatches(re, thr) match m | `std.Some _: break @@ -46,6 +58,23 @@ const search = {re, str -> m } +const isearch = {re, str + var thr + var m + + m = `std.None + for var i = 0; i < str.len; i++ + thr = run(re, str[i:], 0, false) + m = getidxmatches(re, thr) + match m + | `std.Some _: break + | `std.None: /* nothing */ + ;; + cleanup(re) + ;; + -> m +} + const sub = {re, str, subst var sb @@ -65,9 +94,7 @@ const sbsub = {sb, re, str, subst -> false ;; - re.str = str - re.strp = 0 - thr = run(re, true) + thr = run(re, str, 0, true) if thr == Zthr m = false else @@ -95,9 +122,7 @@ const sbsuball = {sb, re, str, subst i = 0 while i < str.len - re.str = str[i:] - re.strp = 0 - thr = run(re, false) + thr = run(re, str[i:], 0, false) if thr == Zthr std.sbputb(sb, str[i]) i++ @@ -164,15 +189,35 @@ const getmatches = {re, thr -> `std.Some ret } +const getidxmatches = {re, thr + var ret + + if thr == Zthr + -> `std.None + ;; + ret = std.slalloc(re.nmatch) + for var i = 0; i < re.nmatch; i++ + if thr.mstart[i] != -1 && thr.mend[i] != -1 + ret[i] = (thr.mstart[i], thr.mend[i]) + else + ret[i] = (-1, -1) + ;; + ;; + thrfree(re, thr) + -> `std.Some ret +} /* returns a matching thread, or Zthr if no threads matched */ -const run = {re, wholestr +const run = {re, str, idx, wholestr var bestmatch var consumed var states var thr var ip + re.str = str + re.strp = 0 + bestmatch = Zthr states = std.mkbs() re.runq = mkthread(re, 0) diff --git a/lib/regex/test/bld.sub b/lib/regex/test/bld.sub index be1a868..faa6a57 100644 --- a/lib/regex/test/bld.sub +++ b/lib/regex/test/bld.sub @@ -5,6 +5,15 @@ test basic = lib @/lib/sys:sys lib @/lib/regex:regex ;; + +test idxmatch = + idxmatch.myr + testmatch.myr + lib @/lib/std:std + lib @/lib/sys:sys + lib @/lib/regex:regex +;; + test boundaries = boundaries.myr testmatch.myr diff --git a/lib/regex/test/idxmatch.myr b/lib/regex/test/idxmatch.myr new file mode 100644 index 0000000..5473026 --- /dev/null +++ b/lib/regex/test/idxmatch.myr @@ -0,0 +1,48 @@ +use std + +use "testmatch" + +const main = { + var s : byte[:] + + s = std.strjoin([ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + ][:], "") + + testidxmatch(".*bc", "Abc", `std.Some [(0, 3)][:]) + testidxmatch("(a*)*", "a", `std.Some [(0,1), (0, 1)][:]) + testidxmatch("(aa|aab?)*", s, `std.Some [(0, 408), (406, 408)][:]) + /* greedy matches */ + testidxmatch("(<.*>).*", "<a foo> blah <bar>", `std.Some [ + (0, 18), + (0, 18), + ][:]) + testidxmatch("(<.+>).*", "<a foo> blah <bar>", `std.Some [ + (0, 18), + (0, 18), + ][:]) + /* reluctant matches */ + testidxmatch("(<.*?>).*", "<a foo> blah <bar>", `std.Some [ + (0, 18), + (0, 7), + ][:]) + testidxmatch("(<.+?>).*", "<a foo> blah <bar>", `std.Some [ + (0, 18), + (0, 7), + ][:]) + testidxmatch(".*(<b.+?>).*", "<a foo> blah <bar>", `std.Some [ + (0, 18), + (13, 18), + ][:]) +} diff --git a/lib/regex/test/testmatch.myr b/lib/regex/test/testmatch.myr index a3f8f15..78f8402 100644 --- a/lib/regex/test/testmatch.myr +++ b/lib/regex/test/testmatch.myr @@ -8,6 +8,12 @@ pkg = expected : std.option(byte[:][:]) \ -> void) + const testidxmatch : (\ + pat : byte[:], \ + text : byte[:], \ + expected : std.option((std.size, std.size)[:]) \ + -> void) + const testsearch : ( \ pat : byte[:], \ text : byte[:], \ @@ -39,6 +45,10 @@ const testmatch = {pat, text, expected run(regex.compile(pat), pat, text, expected, false) } +const testidxmatch = {pat, text, expected + runidx(regex.compile(pat), pat, text, expected, false) +} + const testsearch = {pat, text, expected run(regex.compile(pat), pat, text, expected, true) } @@ -84,6 +94,51 @@ const subst = {regex, pat, text, sub, expected, all ;; } +const runidx = {regex, pat, text, expected : std.option((std.size, std.size)[:]), search + var re, r + var lo, elo, hi, ehi + + re = std.try(regex) + if search + r = regex.isearch(re, text) + else + r = regex.iexec(re, text) + ;; + match r + | `std.Some res: + match expected + | `std.None: + std.fatal("expected no match, got:") + for var i = 0; i < res.len; i++ + std.put("\t{}: {}\n", i, res[i]) + ;; + | `std.Some exp: + if res.len != exp.len + std.put("mismatch: expected {} matches, got {}\n", exp.len, res.len) + std.fatal("failed matching {} over {}\n", pat, text) + ;; + for var i = 0; i < exp.len; i++ + (elo, ehi) = exp[i] + (lo, hi) = res[i] + if lo != elo || hi != ehi + std.put("mismatch on {}: expected {}, got {}\n", i, exp[i], res[i]) + std.fatal("failed matching {} over {}\n", pat, text) + ;; + ;; + ;; + | `std.None: + match expected + | `std.None: /* : expected failure */ + | `std.Some matches: + std.put("expected matches:\n") + for var i = 0; i < matches.len; i++ + std.put("\t{}: {}\n", i, matches[i]) + ;; + std.fatal("no match found\n") + ;; + ;; + regex.free(re) +} const run = {regex, pat, text, expected, search var i, re, r |