diff options
author | Ori Bernstein <ori@eigenstate.org> | 2014-09-07 22:11:10 -0400 |
---|---|---|
committer | Ori Bernstein <ori@eigenstate.org> | 2014-09-07 22:11:10 -0400 |
commit | 1b08f4376253dca73868af564b44a7680a02667c (patch) | |
tree | d7b2312abbfa1ea0af9de02e32632939910547a4 | |
parent | 560a2b7de036a9f21e0a86f8656ca9b4cd2be478 (diff) | |
download | libregex-1b08f4376253dca73868af564b44a7680a02667c.tar.gz |
Add support for word boundary metacharacter.
-rw-r--r-- | compile.myr | 15 | ||||
-rw-r--r-- | interp.myr | 38 | ||||
-rw-r--r-- | test/data/regex-boundaries-expected | 28 | ||||
-rw-r--r-- | test/regex-boundaries.myr | 17 | ||||
-rw-r--r-- | test/tests | 5 | ||||
-rw-r--r-- | types.myr | 2 |
6 files changed, 103 insertions, 2 deletions
diff --git a/compile.myr b/compile.myr index 27e0ee9..aa152be 100644 --- a/compile.myr +++ b/compile.myr @@ -30,6 +30,8 @@ type tree = union `Cap (std.size, tree#) /* id, tree */ `Bol /* beginning of line */ `Eol /* end of line */ + `Bow /* beginning of word */ + `Eow /* end of word */ ;; type parseresult = union @@ -107,6 +109,8 @@ const gen = {re, t /* meta */ |`Bol: append(re, `Ibol) |`Eol: append(re, `Ibol) + |`Bow: append(re, `Ibow) + |`Eow: append(re, `Ieow) |`Cap (m, a): append(re, `Ilbra m) gen(re, a) @@ -312,6 +316,7 @@ const genchar = {re, c var i n = std.encode(b[:], c) + std.assert(n > 0 && n < 4, "non-utf character in regex\n") for i = 0; i < n; i++ append(re, `Ibyte b[i]) ;; @@ -352,6 +357,8 @@ const idump = {re /* anchors */ | `Ibol: std.put("`Ibol\n") | `Ieol: std.put("`Ieol\n") + | `Ibow: std.put("`Ibow\n") + | `Ieow: std.put("`Ieow\n") /* control flow */ | `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip) | `Ijmp ip: std.put("`Ijmp %z\n", ip) @@ -399,6 +406,10 @@ const dump = {re, t, indent std.put("Bol\n") | `Eol: std.put("Eol\n") + | `Bow: + std.put("Bow\n") + | `Eow: + std.put("Eow\n") /* end matches */ | `Byte b: std.put("Byte %b\n", b) @@ -567,6 +578,10 @@ const escaped = {re | 'p': ret = unicodeclass(re, false) | 'P': ret = unicodeclass(re, true) + /* operators that need an escape */ + | '<': ret = `Some mk(`Bow) + | '>': ret = `Some mk(`Eow) + /* escaped metachars */ | '^': ret = `Some mk(`Chr '^') | '$': ret = `Some mk(`Chr '$') @@ -163,10 +163,31 @@ const step = {re, thr, curip | `Ieol: trace(re, thr, "\t%z:\tEol\n", thr.ip) if re.strp == str.len || str[re.strp] == '\n' castto(byte) + thr.ip++ -> false else die(re, thr, "not end of line") ;; + /* check for word characters */ + | `Ibow: + trace(re, thr, "\t%z:\tBow\n", thr.ip) + if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp))) + thr.ip++ + -> false + else + die(re, thr, "not beginning of word") + ;; + | `Ieow: + trace(re, thr, "\t%z:\tEow\n", thr.ip) + if re.strp == str.len && iswordchar(prevchar(str, re.strp)) + thr.ip++ + -> false + elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp)) + thr.ip++ + -> false + else + die(re, thr, "not end of word") + ;; | `Ilbra m: trace(re, thr, "\t%z:\tLbra %z\n", thr.ip, m) trace(re, thr, "\t\tmatch start = %z\n", re.strp) @@ -268,3 +289,20 @@ const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) std.putv(msg, std.vastart(&args)) ;; } + +/* must be called with i >= 1 */ +const prevchar = {s, i + std.assert(i != 0, "prevchar must be called with i >= 1\n") + i-- + while i != 0 && s[i] >= 0x80 + i-- + ;; + -> s[i:] +} + +const iswordchar = {s + var c + + c = std.decode(s) + -> std.isalpha(c) || std.isdigit(c) || c == '_' +} diff --git a/test/data/regex-boundaries-expected b/test/data/regex-boundaries-expected new file mode 100644 index 0000000..3706af6 --- /dev/null +++ b/test/data/regex-boundaries-expected @@ -0,0 +1,28 @@ +Matched abcdef via \<([a-z]*)\> : 2 + match 0: abcdef + match 1: abcdef +Matched !m! via .*(\<.*\>).* : 2 + match 0: !m! + match 1: m +Matched !m via .*(\<.*\>).* : 2 + match 0: !m + match 1: m +Matched m! via .*(\<.*\>).* : 2 + match 0: m! + match 1: m +Matched !@#!!matches!!%! via .*(\<.*\>).* : 2 + match 0: !@#!!matches!!%! + match 1: matches +Matched matches!!%! via .*(\<.*\>).* : 2 + match 0: matches!!%! + match 1: matches +Matched !@#!!matches via .*(\<.*\>).* : 2 + match 0: !@#!!matches + match 1: matches +Matched !@#!!matches!!%!foo via .*(\<.*\>).* : 2 + match 0: !@#!!matches!!%!foo + match 1: foo +Matched 123 via .*(\<.*\>).* : 2 + match 0: 123 + match 1: 123 +No match of abcdefoo via \<([a-z]*)\>foo diff --git a/test/regex-boundaries.myr b/test/regex-boundaries.myr new file mode 100644 index 0000000..03157e4 --- /dev/null +++ b/test/regex-boundaries.myr @@ -0,0 +1,17 @@ +use "testmatch.use" + +const main = { + /* expected matches */ + testmatch("\\<([a-z]*)\\>", "abcdef") /* whole word */ + testmatch(".*(\\<.*\\>).*", "!m!") /* single char word in midstring */ + testmatch(".*(\\<.*\\>).*", "!m") /* single char word at end of string */ + testmatch(".*(\\<.*\\>).*", "m!") /* single char word at start of string */ + testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!") /* word in midstring */ + testmatch(".*(\\<.*\\>).*", "matches!!%!") /* word at start of string */ + testmatch(".*(\\<.*\\>).*", "!@#!!matches") /* word at end of string */ + testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!foo") /* matches last word in string */ + testmatch(".*(\\<.*\\>).*", "123") /* numbers are also word bounds */ + + /* nonmatches */ + testmatch("\\<([a-z]*)\\>foo", "abcdefoo") /* word boundary needed in midstring */ +} @@ -21,8 +21,9 @@ EXTRA_SRC=testmatch.myr # What we compare with. This should be self- # evident. B regex-basic C -B regex-class C -B regex-negclass C +B regex-boundaries C B regex-capture C +B regex-class C B regex-failmatch C +B regex-negclass C B regex-unicode C @@ -51,6 +51,8 @@ pkg regex = /* anchors */ `Ibol `Ieol + `Ibow + `Ieow /* control flow */ `Ifork (std.size, std.size) |