summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ori Bernstein <ori@eigenstate.org> 2014-09-07 22:11:10 -0400
committer Ori Bernstein <ori@eigenstate.org> 2014-09-07 22:11:10 -0400
commit 1b08f4376253dca73868af564b44a7680a02667c (patch)
tree d7b2312abbfa1ea0af9de02e32632939910547a4
parent 560a2b7de036a9f21e0a86f8656ca9b4cd2be478 (diff)
download libregex-1b08f4376253dca73868af564b44a7680a02667c.tar.gz
Add support for word boundary metacharacter.
-rw-r--r-- compile.myr 15
-rw-r--r-- interp.myr 38
-rw-r--r-- test/data/regex-boundaries-expected 28
-rw-r--r-- test/regex-boundaries.myr 17
-rw-r--r-- test/tests 5
-rw-r--r-- types.myr 2
6 files changed, 103 insertions, 2 deletions
diff --git a/compile.myr b/compile.myr
index 27e0ee9..aa152be 100644
--- a/compile.myr
+++ b/compile.myr
@@ -30,6 +30,8 @@ type tree = union
`Cap (std.size, tree#) /* id, tree */
`Bol /* beginning of line */
`Eol /* end of line */
+ `Bow /* beginning of word */
+ `Eow /* end of word */
;;
type parseresult = union
@@ -107,6 +109,8 @@ const gen = {re, t
/* meta */
|`Bol: append(re, `Ibol)
|`Eol: append(re, `Ibol)
+ |`Bow: append(re, `Ibow)
+ |`Eow: append(re, `Ieow)
|`Cap (m, a):
append(re, `Ilbra m)
gen(re, a)
@@ -312,6 +316,7 @@ const genchar = {re, c
var i
n = std.encode(b[:], c)
+ std.assert(n > 0 && n < 4, "non-utf character in regex\n")
for i = 0; i < n; i++
append(re, `Ibyte b[i])
;;
@@ -352,6 +357,8 @@ const idump = {re
/* anchors */
| `Ibol: std.put("`Ibol\n")
| `Ieol: std.put("`Ieol\n")
+ | `Ibow: std.put("`Ibow\n")
+ | `Ieow: std.put("`Ieow\n")
/* control flow */
| `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip)
| `Ijmp ip: std.put("`Ijmp %z\n", ip)
@@ -399,6 +406,10 @@ const dump = {re, t, indent
std.put("Bol\n")
| `Eol:
std.put("Eol\n")
+ | `Bow:
+ std.put("Bow\n")
+ | `Eow:
+ std.put("Eow\n")
/* end matches */
| `Byte b:
std.put("Byte %b\n", b)
@@ -567,6 +578,10 @@ const escaped = {re
| 'p': ret = unicodeclass(re, false)
| 'P': ret = unicodeclass(re, true)
+ /* operators that need an escape */
+ | '<': ret = `Some mk(`Bow)
+ | '>': ret = `Some mk(`Eow)
+
/* escaped metachars */
| '^': ret = `Some mk(`Chr '^')
| '$': ret = `Some mk(`Chr '$')
diff --git a/interp.myr b/interp.myr
index 370a51c..7e1bb42 100644
--- a/interp.myr
+++ b/interp.myr
@@ -163,10 +163,31 @@ const step = {re, thr, curip
| `Ieol:
trace(re, thr, "\t%z:\tEol\n", thr.ip)
if re.strp == str.len || str[re.strp] == '\n' castto(byte)
+ thr.ip++
-> false
else
die(re, thr, "not end of line")
;;
+ /* check for word characters */
+ | `Ibow:
+ trace(re, thr, "\t%z:\tBow\n", thr.ip)
+ if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
+ thr.ip++
+ -> false
+ else
+ die(re, thr, "not beginning of word")
+ ;;
+ | `Ieow:
+ trace(re, thr, "\t%z:\tEow\n", thr.ip)
+ if re.strp == str.len && iswordchar(prevchar(str, re.strp))
+ thr.ip++
+ -> false
+ elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp))
+ thr.ip++
+ -> false
+ else
+ die(re, thr, "not end of word")
+ ;;
| `Ilbra m:
trace(re, thr, "\t%z:\tLbra %z\n", thr.ip, m)
trace(re, thr, "\t\tmatch start = %z\n", re.strp)
@@ -268,3 +289,20 @@ const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void)
std.putv(msg, std.vastart(&args))
;;
}
+
+/* returns the tail of `s` that begins at the character ending just before byte offset `i`; must be called with i >= 1 */
+const prevchar = {s, i
+ std.assert(i != 0, "prevchar must be called with i >= 1\n")
+ i--
+ while i != 0 && (s[i] & 0xc0) == 0x80
+ i-- /* back up over utf8 continuation bytes (10xxxxxx) only, so we stop on the lead byte */
+ ;;
+ -> s[i:]
+}
+
+/* true when the character std.decode reads from `s` is '_', alphabetic, or a digit */
+const iswordchar = {s
+ var ch
+ ch = std.decode(s)
+ -> ch == '_' || std.isalpha(ch) || std.isdigit(ch)
+}
diff --git a/test/data/regex-boundaries-expected b/test/data/regex-boundaries-expected
new file mode 100644
index 0000000..3706af6
--- /dev/null
+++ b/test/data/regex-boundaries-expected
@@ -0,0 +1,28 @@
+Matched abcdef via \<([a-z]*)\> : 2
+ match 0: abcdef
+ match 1: abcdef
+Matched !m! via .*(\<.*\>).* : 2
+ match 0: !m!
+ match 1: m
+Matched !m via .*(\<.*\>).* : 2
+ match 0: !m
+ match 1: m
+Matched m! via .*(\<.*\>).* : 2
+ match 0: m!
+ match 1: m
+Matched !@#!!matches!!%! via .*(\<.*\>).* : 2
+ match 0: !@#!!matches!!%!
+ match 1: matches
+Matched matches!!%! via .*(\<.*\>).* : 2
+ match 0: matches!!%!
+ match 1: matches
+Matched !@#!!matches via .*(\<.*\>).* : 2
+ match 0: !@#!!matches
+ match 1: matches
+Matched !@#!!matches!!%!foo via .*(\<.*\>).* : 2
+ match 0: !@#!!matches!!%!foo
+ match 1: foo
+Matched 123 via .*(\<.*\>).* : 2
+ match 0: 123
+ match 1: 123
+No match of abcdefoo via \<([a-z]*)\>foo
diff --git a/test/regex-boundaries.myr b/test/regex-boundaries.myr
new file mode 100644
index 0000000..03157e4
--- /dev/null
+++ b/test/regex-boundaries.myr
@@ -0,0 +1,17 @@
+use "testmatch.use"
+
+const main = {
+ /* expected matches for \< (start of word) and \> (end of word); results are listed in this same order in test/data/regex-boundaries-expected */
+ testmatch("\\<([a-z]*)\\>", "abcdef") /* whole subject is a single word */
+ testmatch(".*(\\<.*\\>).*", "!m!") /* single char word in midstring */
+ testmatch(".*(\\<.*\\>).*", "!m") /* single char word at end of string */
+ testmatch(".*(\\<.*\\>).*", "m!") /* single char word at start of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!") /* word in midstring */
+ testmatch(".*(\\<.*\\>).*", "matches!!%!") /* word at start of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches") /* word at end of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!foo") /* the leading greedy .* makes the capture land on the last word in the string */
+ testmatch(".*(\\<.*\\>).*", "123") /* digits count as word characters too */
+
+ /* nonmatches */
+ testmatch("\\<([a-z]*)\\>foo", "abcdefoo") /* \> cannot hold where a word character follows, so the trailing foo can never be reached */
+}
diff --git a/test/tests b/test/tests
index d57a814..2cbaa65 100644
--- a/test/tests
+++ b/test/tests
@@ -21,8 +21,9 @@ EXTRA_SRC=testmatch.myr
# What we compare with. This should be self-
# evident.
B regex-basic C
-B regex-class C
-B regex-negclass C
+B regex-boundaries C
B regex-capture C
+B regex-class C
B regex-failmatch C
+B regex-negclass C
B regex-unicode C
diff --git a/types.myr b/types.myr
index 223ada2..f633380 100644
--- a/types.myr
+++ b/types.myr
@@ -51,6 +51,8 @@ pkg regex =
/* anchors */
`Ibol
`Ieol
+ `Ibow
+ `Ieow
/* control flow */
`Ifork (std.size, std.size)