summaryrefslogtreecommitdiff
path: root/doc/api/libregex/index.txt
diff options
context:
space:
mode:
Diffstat (limited to 'doc/api/libregex/index.txt')
-rw-r--r--doc/api/libregex/index.txt279
1 files changed, 279 insertions, 0 deletions
diff --git a/doc/api/libregex/index.txt b/doc/api/libregex/index.txt
new file mode 100644
index 0000000..5cf4716
--- /dev/null
+++ b/doc/api/libregex/index.txt
@@ -0,0 +1,279 @@
+{
+ title: libregex
+ description: Libregex API documentation.
+}
+
+Summary
+-------
+
+ /* Public interface of libregex, as summarized by this document. */
+ pkg regex =
+ /* Syntax tree for a regex; `parse` returns an ast# on success. */
+ type ast = union
+ /* basic string building */
+ `Alt (ast#, ast#)
+ `Cat (ast#, ast#)
+
+ /* repetition */
+ /* NOTE(review): Rstar/Rplus are presumably the variants selected by the
+    trailing '?' in the grammar's repexpr rule -- confirm against the parser. */
+ `Star ast#
+ `Rstar ast#
+ `Plus ast#
+ `Rplus ast#
+ `Quest ast#
+
+ /* end matches */
+ `Chr char
+ /* NOTE(review): presumably a list of [lo, hi] character pairs -- confirm. */
+ `Ranges char[2][:]
+
+ /* meta */
+ `Cap (std.size, ast#) /* id, ast */
+ `Bol /* beginning of line */
+ `Eol /* end of line */
+ `Bow /* beginning of word */
+ `Eow /* end of word */
+ ;;
+
+ /* Error value carried by the failure case of parse, compile and dbgcompile. */
+ type status = union
+ `Noimpl
+ `Incomplete
+ `Unbalanced char
+ `Emptyparen
+ `Badrep char
+ `Badrange byte[:]
+ `Badescape char
+ ;;
+
+ /* regex compilation */
+ /* parse yields the syntax tree; compile yields a runnable regex#;
+    dbgcompile is like compile but the result emits debugging output.
+    free releases a compiled regex once it is no longer needed. */
+ const parse : (re : byte[:] -> std.result(ast#, status))
+ const compile : (re : byte[:] -> std.result(regex#, status))
+ const dbgcompile : (re : byte[:] -> std.result(regex#, status))
+ const free : (re : regex# -> void)
+
+ /* regex execution */
+ /* On a match, element 0 of the returned slice is the full matched text. */
+ const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+ const search : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+
+ /* substitution: the sb* variants write into the supplied string buffer
+    instead of returning a newly allocated string */
+ const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
+ const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
+ const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
+ const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)
+
+ /* frees a match list such as the one returned by exec */
+ const matchfree : (pat : byte[:][:] -> void)
+ ;;
+
+
+Overview
+--------
+
+Libregex is a simple regex API that uses a parallel NFA implementation. This
+means that while it is not blazingly fast, it does not exhibit the pathological
+behavior on regexes like `(aa|aab?)\*` that many common regex implementations do.
+
+Regex Syntax
+-------------
+
+The grammar for regexes that are accepted is sketched out below.
+
+ regex : altexpr
+ altexpr : catexpr ('|' altexpr)?
+ catexpr : repexpr (catexpr)?
+ repexpr : baseexpr ([*+?][?]?)?
+ baseexpr : literal
+ | charclass
+ | charrange
+ | '.'
+ | '^'
+ | '$'
+ | '(' regex ')'
+ charclass : see below
+ charrange : '[' (literal('-' literal)?)+']'
+
+The following metacharacters have the meanings listed below:
+
+<table>
+ <tr><th>Metachar</th> <th>Description</th></tr>
+ <tr><td><code>.</code></td> <td>Matches a single unicode character.</td></tr>
+ <tr><td><code>^</code></td> <td>Matches the beginning of a line. Does not consume any characters.</td></tr>
+ <tr><td><code>$</code></td> <td>Matches the end of a line. Does not consume any characters.</td></tr>
+ <tr><td><code>*</code></td> <td>Matches zero or more repetitions of the preceding regex fragment.</td></tr>
+ <tr><td><code>+</code></td> <td>Matches one or more repetitions of the preceding regex fragment.</td></tr>
+ <tr><td><code>?</code></td> <td>Matches zero or one of the preceding regex fragment.</td></tr>
+</table>
+
+In order to match a literal metacharacter, it needs to be preceded by a '\' character.
+
+The following character classes are supported:
+
+<table>
+ <tr><th>Charclass</th> <th>Description</th></tr>
+ <tr><td><code>\d </code></td> <td>ASCII digits</td></tr>
+ <tr><td><code>\D </code></td> <td>Negation of ASCII digits</td></tr>
+ <tr><td><code>\x </code></td> <td>ASCII Hex digits</td></tr>
+ <tr><td><code>\X </code></td> <td>Negation of ASCII Hex digits</td></tr>
+ <tr><td><code>\s </code></td> <td>ASCII spaces</td></tr>
+ <tr><td><code>\S </code></td> <td>Negation of ASCII spaces</td></tr>
+ <tr><td><code>\w </code></td> <td>ASCII word characters</td></tr>
+ <tr><td><code>\W </code></td> <td>Negation of ASCII word characters</td></tr>
+ <tr><td><code>\h </code></td> <td>ASCII whitespace characters</td></tr>
+ <tr><td><code>\H </code></td> <td>Negation of ASCII whitespace characters</td></tr>
+ <tr><td><code>\pX</code></td> <td>Characters with unicode property 'X'</td></tr>
+ <tr><td><code>\PX</code></td> <td>Negation of characters with property 'X'</td></tr>
+</table>
+
+The current list of supported Unicode character classes `X` is:
+
+<table>
+ <tr><th>Abbrev</th> <th>Full name</th> <th>Description</th></tr>
+ <tr>
+ <td><code>L</code></td> <td><code>Letter</code></td>
+ <td>All letters, including lowercase, uppercase, titlecase,
+ and uncased.</td>
+ </tr>
+ <tr>
+ <td><code>Lu</code></td> <td><code>Uppercase_Letter</code></td>
+ <td>All uppercase letters.</td>
+ </tr>
+ <tr>
+ <td><code>Ll</code></td> <td><code>Lowercase_Letter</code></td>
+ <td>All lowercase letters.</td>
+ </tr>
+ <tr>
+ <td><code>Lt</code></td> <td><code>Titlecase_Letter</code></td>
+ <td>All titlecase letters.</td>
+ </tr>
+ <tr>
+ <td><code>N</code></td> <td><code>Number</code></td>
+ <td>All numbers.</td>
+ </tr>
+ <tr>
+ <td><code>Z</code></td> <td><code>Separator</code></td>
+ <td>All separators, including spaces and control characters.</td>
+ </tr>
+ <tr>
+ <td><code>Zs</code></td> <td><code>Space_Separator</code></td>
+ <td>All space separators, including tabs and ASCII spaces.</td>
+ </tr>
+</table>
+
+Functions
+---------
+
+ const parse : (re : byte[:] -> std.result(ast#, status))
+
+`parse` takes a regex string, and converts it to a regex syntax tree, returning
+`\`std.Ok ast#` if the regex was valid, or `\`std.Fail r` (where `r` is a
+`status` value) if the regex could not be parsed. This AST can be used to
+further process the regex, possibly turning it into a multiregex as in
+Hairless, or using it for NFA and DFA tricks.
+
+ const compile : (re : byte[:] -> std.result(regex#, status))
+ const dbgcompile : (re : byte[:] -> std.result(regex#, status))
+
+`compile` takes a regex string, and converts it to a compiled regex, returning
+`\`std.Ok regex#` if the regex was valid, or `\`std.Fail r` with the
+reason that the compilation failed. `dbgcompile` is similar; however, the
+regex is compiled so that it will spit out a good deal of debugging output.
+Unless you are intent on debugging the internals of the regex engine, this is
+likely only of academic interest.
+
+ const free : (re : regex# -> void)
+
+`free` must be called on a compiled regex to release its resources after you
+are finished using it.
+
+ const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+
+`exec` runs the regex over the specified text, returning `\`std.Some matches`
+if the text matched, or `\`std.None` if the text did not match. `matches[0]` is
+always the full text that was matched, and will always be returned regardless
+of whether capture groups are specified.
+
+ const search : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+
+`search` searches the specified text for a sub-segment that matches the regex,
+returning `\`std.Some matches` if a match was found, or `\`std.None` if
+the text did not match. `matches[0]` is always the full text that was matched,
+and will always be returned regardless of whether capture groups are
+specified. `search` returns the earliest match in the string provided.
+
+
+ const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
+ const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
+
+`sub` will take a pattern, an input string, and a set of substitutions, and
+attempt to match. If the match is successful, it will replace each group
+within `str` with `subst`, returning a freshly allocated string. `sbsub`
+behaves identically, however it inserts the new string into the string
+buffer provided, instead of allocating a new string.
+
+If there is no match, then `\`std.None` will be returned.
+
+ const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
+ const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)
+
+`suball` replaces every match within the string using the given substitutions.
+Only captured groups will be substituted. The remaining text will be left in
+place.
+
+Example
+------
+
+#### Pattern matching
+
+```{runmyr regex}
+use std
+use regex
+
+/*
+ Compiles a pattern with one capture group and runs it against a
+ sample string. Aborts with a message if the pattern does not compile.
+*/
+const main = {
+	match regex.compile("ab(c+)")
+	| `std.Ok re:
+		runwith(re, "abccc")
+		/* a compiled regex must be freed once we are finished with it */
+		regex.free(re)
+	| `std.Fail m:
+		std.fatal("Failed to compile regex\n")
+	;;
+}
+
+/*
+ Runs `re` over `txt` and prints every returned match; matches[0] is the
+ full matched text. The match list is released with matchfree.
+*/
+const runwith = {re, txt
+	match regex.exec(re, txt)
+	| `std.Some matches:
+		std.put("matched {}, got {} matches\n", txt, matches.len)
+		for m in matches
+			std.put("Match: {}\n", m)
+		;;
+		regex.matchfree(matches)
+	| `std.None:
+		/* report which text failed, using the same {} format as above */
+		std.put("{} did not match\n", txt)
+	;;
+}
+```
+
+#### Substitution
+
+```{runmyr regex}
+use std
+use regex
+
+/*
+ Replaces the capture groups of a single match with the given
+ substitutions and prints the resulting string.
+*/
+const main = {
+	var re
+
+	re = std.try(regex.compile("(a*)bc(d)e"))
+	match regex.sub(re, "aaabcdef", ["HEY", "X"][:])
+	| `std.Some sub:
+		/* sub is the whole substituted string, not a list of matches */
+		std.put("{}\n", sub)
+		/* the string returned by regex.sub is freshly allocated */
+		std.slfree(sub)
+	| `std.None:
+		std.fatal("should have matched")
+	;;
+	/* a compiled regex must be freed once we are finished with it */
+	regex.free(re)
+}
+```
+
+```{runmyr regex}
+use std
+use regex
+
+/*
+ Replaces the captured group of every match in the input string and
+ prints the result.
+*/
+const main = {
+	var re, sub
+
+	re = std.try(regex.compile("(b|e)"))
+	sub = regex.suball(re, "aaabbbcdef", ["SUB"][:])
+	std.put("subst: {}\n", sub)
+	std.slfree(sub)
+	/* a compiled regex must be freed once we are finished with it */
+	regex.free(re)
+}
+```