diff options
Diffstat (limited to 'doc/api/libregex/index.txt')
-rw-r--r-- | doc/api/libregex/index.txt | 279 |
1 files changed, 279 insertions, 0 deletions
diff --git a/doc/api/libregex/index.txt b/doc/api/libregex/index.txt new file mode 100644 index 0000000..5cf4716 --- /dev/null +++ b/doc/api/libregex/index.txt @@ -0,0 +1,279 @@ +{ + title: libregex + description: Libregex API documentation. +} + +Summary +------- + + pkg regex = + type ast = union + /* basic string building */ + `Alt (ast#, ast#) + `Cat (ast#, ast#) + + /* repetition */ + `Star ast# + `Rstar ast# + `Plus ast# + `Rplus ast# + `Quest ast# + + /* end matches */ + `Chr char + `Ranges char[2][:] + + /* meta */ + `Cap (std.size, ast#) /* id, ast */ + `Bol /* beginning of line */ + `Eol /* end of line */ + `Bow /* beginning of word */ + `Eow /* end of word */ + ;; + + type status = union + `Noimpl + `Incomplete + `Unbalanced char + `Emptyparen + `Badrep char + `Badrange byte[:] + `Badescape char + ;; + + /* regex compilation */ + const parse : (re : byte[:] -> std.result(ast#, status)) + const compile : (re : byte[:] -> std.result(regex#, status)) + const dbgcompile : (re : byte[:] -> std.result(regex#, status)) + const free : (re : regex# -> void) + + /* regex execution */ + const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:])) + const search : (re : regex#, str : byte[:] -> std.option(byte[:][:])) + + const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:])) + const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool) + const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:]) + const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void) + + const matchfree : (pat : byte[:][:] -> void) + ;; + + +Overview +-------- + +Libregex is a simple regex API that uses a parallel NFA implementation. This +means that while it is not blazingly fast, it does not exhibit pathological +behavior on regexes like `(aa|aab?)\*` that many common regex APIs will see. + +Regex Syntax +------------- + +The grammar for regexes that are accepted is sketched out below. + + regex : altexpr + altexpr : catexpr ('|' altexpr)+ + catexpr : repexpr (catexpr)+ + repexpr : baseexpr[*+?][?] + baseexpr : literal + | charclass + | charrange + | '.' + | '^' + | '$' + | '(' regex ')' + charclass : see below + charrange : '[' (literal('-' literal)?)+']' + +The following metacharacters have the meanings listed below: + +Matches a single unicode character + +<table> + <tr><tr><th>Metachar</th> <th>Description</th></tr> + <tr><td><code>^</td></code> <td>Matches the beginning of a line. Does not consume any characters.</td></tr> + <tr><td><code>$</td></code> <td>Matches the end of a line. Does not consume any characters.</td></tr> + <tr><td><code>*</td></code> <td>Matches any number of repetitions of the preceding regex fragment.</td></tr> + <tr><td><code>+</td></code> <td>Matches one or more repetitions of the preceding regex fragment.</td></tr> + <tr><td><code>?</td></code> <td>Matches zero or one of the preceding regex fragment.</td></tr> +</table> + +In order to match a literal metacharacter, it needs to be preceded by a '\' character. + +The following character classes are supported: + +<table> + <tr><tr><th>Charclass</th> <th>Description</th></tr> + <tr><td><code>\d </code></td> <td>ASCII digits</td></tr> + <tr><td><code>\D </code></td> <td>Negation of ASCII digits</td></tr> + <tr><td><code>\x </code></td> <td>ASCII Hex digits</td></tr> + <tr><td><code>\X </code></td> <td>Negation of ASCII Hex digits</td></tr> + <tr><td><code>\s </code></td> <td>ASCII spaces</td></tr> + <tr><td><code>\S </code></td> <td>Negation of ASCII spaces</td></tr> + <tr><td><code>\w </code></td> <td>ASCII word characters</td></tr> + <tr><td><code>\W </code></td> <td>Negation of ASCII word characters</td></tr> + <tr><td><code>\h </code></td> <td>ASCII whitespace characters</td></tr> + <tr><td><code>\H </code></td> <td>Negation of ASCII whitespace characters</td></tr> + <tr><td><code>\pX</code></td> <td>Characters with unicode property 'X'</td></tr> + <tr><td><code>\PX</code></td> <td>Negation of characters with property 'X'</td></tr> +</table> + +The current list of supported Unicode character classes `X` are + +<table> + <tr><th>Abbrev</th> <th>Full name</th> <th>Description</th></tr> + <tr> + <td><code>L</code></td> <td><code>Letter</code></td> + <td>All letters, including lowercase, uppercase, titlecase, + and uncased.</td> + </tr> + <tr> + <td><code>Lu</code></td> <td><code>Uppercase_Letter</code></td> + <td>All uppercase letters.</td> + </tr> + <tr> + <td><code>Ll</code></td> <td><code>Lowercase_Letter</code></td> + <td>All lowercase letters.</td> + </tr> + <tr> + <td><code>Lt</code></td> <td><code>Titlecase_Letter</code></td> + <td>All titlecase letters.</td> + </tr> + <tr> + <td><code>N</code></td> <td><code>Number</code></td> + <td>All numbers.</td> + </tr> + <tr> + <td><code>Z</code></td> <td><code>Separator</code></td> + <td>All separators, including spaces and control characers.</td> + </tr> + <tr> + <td><code>Zs</code></td> <td><code>Space_Separator</code></td> + <td>All space separators, including tabs and ASCII spaces.</td> + </tr> +</table> + +Functions +--------- + + const parse : (re : byte[:] -> std.result(ast#, status)) + +Parse takes a regex string, and converts it to a regex syntax tree, returning +`\`std.Success ast#` if the regex was valid, or a `\`std.Failure r` if the +regex could not be parsed. This AST can be used to further process the regex, +possibly turning it into a multiregex as in Hairless, or using it for NFA and +DFA tricks. + + const compile : (re : byte[:] -> std.result(regex#, status)) + const dbgcompile : (re : byte[:] -> std.result(regex#, status)) + +`compile` takes a regex string, and converts it to a compiled regex, returning +`\`std.Success regex` if the regex was valid, or a `\`std.Failure r` with the +reason that the compilation failed. `dbgcompile` is similar, however, the +regex is compiled so it will spit out a good deal of debugging output. Unless +you are intent on debugging the internals of the regex engine, this is likely +only of academic interest. + + const free : (re : regex# -> void) + +`free` must be called on a compiled regex to release it's resources after you +are finished using it. + + const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]) + +`exec` runs the regex over the specified text, returning an `\`std.Some matches` +if the text matched, or `std.None` if the text did not match. matches[0] is +always the full text that was matched, and will always be returned regardless +of whether capture groups are specified. + + const search : (re : regex#, str : byte[:] -> std.option(byte[:][:])) + +`search` searches for a matching sub-segment of the regex over the specified +text, returning an `\`std.Some matches` if the text matched, or `std.None` if +the text did not match. matches[0] is always the full text that was matched, +and will always be returned regardless of whether capture groups are +specified. `search` returns the the earliest match in the string provided. + + + const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:])) + const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool) + +`sub` will take a pattern, an input string, and a set of substitutions, and +attempt to match. If the match is successful, it will replace each group +within `str` with `subst`, returning a freshly allocated string. `sbsub` +behaves identically, however it inserts the new string into the string +buffer provided, instead of allocating a new string. + +If there is no match, then `\`std.None` will be returned. + + const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:]) + const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void) + +`suball` replaces every match within the string using the given substitutions. +Only captured groups will be substituted. The remaining text will be left in +place. + +Example +------ + +#### Pattern matching + +```{runmyr regex} +use std +use regex + +const main = { + match regex.compile("ab(c+)") + | `std.Ok re: runwith(re, "abccc") + | `std.Fail m: std.fatal("Failed to compile regex\n") + ;; +} + +const runwith = {re, txt + match regex.exec(re, txt) + | `std.Some matches: + std.put("matched {}, got {} matches\n", txt, matches.len) + for m in matches + std.put("Match: {}\n", m) + ;; + regex.matchfree(matches) + | `std.None: + std.put("%s did not match\n") + ;; +} +``` + +#### Substitution + +```{runmyr regex} +use std +use regex + +const main = { + var re + + re = std.try(regex.compile("(a*)bc(d)e")) + match regex.sub(re, "aaabcdef", ["HEY", "X"][:]) + | `std.Some sub: + std.put("{}\n", sub[0]) + regex.matchfree(matches) + | `std.None: + std.fatal("should have matched") + ;; +} +``` + +```{runmyr regex} +use std +use regex + +const main = { + var re, sub + + re = std.try(regex.compile("(b|e)")) + sub = regex.suball(re, "aaabbbcdef", ["SUB"][:]) + std.put("subst: {}\n", sub) + std.slfree(sub) +} +``` |