summaryrefslogtreecommitdiff
path: root/doc/api/libregex/index.txt
blob: 5cf4716fbe88445ae4107cd65fba8c95d1057544 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
{
	title: libregex
	description: Libregex API documentation.
}

Summary
-------

	pkg regex =
                type ast = union
                        /* basic string building */
                        `Alt	(ast#, ast#)
                        `Cat	(ast#, ast#)

                        /* repetition */
                        `Star	ast#
                        `Rstar  ast#
                        `Plus	ast#
                        `Rplus	ast#
                        `Quest	ast#	

                        /* end matches */
                        `Chr	char
                        `Ranges	char[2][:]

                        /* meta */
                        `Cap	(std.size, ast#) /* id, ast */
                        `Bol	/* beginning of line */
                        `Eol	/* end of line */
                        `Bow	/* beginning of word */
                        `Eow	/* end of word */
                ;;

                type status = union
                        `Noimpl
                        `Incomplete
                        `Unbalanced char
                        `Emptyparen
                        `Badrep char
                        `Badrange byte[:]
                        `Badescape char
                ;;

	/* regex compilation */
        const parse	: (re : byte[:]	-> std.result(ast#, status))
        const compile	: (re : byte[:] -> std.result(regex#, status))
        const dbgcompile	: (re : byte[:] -> std.result(regex#, status))
        const free	: (re : regex# -> void)

        /* regex execution */
        const exec	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))
        const search	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

        const sub	: (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
        const sbsub	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
        const suball	: (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
        const sbsuball	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)

        const matchfree	: (pat : byte[:][:] -> void)
    ;;


Overview
--------

Libregex is a simple regex API that uses a parallel NFA implementation. This
means that while it is not blazingly fast, it does not exhibit pathological
behavior on regexes like `(aa|aab?)\*` that many common regex APIs will see.

Regex Syntax
-------------

The grammar for regexes that are accepted is sketched out below.

           regex       : altexpr
           altexpr     : catexpr ('|' altexpr)?
           catexpr     : repexpr (catexpr)?
           repexpr     : baseexpr[*+?][?]
           baseexpr    : literal
                       | charclass
                       | charrange
                       | '.'
                       | '^'
                       | '$'
                       | '(' regex ')'
           charclass   : see below
           charrange   : '[' (literal('-' literal)?)+']'

The following metacharacters have the meanings listed below:

<table>
	<tr><th>Metachar</th>            <th>Description</th></tr>
	<tr><td><code>.</code></td>      <td>Matches a single Unicode character.</td></tr>
	<tr><td><code>^</code></td>      <td>Matches the beginning of a line. Does not consume any characters.</td></tr>
	<tr><td><code>$</code></td>      <td>Matches the end of a line. Does not consume any characters.</td></tr>
	<tr><td><code>*</code></td>      <td>Matches any number of repetitions of the preceding regex fragment.</td></tr>
	<tr><td><code>+</code></td>      <td>Matches one or more repetitions of the preceding regex fragment.</td></tr>
	<tr><td><code>?</code></td>      <td>Matches zero or one of the preceding regex fragment.</td></tr>
</table>

In order to match a literal metacharacter, it needs to be preceded by a '\' character.

The following character classes are supported:

<table>
	<tr><th>Charclass</th>           <th>Description</th></tr>
	<tr><td><code>\d </code></td>    <td>ASCII digits</td></tr>
	<tr><td><code>\D </code></td>    <td>Negation of ASCII digits</td></tr>
	<tr><td><code>\x </code></td>    <td>ASCII Hex digits</td></tr>
	<tr><td><code>\X </code></td>    <td>Negation of ASCII Hex digits</td></tr>
	<tr><td><code>\s </code></td>    <td>ASCII spaces</td></tr>
	<tr><td><code>\S </code></td>    <td>Negation of ASCII spaces</td></tr>
	<tr><td><code>\w </code></td>    <td>ASCII word characters</td></tr>
	<tr><td><code>\W </code></td>    <td>Negation of ASCII word characters</td></tr>
	<tr><td><code>\h </code></td>    <td>ASCII whitespace characters</td></tr>
	<tr><td><code>\H </code></td>    <td>Negation of ASCII whitespace characters</td></tr>
	<tr><td><code>\pX</code></td>    <td>Characters with unicode property 'X'</td></tr>
	<tr><td><code>\PX</code></td>    <td>Negation of characters with property 'X'</td></tr>
</table>

The current list of supported Unicode character classes `X` is:

<table>
	<tr><th>Abbrev</th> <th>Full name</th>      <th>Description</th></tr>
	<tr>
		<td><code>L</code></td>  <td><code>Letter</code></td>
		<td>All letters, including lowercase, uppercase, titlecase,
		and uncased.</td>
	</tr>
	<tr>
		<td><code>Lu</code></td> <td><code>Uppercase_Letter</code></td>
		<td>All uppercase letters.</td>
	</tr>
	<tr>
		<td><code>Ll</code></td> <td><code>Lowercase_Letter</code></td>
		<td>All lowercase letters.</td>
	</tr>
	<tr>
		<td><code>Lt</code></td> <td><code>Titlecase_Letter</code></td>
		<td>All titlecase letters.</td>
	</tr>
	<tr>
		<td><code>N</code></td>  <td><code>Number</code></td>
		<td>All numbers.</td>
	</tr>
	<tr>
		<td><code>Z</code></td>  <td><code>Separator</code></td>
		<td>All separators, including spaces and control characters.</td>
	</tr>
	<tr>
		<td><code>Zs</code></td> <td><code>Space_Separator</code></td>
		<td>All space separators, including tabs and ASCII spaces.</td>
        </tr>
</table>

Functions
---------

    const parse	: (re : byte[:]	-> std.result(ast#, status))

`parse` takes a regex string and converts it to a regex syntax tree, returning
`\`std.Ok ast` if the regex was valid, or `\`std.Fail r` with the reason that
the regex could not be parsed. This AST can be used to further process the
regex, possibly turning it into a multiregex as in Hairless, or using it for
NFA and DFA tricks.

    const compile	: (re : byte[:] -> std.result(regex#, status))
    const dbgcompile	: (re : byte[:] -> std.result(regex#, status))

`compile` takes a regex string and converts it to a compiled regex, returning
`\`std.Ok regex` if the regex was valid, or `\`std.Fail r` with the reason
that the compilation failed. `dbgcompile` is similar, but the regex is
compiled so that it will spit out a good deal of debugging output. Unless
you are intent on debugging the internals of the regex engine, this is likely
only of academic interest.

    const free	: (re : regex# -> void)

`free` must be called on a compiled regex to release its resources after you
are finished using it.

    const exec	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

`exec` runs the regex over the specified text, returning `\`std.Some matches`
if the text matched, or `\`std.None` if the text did not match. `matches[0]` is
always the full text that was matched, and will always be returned regardless
of whether capture groups are specified.

    const search	: (re : regex#, str : byte[:] -> std.option(byte[:][:]))

`search` searches the specified text for a sub-segment that matches the regex,
returning `\`std.Some matches` if a match was found, or `\`std.None` if no
part of the text matched. `matches[0]` is always the full text that was
matched, and will always be returned regardless of whether capture groups are
specified. `search` returns the earliest match in the string provided.


    const sub	: (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
    const sbsub	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)

`sub` takes a pattern, an input string, and a set of substitutions, and
attempts to match. If the match is successful, it replaces each capture group
within `str` with the corresponding entry of `subst`, returning a freshly
allocated string. `sbsub` behaves identically, except that it inserts the new
string into the string buffer provided instead of allocating a new string.

If there is no match, then `\`std.None` will be returned.

    const suball	: (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
    const sbsuball	: (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)

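`suball` replaces every match within the string using the given substitutions.
Only captured groups will be substituted. The remaining text will be left in
place. `sbsuball` behaves identically, except that it inserts the result into
the string buffer provided instead of returning a new string.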

Example
------

#### Pattern matching

```{runmyr regex}
use std
use regex

/*
  Compiles a pattern with one capture group and runs it against a sample
  string. The compiled regex is released with regex.free once we are done
  with it.
*/
const main = {
	match regex.compile("ab(c+)")
	| `std.Ok re:
		runwith(re, "abccc")
		/* a compiled regex must be freed after use */
		regex.free(re)
	| `std.Fail m:	std.fatal("Failed to compile regex\n")
	;;
}

/*
  Runs the compiled regex `re` over `txt` and prints each returned match.
  matches[0] is the full matched text; any remaining entries come from
  capture groups. Prints a notice if the text does not match.
*/
const runwith = {re, txt
	match regex.exec(re, txt)
	| `std.Some matches:
		std.put("matched {}, got {} matches\n", txt, matches.len)
		for m in matches
			std.put("Match: {}\n", m)
		;;
		/* release the match list once we are done with it */
		regex.matchfree(matches)
	| `std.None:
		/* use the `{}` format specifier and actually pass the text */
		std.put("{} did not match\n", txt)
	;;
}
```

#### Substitution

```{runmyr regex}
use std
use regex

/*
  Substitutes the two capture groups of the pattern with "HEY" and "X"
  and prints the resulting string.
*/
const main = {
	var re

	re = std.try(regex.compile("(a*)bc(d)e"))
	match regex.sub(re, "aaabcdef", ["HEY", "X"][:])
	| `std.Some sub:
		/* `sub` is the substituted string itself, not a list of matches */
		std.put("{}\n", sub)
		/* the result is freshly allocated, so free it as a slice */
		std.slfree(sub)
	| `std.None:
		std.fatal("should have matched")
	;;
	/* a compiled regex must be freed after use */
	regex.free(re)
}
```

```{runmyr regex}
use std
use regex

/*
  Replaces every captured `b` or `e` in the input with "SUB" and prints
  the result.
*/
const main = {
	var re, sub

	re = std.try(regex.compile("(b|e)"))
	sub = regex.suball(re, "aaabbbcdef", ["SUB"][:])
	std.put("subst: {}\n", sub)
	/* release the returned string and the compiled regex */
	std.slfree(sub)
	regex.free(re)
}
```