summaryrefslogtreecommitdiff
path: root/doc/api/libstd/unicode.txt
diff options
context:
space:
mode:
Diffstat (limited to 'doc/api/libstd/unicode.txt')
-rw-r--r--doc/api/libstd/unicode.txt133
1 files changed, 133 insertions, 0 deletions
diff --git a/doc/api/libstd/unicode.txt b/doc/api/libstd/unicode.txt
new file mode 100644
index 0000000..31ec714
--- /dev/null
+++ b/doc/api/libstd/unicode.txt
@@ -0,0 +1,133 @@
+{
+ title: Unicode
+ description: libstd: Unicode
+}
+
+Unicode
+--------
+
+ pkg std =
+ const Badchar : char
+ const Maxcharlen : size
+ const Maxcharval : char
+
+ /* iterators */
+ impl iterable chariter -> char
+
+ const chariter : (byte[:] -> chariter)
+
+ /* utf8 information */
+ const charlen : (chr : char -> size)
+ const encode : (buf : byte[:], chr : char -> size)
+ const decode : (buf : byte[:] -> char)
+ const strstep : (str : byte[:] -> (char, byte[:]))
+
+ /* character class predicates */
+ const isalpha : (c : char -> bool)
+ const isdigit : (c : char -> bool)
+ const isxdigit : (c : char -> bool)
+ const isnum : (c : char -> bool)
+ const isalnum : (c : char -> bool)
+ const isspace : (c : char -> bool)
+ const isblank : (c : char -> bool)
+ const islower : (c : char -> bool)
+ const isupper : (c : char -> bool)
+ const istitle : (c : char -> bool)
+
+ /* character class conversions */
+ const tolower : (c : char -> char)
+ const toupper : (c : char -> char)
+ const totitle : (c : char -> char)
+ ;;
+
+Summary
+-------
+
+As a reminder, Myrddin characters hold a single Unicode codepoint, and all
+strings are assumed to be encoded in UTF-8 by default. These functions are
+designed to facilitate manipuating unicode strings and codepoints.
+
+The APIs are generally designed that strings will be streamed through, and
+not encoded or decoded wholesale.
+
+Constants
+---------
+ const Badchar : char
+
+This is a character value that is not, and will never be, a valid unicode
+codepoint. This is generally returned when we encounter an error fr
+
+ const Maxcharlen : size
+
+This is a constant defining the maximum number of bytes that a character may
+be decoded into. It's guaranteed that a buffer that is at least Maxcharlen
+bytes long will be able to contain any character.
+
+ const Maxcharval : char
+
+This is the maximum value that any valid future unicode codepoint may decode
+into. Any character that is greater than this is an invalid character.
+
+Functions: Iterating over strings
+--------------------------------
+
+ impl iterable chariter -> char
+
+ const chariter : (byte[:] -> chariter)
+
+Chariter returns an iterator which steps through a string character by
+character.
+
+Functions: Encoding and Decoding
+--------------------------------
+
+ const charlen : (chr : char -> size)
+
+Charlen returns the length in bytes that decoding the character provided into
+unicode would take. This can vary between 1 and Maxcharlen bytes.
+
+ const encode : (buf : byte[:], chr : char -> size)
+
+Encode takes a single character, and encodes it to a utf8 string. The buffer
+must be at least long enough to hold the character.
+
+Returns: The number of bytes written, or -1 if the character could not be
+encoded.
+
+ const decode : (buf : byte[:] -> char)
+
+Decode converts the head of the buffer `buf` to a single unicode codepoint,
+returning the codepoint itself, or `Badchar` if the codepoint is invalid.
+
+The tail of the buffer is not considered, allowing this function to be used
+to peek at the contents of a string.
+
+ const strstep : (str : byte[:] -> (char, byte[:]))
+
+strstep is a function for stepping through unicode encoded strings. It
+returns the tuple (`Badchar`, str[1:]) if the value cannot be decoded,
+or `(charval, str[std.charlen(charval):])` therwise.
+
+
+```{runmyr striter}
+ s = "abcd"
+ while s.len != 0
+ (c, s) = std.striter(s)
+ std.put("next char is {}\n", s)
+ ;;
+```
+
+Character Classes
+-----------------
+
+ const isalpha : (c : char -> bool)
+ const isdigit : (c : char -> bool)
+ const isxdigit : (c : char -> bool)
+ const isnum : (c : char -> bool)
+ const isalnum : (c : char -> bool)
+ const isspace : (c : char -> bool)
+ const isblank : (c : char -> bool)
+ const islower : (c : char -> bool)
+ const isupper : (c : char -> bool)
+ const istitle : (c : char -> bool)
+