summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorS. Gilles <sgilles@math.umd.edu>2017-11-05 04:11:27 -0500
committerOri Bernstein <ori@eigenstate.org>2017-11-05 20:52:12 -0800
commitf8f5b71ca85fed63d295f50e3f45ba3230edaead (patch)
tree2dc6c605daf0bb9735a3b197596a8ccae519029d /lib
parenta2a63e13a470401d10625a0d5be163767adb7e46 (diff)
downloadmc-f8f5b71ca85fed63d295f50e3f45ba3230edaead.tar.gz
Implement graphemestep
And change 'strstep' to 'charstep' for consistency, now that it has a sibling function.
Diffstat (limited to 'lib')
-rw-r--r--lib/date/fmt.myr4
-rw-r--r--lib/date/parse.myr10
-rw-r--r--lib/http/parse.myr2
-rw-r--r--lib/http/url.myr2
-rw-r--r--lib/std/cmp.myr4
-rw-r--r--lib/std/fmt.myr4
-rw-r--r--lib/std/hashfuncs.myr6
-rw-r--r--lib/std/optparse.myr2
-rw-r--r--lib/std/striter.myr2
-rw-r--r--lib/std/test/utf.myr73
-rw-r--r--lib/std/utf.myr33
11 files changed, 120 insertions, 22 deletions
diff --git a/lib/date/fmt.myr b/lib/date/fmt.myr
index 82f865b..39df830 100644
--- a/lib/date/fmt.myr
+++ b/lib/date/fmt.myr
@@ -40,9 +40,9 @@ const sbfmt = {sb, ap, opts
const datefmt = {sb, fmt, d
var c
while fmt.len != 0
- (c, fmt) = std.strstep(fmt)
+ (c, fmt) = std.charstep(fmt)
if c == '%'
- (c, fmt) = std.strstep(fmt)
+ (c, fmt) = std.charstep(fmt)
match c
| 'a': std.sbfmt(sb, "{}", _names.abbrevday[d.wday])
| 'A': std.sbfmt(sb, "{}", _names.fullday[d.wday])
diff --git a/lib/date/parse.myr b/lib/date/parse.myr
index 19798dd..7f2eaff 100644
--- a/lib/date/parse.myr
+++ b/lib/date/parse.myr
@@ -70,9 +70,9 @@ const filldate = {d, f, s, seen, err
z = ""
am = `std.None
while f.len != 0
- (fc, f) = std.strstep(f)
+ (fc, f) = std.charstep(f)
if fc == '%'
- (fc, f) = std.strstep(f)
+ (fc, f) = std.charstep(f)
if std.bshas(seen, fc)
err# = `std.Some `Doublefmt fc
-> s
@@ -120,7 +120,7 @@ const filldate = {d, f, s, seen, err
| _: std.fatal("unknown format character %c\n", fc)
;;
else
- (sc, s) = std.strstep(s)
+ (sc, s) = std.charstep(s)
if std.isspace(fc) && std.isspace(sc)
s = eatspace(s)
elif sc != fc
@@ -157,7 +157,7 @@ const eatspace = {s
var c
while std.isspace(std.decode(s))
- (c, s) = std.strstep(s)
+ (c, s) = std.charstep(s)
;;
-> s
}
@@ -246,7 +246,7 @@ generic intval = {dst : @a::(numeric,integral)#, s : byte[:], \
num = s
for i = 0; i < min; i++
- (c, s) = std.strstep(s)
+ (c, s) = std.charstep(s)
if !std.isdigit(c)
err# = `std.Some `Shortint
-> s
diff --git a/lib/http/parse.myr b/lib/http/parse.myr
index fabf7a2..a91da3a 100644
--- a/lib/http/parse.myr
+++ b/lib/http/parse.myr
@@ -258,7 +258,7 @@ const parsenumber = {ln, base
s = ln#
ok = false
while true
- (c, s) = std.strstep(s)
+ (c, s) = std.charstep(s)
dig = std.charval(c, base)
if dig >= 0 && dig < base
ok = true
diff --git a/lib/http/url.myr b/lib/http/url.myr
index be17d20..038305f 100644
--- a/lib/http/url.myr
+++ b/lib/http/url.myr
@@ -150,7 +150,7 @@ const parseparams = {url
;;
match std.decode(url#)
- | '?': (_, url#) = std.strstep(url#)
+ | '?': (_, url#) = std.charstep(url#)
| _: -> `std.Err `Egarbled
;;
diff --git a/lib/std/cmp.myr b/lib/std/cmp.myr
index 15c4994..c022b2e 100644
--- a/lib/std/cmp.myr
+++ b/lib/std/cmp.myr
@@ -60,8 +60,8 @@ const strcasecmp = {a, b
var ca, cb
while a.len > 0 && b.len > 0
- (ca, a) = std.strstep(a)
- (cb, b) = std.strstep(b)
+ (ca, a) = std.charstep(a)
+ (cb, b) = std.charstep(b)
ca = toupper(ca)
cb = toupper(cb)
if ca < cb
diff --git a/lib/std/fmt.myr b/lib/std/fmt.myr
index 1f542d3..48191e9 100644
--- a/lib/std/fmt.myr
+++ b/lib/std/fmt.myr
@@ -163,11 +163,11 @@ const sbfmtv = {sb, fmt, ap -> size
nparams = ap.tc.nelt
nfmt = 0
while fmt.len != 0
- (c, fmt) = strstep(fmt)
+ (c, fmt) = charstep(fmt)
match c
| '{':
if decode(fmt) == '{'
- (c, fmt) = strstep(fmt)
+ (c, fmt) = charstep(fmt)
sbputc(sb, '{')
else
(params, fmt) = getparams(fmt)
diff --git a/lib/std/hashfuncs.myr b/lib/std/hashfuncs.myr
index da47215..08a83aa 100644
--- a/lib/std/hashfuncs.myr
+++ b/lib/std/hashfuncs.myr
@@ -50,8 +50,8 @@ const strcaseeq = {a, b
if a.len == 0 || b.len == 0
break
;;
- (ca, a) = std.strstep(a)
- (cb, b) = std.strstep(b)
+ (ca, a) = std.charstep(a)
+ (cb, b) = std.charstep(b)
if std.tolower(ca) != std.tolower(cb)
-> false
;;
@@ -65,7 +65,7 @@ const strcasehash = {s
chars = [][:]
while s.len != 0
- (c, s) = std.strstep(s)
+ (c, s) = std.charstep(s)
std.slpush(&chars, std.tolower(c))
;;
h = siphash24(slbytes(chars), Seed)
diff --git a/lib/std/optparse.myr b/lib/std/optparse.myr
index 0569dfc..6202aa4 100644
--- a/lib/std/optparse.myr
+++ b/lib/std/optparse.myr
@@ -105,7 +105,7 @@ const optnext = {ctx
var c
var arg
- (c, ctx.curarg) = strstep(ctx.curarg)
+ (c, ctx.curarg) = charstep(ctx.curarg)
match optinfo(ctx, c)
| `None:
diff --git a/lib/std/striter.myr b/lib/std/striter.myr
index 3c81c98..0dd1b02 100644
--- a/lib/std/striter.myr
+++ b/lib/std/striter.myr
@@ -33,7 +33,7 @@ impl iterable chariter -> char =
if ci.rest.len == 0
-> false
;;
- (c#, ci.rest) = strstep(ci.rest)
+ (c#, ci.rest) = charstep(ci.rest)
-> true
}
diff --git a/lib/std/test/utf.myr b/lib/std/test/utf.myr
index 9f73f5a..8778b05 100644
--- a/lib/std/test/utf.myr
+++ b/lib/std/test/utf.myr
@@ -28,4 +28,77 @@ const main = {
"wrong width of runes")
std.assert(std.strcellwidth("𒀸 𒌋𒅗 𒆷 𒂅𒌒 𒍜 𒀭𒉌𒄿 𒈗 𒁁𒉌 𒋬") == 22, \
"wrong width of Cuneiform")
+
+ /* graphemestep() */
+ var s = "a史cЯx̀̀̀̀̀yz̉"
+ var sub, rest
+
+ (sub, rest) = std.graphemestep(s)
+ std.assert(std.streq(sub, "a"), "didn't get \"a\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "史"), "didn't get \"史\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "Я"), "didn't get \"Я\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "x̀̀̀̀̀"), "didn't get \"x̀̀̀̀̀\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "y"), "didn't get \"y\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "z̉"), "didn't get \"z̉\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+
+ /* with excessive combiners */
+ s = "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅo̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚s̓̍̍̄͏̖̞̟̱́͡͡͝"
+
+ (sub, rest) = std.graphemestep(s)
+ std.assert(std.streq(sub, "c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝"), "didn't get \"c̸̶̡̡̗̣͕̪͖ͯ͑̈̄̿͊ͣ̈́͝\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡"), "didn't get \"ḧ̵̸̛̥͚̭̣͈͖̼͈͓͓̫͍́̓ͪͫ̋͘͡\" as next grapheme, it was {}", rest)
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ"), "didn't get \"a̢̩̱̠̘̹̤̯͚̦̰̼̯̲̞͆͂̿ͬ̂͋͒̈ͅͅ\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚"), "didn't get \"o̷̷̶̥͖̼̮̳̗͚ͦ̉̆̅̃̍ͤ̆͑ͣ̽́̚\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "s̓̍̍̄͏̖̞̟̱́͡͡͝"), "didn't get \"s̓̍̍̄͏̖̞̟̱́͡͡͝\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(sub.len == 0, "didn't get \"\" as last grapheme")
+
+ /* now with invalid UTF-8 */
+ s = [ ('A' : byte), ('b' : byte), (0xFE : byte),
+ (0xFF : byte), (0x92 : byte), ('c' : byte) ][:]
+
+ (sub, rest) = std.graphemestep(s)
+ std.assert(std.streq(sub, "A"), "didn't get \"A\" as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "b"), "didn't get \"b\" as next grapheme")
+
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, [ (0xFE : byte) ][:]), "didn't get 0xEE, len={} as next grapheme", sub.len)
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, [ (0xFF : byte) ][:]), "didn't get 0xEA as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, [ (0x92 : byte) ][:]), "didn't get 0xEF as next grapheme")
+
+ (sub, rest) = std.graphemestep(rest)
+ std.assert(std.streq(sub, "c"), "didn't get \"c\" as next grapheme")
}
diff --git a/lib/std/utf.myr b/lib/std/utf.myr
index 439254c..9e297b3 100644
--- a/lib/std/utf.myr
+++ b/lib/std/utf.myr
@@ -12,7 +12,8 @@ pkg std =
const charlen : (chr : char -> size)
const encode : (buf : byte[:], chr : char -> size)
const decode : (buf : byte[:] -> char)
- const strstep : (str : byte[:] -> (char, byte[:]))
+ const charstep : (str : byte[:] -> (char, byte[:]))
+ const graphemestep : (str : byte[:] -> (byte[:], byte[:]))
const strcellwidth : (str : byte[:] -> size)
;;
@@ -59,11 +60,35 @@ const decode = {buf
var c
var b
- (c, b) = strstep(buf)
+ (c, b) = charstep(buf)
-> c
}
-const strstep = {str
+/*
+ * Splits the first grapheme off the front of `str` and returns
+ * (grapheme, remainder), both as slices of `str`.
+ *
+ * This is a cell-width heuristic rather than full Unicode segmentation:
+ * characters are accumulated until a second character with a positive
+ * cellwidth() is seen, so characters whose width is not positive
+ * (e.g. combining marks) stay attached to the run being built.
+ *
+ * A byte sequence that charstep() reports as Badchar terminates any run
+ * accumulated so far; if nothing has been accumulated, the single
+ * offending byte is returned as its own grapheme. An empty `str` yields
+ * two empty slices.
+ */
+const graphemestep = {str
+	var len = 0
+	var rest = str
+	var c
+	var cn
+	/* must start at zero: it is tested before it is first added to */
+	var width = 0
+
+	while rest.len > 0
+		(c, rest) = charstep(rest)
+		cn = cellwidth(c)
+
+		if c == Badchar
+			/*
+			 * Hand back whatever has been collected first, even if
+			 * it is all zero-width; slicing at byte 1 here would cut
+			 * through an already accepted multi-byte character.
+			 */
+			if len > 0
+				-> (str[:len], str[len:])
+			;;
+			/* nothing collected: the bad byte stands alone */
+			-> (str[:1], str[1:])
+		elif cn > 0 && width > 0
+			/* a second visible character starts the next grapheme */
+			-> (str[:len], str[len:])
+		;;
+
+		len += charlen(c)
+		width += cn
+	;;
+
+	-> (str[:len], str[len:])
+}
+
+const charstep = {str
var len
var mask
var chr
@@ -111,7 +136,7 @@ const strcellwidth = {str
var n : size = 0
while s.len > 0
- (c, s) = strstep(s)
+ (c, s) = charstep(s)
if c == Badchar
/* Something will probably be printed as U+FFFD */
n++