diff options
author | Thomas Adam <thomas@xteddy.org> | 2015-11-12 12:01:17 +0000 |
---|---|---|
committer | Thomas Adam <thomas@xteddy.org> | 2015-11-12 12:01:17 +0000 |
commit | 5f483499f3a7b98da9ac67cd62ed91034a5949ed (patch) | |
tree | de84187f104010233d96acc68bb9cc91d173c243 /utf8.c | |
parent | 333da3b64b4ce8c0343f082c3923473205ab2b27 (diff) | |
parent | 0cc812ae342d1a71c0337db8ffb4d7701668cb38 (diff) | |
download | rtmux-5f483499f3a7b98da9ac67cd62ed91034a5949ed.tar.gz rtmux-5f483499f3a7b98da9ac67cd62ed91034a5949ed.tar.bz2 rtmux-5f483499f3a7b98da9ac67cd62ed91034a5949ed.zip |
Merge branch 'obsd-master'
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 78 |
1 file changed, 76 insertions, 2 deletions
@@ -393,6 +393,8 @@ utf8_open(struct utf8_data *utf8data, u_char ch) int utf8_append(struct utf8_data *utf8data, u_char ch) { + /* XXX this should do validity checks too! */ + if (utf8data->have >= utf8data->size) fatalx("UTF-8 character overflow"); if (utf8data->size > sizeof utf8data->data) @@ -466,18 +468,46 @@ utf8_combine(const struct utf8_data *utf8data) case 3: value = utf8data->data[2] & 0x3f; value |= (utf8data->data[1] & 0x3f) << 6; - value |= (utf8data->data[0] & 0x0f) << 12; + value |= (utf8data->data[0] & 0xf) << 12; break; case 4: value = utf8data->data[3] & 0x3f; value |= (utf8data->data[2] & 0x3f) << 6; value |= (utf8data->data[1] & 0x3f) << 12; - value |= (utf8data->data[0] & 0x07) << 18; + value |= (utf8data->data[0] & 0x7) << 18; break; } return (value); } +/* Split a UTF-8 character. */ +int +utf8_split(u_int uc, struct utf8_data *utf8data) +{ + if (uc < 0x7f) { + utf8data->size = 1; + utf8data->data[0] = uc; + } else if (uc < 0x7ff) { + utf8data->size = 2; + utf8data->data[0] = 0xc0 | ((uc >> 6) & 0x1f); + utf8data->data[1] = 0x80 | (uc & 0x3f); + } else if (uc < 0xffff) { + utf8data->size = 3; + utf8data->data[0] = 0xe0 | ((uc >> 12) & 0xf); + utf8data->data[1] = 0x80 | ((uc >> 6) & 0x3f); + utf8data->data[2] = 0x80 | (uc & 0x3f); + } else if (uc < 0x1fffff) { + utf8data->size = 4; + utf8data->data[0] = 0xf0 | ((uc >> 18) & 0x7); + utf8data->data[1] = 0x80 | ((uc >> 12) & 0x3f); + utf8data->data[2] = 0x80 | ((uc >> 6) & 0x3f); + utf8data->data[3] = 0x80 | (uc & 0x3f); + } else + return (-1); + utf8data->width = utf8_width(utf8data); + return (0); +} + /* Split a two-byte UTF-8 character. */ u_int utf8_split2(u_int uc, u_char *ptr) @@ -555,6 +585,50 @@ utf8_strvis(char *dst, const char *src, size_t len, int flag) } /* + * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free + * the returned string. Anything not valid printable ASCII or UTF-8 is + * stripped. 
+ */ +char * +utf8_sanitize(const char *src) +{ + char *dst; + size_t n; + int more; + struct utf8_data utf8data; + u_int i; + + dst = NULL; + + n = 0; + while (*src != '\0') { + dst = xreallocarray(dst, n + 1, sizeof *dst); + if (utf8_open(&utf8data, *src)) { + more = 1; + while (*++src != '\0' && more) + more = utf8_append(&utf8data, *src); + if (!more) { + dst = xreallocarray(dst, n + utf8data.width, + sizeof *dst); + for (i = 0; i < utf8data.width; i++) + dst[n++] = '_'; + continue; + } + src -= utf8data.have; + } + if (*src > 0x1f && *src < 0x7f) + dst[n] = *src; + src++; + + n++; + } + + dst = xreallocarray(dst, n + 1, sizeof *dst); + dst[n] = '\0'; + return (dst); +} + +/* * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. * Caller frees. */ |