diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 44 |
1 files changed, 44 insertions, 0 deletions
@@ -586,6 +586,50 @@ utf8_strvis(char *dst, const char *src, size_t len, int flag) } /* + * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free + * the returned string. Anything not valid printable ASCII or UTF-8 is + * stripped. + */ +char * +utf8_sanitize(const char *src) +{ + char *dst; + size_t n; + int more; + struct utf8_data utf8data; + u_int i; + + dst = NULL; + + n = 0; + while (*src != '\0') { + dst = xreallocarray(dst, n + 1, sizeof *dst); + if (utf8_open(&utf8data, *src)) { + more = 1; + while (*++src != '\0' && more) + more = utf8_append(&utf8data, *src); + if (!more) { + dst = xreallocarray(dst, n + utf8data.width, + sizeof *dst); + for (i = 0; i < utf8data.width; i++) + dst[n++] = '_'; + continue; + } + src -= utf8data.have; + } + if (*src > 0x1f && *src < 0x7f) + dst[n] = *src; + src++; + + n++; + } + + dst = xreallocarray(dst, n + 1, sizeof *dst); + dst[n] = '\0'; + return (dst); +} + +/* * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. * Caller frees. */ |