diff options
Diffstat (limited to 'src/nvim/mbyte.c')
| -rw-r--r-- | src/nvim/mbyte.c | 90 |
1 files changed, 59 insertions, 31 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 62cc3b56ed..12460646ed 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -346,7 +346,6 @@ static int enc_canon_search(const char_u *name) } - /* * Find canonical encoding "name" in the list and return its properties. * Returns 0 if not found. @@ -565,7 +564,7 @@ size_t mb_string2cells(const char_u *str) { size_t clen = 0; - for (const char_u *p = str; *p != NUL; p += (*mb_ptr2len)(p)) { + for (const char_u *p = str; *p != NUL; p += utfc_ptr2len(p)) { clen += utf_ptr2cells(p); } @@ -675,16 +674,16 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n) } if (k <= *n) { - /* We have a multibyte sequence and it isn't truncated by buffer - * limits so utf_ptr2char() is safe to use. Or the first byte is - * illegal (k=0), and it's also safe to use utf_ptr2char(). */ + // We have a multibyte sequence and it isn't truncated by buffer + // limits so utf_ptr2char() is safe to use. Or the first byte is + // illegal (k=0), and it's also safe to use utf_ptr2char(). c = utf_ptr2char(*s); - /* On failure, utf_ptr2char() returns the first byte, so here we - * check equality with the first byte. The only non-ASCII character - * which equals the first byte of its own UTF-8 representation is - * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. - * It's safe even if n=1, else we would have k=2 > n. */ + // On failure, utf_ptr2char() returns the first byte, so here we + // check equality with the first byte. The only non-ASCII character + // which equals the first byte of its own UTF-8 representation is + // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. + // It's safe even if n=1, else we would have k=2 > n. 
if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) { // byte sequence was successfully decoded *s += k; @@ -706,7 +705,7 @@ int mb_ptr2char_adv(const char_u **const pp) int c; c = utf_ptr2char(*pp); - *pp += (*mb_ptr2len)(*pp); + *pp += utfc_ptr2len(*pp); return c; } @@ -763,7 +762,7 @@ int utfc_ptr2char(const char_u *p, int *pcc) // Only accept a composing char when the first char isn't illegal. if ((len > 1 || *p < 0x80) && p[len] >= 0x80 - && UTF_COMPOSINGLIKE(p, p + len)) { + && utf_composinglike(p, p + len)) { cc = utf_ptr2char(p + len); for (;; ) { pcc[i++] = cc; @@ -792,9 +791,6 @@ int utfc_ptr2char(const char_u *p, int *pcc) */ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen) { -#define IS_COMPOSING(s1, s2, s3) \ - (i == 0 ? UTF_COMPOSINGLIKE((s1), (s2)) : utf_iscomposing((s3))) - assert(maxlen > 0); int i = 0; @@ -810,7 +806,7 @@ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen) int len_cc = utf_ptr2len_len(p + len, maxlen - len); safe = len_cc > 1 && len_cc <= maxlen - len; if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 - || !IS_COMPOSING(p, p + len, pcc[i])) { + || !(i == 0 ? utf_composinglike(p, p+len) : utf_iscomposing(pcc[i]))) { break; } len += len_cc; @@ -915,7 +911,7 @@ int utfc_ptr2len(const char_u *const p) // skip all of them (otherwise the cursor would get stuck). int prevlen = 0; for (;;) { - if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) { + if (p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { return len; } @@ -965,14 +961,14 @@ int utfc_ptr2len_len(const char_u *p, int size) /* * Next character length should not go beyond size to ensure that - * UTF_COMPOSINGLIKE(...) does not read beyond size. + * utf_composinglike(...) does not read beyond size. 
*/ len_next_char = utf_ptr2len_len(p + len, size - len); if (len_next_char > size - len) { break; } - if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) { + if (!utf_composinglike(p + prevlen, p + len)) { break; } @@ -1582,12 +1578,12 @@ void show_utf8(void) int clen; int i; - /* Get the byte length of the char under the cursor, including composing - * characters. */ + // Get the byte length of the char under the cursor, including composing + // characters. line = get_cursor_pos_ptr(); len = utfc_ptr2len(line); if (len == 0) { - MSG("NUL"); + msg("NUL"); return; } @@ -1610,7 +1606,7 @@ void show_utf8(void) } } - msg(IObuff); + msg((char *)IObuff); } /// Return offset from "p" to the first byte of the character it points into. @@ -1625,8 +1621,8 @@ int utf_head_off(const char_u *base, const char_u *p) return 0; } - /* Skip backwards over trailing bytes: 10xx.xxxx - * Skip backwards again if on a composing char. */ + // Skip backwards over trailing bytes: 10xx.xxxx + // Skip backwards again if on a composing char. const char_u *q; for (q = p;; --q) { // Move s to the last byte of this char. @@ -1883,6 +1879,40 @@ int mb_tail_off(char_u *base, char_u *p) return i; } + +/// Return the offset from "p" back to the first byte of the character it points +/// into (zero or negative). Can start anywhere in a stream of bytes. +/// +/// @param[in] base Pointer to start of string; bytes before it are never read +/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint +/// +/// @return 0 if "p" is at NUL or on an invalid sequence, else offset to previous codepoint +int mb_head_off(char_u *base, char_u *p) +{ + int i; + int j; + + if (*p == NUL) { + return 0; + } + + // Walk backwards (i <= 0) to the first byte that is not 10xx.xxxx; stop at "base" + for (i = 0; p + i > base; i--) { + if ((p[i] & 0xc0) != 0x80) { + break; + } + } + + // Find the last character that is 10xx.xxxx + for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {} + + // Check for illegal sequence. 
+ if (utf8len_tab[p[i]] == 1) { + return 0; + } + return i; +} + /* * Find the next illegal byte sequence. */ @@ -1915,8 +1945,8 @@ void utf_find_illegal(void) } while (*p != NUL) { - /* Illegal means that there are not enough trail bytes (checked by - * utf_ptr2len()) or too many of them (overlong sequence). */ + // Illegal means that there are not enough trail bytes (checked by + // utf_ptr2len()) or too many of them (overlong sequence). len = utf_ptr2len(p); if (*p >= 0x80 && (len == 1 || utf_char2len(utf_ptr2char(p)) != len)) { @@ -2021,7 +2051,7 @@ int mb_charlen(char_u *str) } for (count = 0; *p != NUL; count++) { - p += (*mb_ptr2len)(p); + p += utfc_ptr2len(p); } return count; @@ -2036,7 +2066,7 @@ int mb_charlen_len(char_u *str, int len) int count; for (count = 0; *p != NUL && p < str + len; count++) { - p += (*mb_ptr2len)(p); + p += utfc_ptr2len(p); } return count; @@ -2398,8 +2428,6 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen #endif // HAVE_ICONV - - /* * Setup "vcp" for conversion from "from" to "to". * The names must have been made canonical with enc_canonize(). |
