diff options
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 39 |
1 files changed, 22 insertions, 17 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 07bf574c17..df490ff3c9 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in) } /// Convert a UTF-8 byte sequence to a character number. -/// Doesn't handle ascii! only multibyte and illegal sequences. +/// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL) +/// are treated like illegal sequences. /// /// @param[in] p String to convert. /// @param[in] len Length of the character in bytes, 0 or 1 if illegal. /// -/// @return Unicode codepoint. A negative value when the sequence is illegal. +/// @return Unicode codepoint. A negative value when the sequence is illegal (or +/// ASCII, including NUL). int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len) FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT { @@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in) start--; } - uint8_t cur_len = utf8len_tab[*start]; - int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len); - if (cur_code < 0) { + const uint8_t last_len = utf8len_tab[*start]; + int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len); + if (cur_code < 0 || p - start >= last_len) { return 0; // p must be part of an illegal sequence } - const uint8_t * const safe_end = start + cur_len; + const uint8_t * const safe_end = start + last_len; int cur_bc = utf8proc_get_property(cur_code)->boundclass; - if (always_break(cur_bc)) { + if (always_break(cur_bc) || start == base) { return (int)(p - start); } @@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in) const uint8_t *cur_pos = start; const uint8_t *const p_start = start; - if (start == base) { - return (int)(p - start); - } + while (true) { + if (start[-1] == NUL) { + break; + } + + start--; + if (*start < 0x80) { // stop on ascii, we are done + break; + } - start--; - while (*start >= 0x80) { // stop on ascii, we are done while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) { start--; } - int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]); - if (prev_code < 0) { + int prev_len = utf8len_tab[*start]; + int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len); + if (prev_code < 0 || prev_len < cur_pos - start) { start = cur_pos; // start at valid sequence after invalid bytes break; } @@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in) cur_pos = start; cur_bc = prev_bc; cur_code = prev_code; - - start--; } // hot path: we are already on the first codepoint of a sequence - if (start == p_start) { + if (start == p_start && last_len > p - start) { return (int)(p - start); } |