diff options
author | VanaIgr <vanaigranov@gmail.com> | 2024-02-26 04:12:55 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-26 18:12:55 +0800 |
commit | ad5a155b1f4b387d3aaa54c91d0146cb0287bb9f (patch) | |
tree | ae35dff22d4f418f040d39acc88206e64ffb1984 /src/nvim/mbyte.c | |
parent | 8b4e26915612caf2d143edca31919cae18a848a1 (diff) | |
download | rneovim-ad5a155b1f4b387d3aaa54c91d0146cb0287bb9f.tar.gz rneovim-ad5a155b1f4b387d3aaa54c91d0146cb0287bb9f.tar.bz2 rneovim-ad5a155b1f4b387d3aaa54c91d0146cb0287bb9f.zip |
fix(mbyte): fix bugs in utf_cp_*_off() functions
Problems:
- Illegal bytes after valid UTF-8 char cause utf_cp_*_off() to fail.
- When stream isn't NUL-terminated, utf_cp_*_off() may go over the end.
Solution: Don't go over the end of the char or the end of the string.
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 111 |
1 files changed, 32 insertions, 79 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index f8451e62e2..cf206aa68b 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp) *fp += l; } -/// Return the offset from "p_in" to the first byte of a character. When "p_in" is -/// at the start of a character 0 is returned, otherwise the offset to the next -/// character. Can start anywhere in a stream of bytes. -int mb_off_next(const char *base, const char *p_in) +/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes +/// of the codepoint it points to. +/// "p_in" can point anywhere in a stream of bytes. +/// "p_len" limits number of bytes after "p_in". +/// Note: Counts individual codepoints of composed characters separately. +CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len) + FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL { - const uint8_t *p = (uint8_t *)p_in; - int i; - - if (*p < 0x80) { // be quick for ASCII - return 0; + assert(base <= p_in && p_len > 0); + uint8_t const *const b = (uint8_t *)base; + uint8_t const *const p = (uint8_t *)p_in; + if (*p < 0x80U) { // be quick for ASCII + return (CharBoundsOff){ 0, 1 }; } - // Find the next character that isn't 10xx.xxxx - for (i = 0; (p[i] & 0xc0) == 0x80; i++) {} - if (i > 0) { - int j; - // Check for illegal sequence. - for (j = 0; p - j > (uint8_t *)base; j++) { - if ((p[-j] & 0xc0) != 0x80) { - break; - } - } - if (utf8len_tab[p[-j]] != i + j) { - return 0; + int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1); + int first_off = 0; + for (; utf_is_trail_byte(p[first_off]); first_off--) { + if (first_off == max_first_off) { // failed to find first byte + return (CharBoundsOff){ 0, 1 }; } } - return i; -} -/// Return the offset from `p_in` to the last byte of the codepoint it points -/// to. Can start anywhere in a stream of bytes. -/// Note: Counts individual codepoints of composed characters separately. 
-int utf_cp_tail_off(const char *base, const char *p_in) -{ - const uint8_t *p = (uint8_t *)p_in; - int i; - int j; - - if (*p == NUL) { - return 0; + int const max_end_off = utf8len_tab[p[first_off]] + first_off; + if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence + return (CharBoundsOff){ 0, 1 }; } - // Find the last character that is 10xx.xxxx - for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} - - // Check for illegal sequence. - for (j = 0; p_in - j > base; j++) { - if ((p[-j] & 0xc0) != 0x80) { - break; + for (int end_off = 1; end_off < max_end_off; end_off++) { + if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes + return (CharBoundsOff){ 0, 1 }; } } - if (utf8len_tab[p[-j]] != i + j + 1) { - return 0; - } - return i; + return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off }; } -/// Return the offset from "p" to the first byte of the codepoint it points -/// to. Can start anywhere in a stream of bytes. -/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately. -/// -/// @param[in] base Pointer to start of string -/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint -// -/// @return 0 if invalid sequence, else number of bytes to previous codepoint -int utf_cp_head_off(const char *base, const char *p) +/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes +/// of the codepoint it points to. +/// "p_in" can point anywhere in a stream of bytes. +/// Stream must be NUL-terminated. +/// Note: Counts individual codepoints of composed characters separately. 
+CharBoundsOff utf_cp_bounds(char const *base, char const *p_in) + FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL { - int i; - - if (*p == NUL) { - return 0; - } - - // Find the first character that is not 10xx.xxxx - for (i = 0; p - i >= base; i++) { - if (((uint8_t)p[-i] & 0xc0) != 0x80) { - break; - } - } - - // Find the last character that is 10xx.xxxx (condition terminates on NUL) - int j = 1; - while (((uint8_t)p[j] & 0xc0) == 0x80) { - j++; - } - - // Check for illegal sequence. - if (utf8len_tab[(uint8_t)p[-i]] != j + i) { - return 0; - } - return i; + return utf_cp_bounds_len(base, p_in, INT_MAX); } // Find the next illegal byte sequence. |