aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c39
1 files changed, 22 insertions, 17 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 07bf574c17..df490ff3c9 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in)
}
/// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL)
+/// are treated like illegal sequences.
///
/// @param[in] p String to convert.
/// @param[in] len Length of the character in bytes, 0 or 1 if illegal.
///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+/// ASCII, including NUL).
int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
{
@@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in)
start--;
}
- uint8_t cur_len = utf8len_tab[*start];
- int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
- if (cur_code < 0) {
+ const uint8_t last_len = utf8len_tab[*start];
+ int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+ if (cur_code < 0 || p - start >= last_len) {
return 0; // p must be part of an illegal sequence
}
- const uint8_t * const safe_end = start + cur_len;
+ const uint8_t * const safe_end = start + last_len;
int cur_bc = utf8proc_get_property(cur_code)->boundclass;
- if (always_break(cur_bc)) {
+ if (always_break(cur_bc) || start == base) {
return (int)(p - start);
}
@@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in)
const uint8_t *cur_pos = start;
const uint8_t *const p_start = start;
- if (start == base) {
- return (int)(p - start);
- }
+ while (true) {
+ if (start[-1] == NUL) {
+ break;
+ }
+
+ start--;
+ if (*start < 0x80) { // stop on ascii, we are done
+ break;
+ }
- start--;
- while (*start >= 0x80) { // stop on ascii, we are done
while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
start--;
}
- int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
- if (prev_code < 0) {
+ int prev_len = utf8len_tab[*start];
+ int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+ if (prev_code < 0 || prev_len < cur_pos - start) {
start = cur_pos; // start at valid sequence after invalid bytes
break;
}
@@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in)
cur_pos = start;
cur_bc = prev_bc;
cur_code = prev_code;
-
- start--;
}
// hot path: we are already on the first codepoint of a sequence
- if (start == p_start) {
+ if (start == p_start && last_len > p - start) {
return (int)(p - start);
}