diff options
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 123 |
1 files changed, 57 insertions, 66 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0d468889a4..3a13aeddb8 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -48,6 +48,7 @@ #include "nvim/getchar.h" #include "nvim/gettext.h" #include "nvim/globals.h" +#include "nvim/grid.h" #include "nvim/grid_defs.h" #include "nvim/iconv.h" #include "nvim/keycodes.h" @@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2) return arabic_combine(utf_ptr2char(p1), c2); } -/// Convert a UTF-8 string to a wide character +/// Get the screen char at the beginning of a string /// -/// Also gets up to #MAX_MCO composing characters. +/// Caller is expected to check for things like unprintable chars etc +/// If first char in string is a composing char, prepend a space to display it correctly. /// -/// @param[out] pcc Location where to store composing characters. Must have -/// space at least for #MAX_MCO + 1 elements. +/// If "p" starts with an invalid sequence, zero is returned. /// -/// @return leading character. -int utfc_ptr2char(const char *p, int *pcc) +/// @param[out] firstc (required) The first codepoint of the screen char, +/// or the first byte of an invalid sequence +/// +/// @return the char +schar_T utfc_ptr2schar(const char *p, int *firstc) + FUNC_ATTR_NONNULL_ALL { - int i = 0; - int c = utf_ptr2char(p); - int len = utf_ptr2len(p); - - // Only accept a composing char when the first char isn't illegal. - if ((len > 1 || (uint8_t)(*p) < 0x80) - && (uint8_t)p[len] >= 0x80 - && utf_composinglike(p, p + len)) { - int cc = utf_ptr2char(p + len); - while (true) { - pcc[i++] = cc; - if (i == MAX_MCO) { - break; - } - len += utf_ptr2len(p + len); - if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) { - break; - } - } - } + *firstc = c; // NOT optional, you are gonna need it + bool first_compose = utf_iscomposing(c); + size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); - if (i < MAX_MCO) { // last composing char must be 0 - pcc[i] = 0; + if (len == 1 && (uint8_t)(*p) >= 0x80) { + return 0; // invalid sequence } - return c; + return schar_from_buf_first(p, len, first_compose); } -// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO -// composing characters. Use no more than p[maxlen]. -// -// @param [out] pcc: composing chars, last one is 0 -int utfc_ptr2char_len(const char *p, int *pcc, int maxlen) +/// Get the screen char at the beginning of a string with length +/// +/// Like utfc_ptr2schar but use no more than p[maxlen]. +schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) + FUNC_ATTR_NONNULL_ALL { assert(maxlen > 0); - int i = 0; + size_t len = (size_t)utf_ptr2len_len(p, maxlen); + if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + // invalid or truncated sequence + *firstc = (uint8_t)(*p); + return 0; + } - int len = utf_ptr2len_len(p, maxlen); - // Is it safe to use utf_ptr2char()? - bool safe = len > 1 && len <= maxlen; - int c = safe ? utf_ptr2char(p) : (uint8_t)(*p); + int c = utf_ptr2char(p); + *firstc = c; + bool first_compose = utf_iscomposing(c); + maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); + len = (size_t)utfc_ptr2len_len(p, maxlen); - // Only accept a composing char when the first char isn't illegal. - if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) { - for (; i < MAX_MCO; i++) { - int len_cc = utf_ptr2len_len(p + len, maxlen - len); - safe = len_cc > 1 && len_cc <= maxlen - len; - if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 - || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) { - break; - } - len += len_cc; - } - } + return schar_from_buf_first(p, len, first_compose); +} - if (i < MAX_MCO) { - // last composing char must be 0 - pcc[i] = 0; +/// Caller must ensure there is space for `first_compose` +static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose) +{ + if (first_compose) { + char cbuf[MAX_SCHAR_SIZE]; + cbuf[0] = ' '; + memcpy(cbuf + 1, buf, len); + return schar_from_buf(cbuf, len + 1); + } else { + return schar_from_buf(buf, len); } - - return c; -#undef ISCOMPOSING } /// Get the length of a UTF-8 byte sequence representing a single codepoint @@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p) return 1; } - // Check for composing characters. We can handle only the first six, but - // skip all of them (otherwise the cursor would get stuck). + // Check for composing characters. int prevlen = 0; while (true) { if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { @@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in) /// Return the offset from "p" to the first byte of the codepoint it points /// to. Can start anywhere in a stream of bytes. /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately and returns a negative offset. +/// separately. /// /// @param[in] base Pointer to start of string /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint // -/// @return 0 if invalid sequence, else offset to previous codepoint +/// @return 0 if invalid sequence, else number of bytes to previous codepoint int utf_cp_head_off(const char *base, const char *p) { int i; @@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p) } // Find the first character that is not 10xx.xxxx - for (i = 0; p - i > base; i--) { - if (((uint8_t)p[i] & 0xc0) != 0x80) { + for (i = 0; p - i >= base; i++) { + if (((uint8_t)p[-i] & 0xc0) != 0x80) { break; } } - // Find the last character that is 10xx.xxxx - for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {} + // Find the last character that is 10xx.xxxx (condition terminates on NUL) + int j = 1; + while (((uint8_t)p[j] & 0xc0) == 0x80) { + j++; + } // Check for illegal sequence. - if (utf8len_tab[(uint8_t)p[i]] == 1) { + if (utf8len_tab[(uint8_t)p[-i]] != j + i) { return 0; } return i; |