diff options
-rw-r--r-- | src/nvim/globals.h | 23 | ||||
-rw-r--r-- | src/nvim/mbyte.c | 248 |
2 files changed, 137 insertions, 134 deletions
diff --git a/src/nvim/globals.h b/src/nvim/globals.h index 13ecafcbe3..86fff46737 100644 --- a/src/nvim/globals.h +++ b/src/nvim/globals.h @@ -725,29 +725,6 @@ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */ /// Encoding used when 'fencs' is set to "default" EXTERN char_u *fenc_default INIT(= NULL); -// To speed up BYTELEN(); keep a lookup table to quickly get the length in -// bytes of a UTF-8 character from the first byte of a UTF-8 string. Bytes -// which are illegal when used as the first byte have a 1. The NUL byte has -// length 1. -EXTERN char utf8len_tab[256] INIT(= { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1, -}); - # if defined(USE_ICONV) && defined(DYNAMIC_ICONV) /* Pointers to functions and variables to be loaded at runtime */ EXTERN size_t (*iconv)(iconv_t cd, const char **inbuf, size_t *inbytesleft, diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index b24770a409..f65d7a6b13 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -72,19 +72,41 @@ struct interval { # include "unicode_tables.generated.h" #endif -/* - * Like utf8len_tab above, but using a zero for illegal lead bytes. - */ -const uint8_t utf8len_tab_zero[256] = -{ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0, +// To speed up BYTELEN(); keep a lookup table to quickly get the length in +// bytes of a UTF-8 character from the first byte of a UTF-8 string. Bytes +// which are illegal when used as the first byte have a 1. The NUL byte has +// length 1. +const uint8_t utf8len_tab[] = { + // ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A? + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B? + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C? + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D? + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E? + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1, // F? +}; + +// Like utf8len_tab above, but using a zero for illegal lead bytes. +const uint8_t utf8len_tab_zero[] = { + //1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 2 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 4 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 6 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 8 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // A + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // C + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0, // E }; /* @@ -528,45 +550,52 @@ int utf_off2cells(unsigned off, unsigned max_off) return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1; } -/* - * Convert a UTF-8 byte sequence to a wide character. - * If the sequence is illegal or truncated by a NUL the first byte is - * returned. - * Does not include composing characters, of course. - */ -int utf_ptr2char(const char_u *p) +/// Convert a UTF-8 byte sequence to a wide character +/// +/// If the sequence is illegal or truncated by a NUL then the first byte is +/// returned. Does not include composing characters for obvious reasons. +/// +/// @param[in] p String to convert. +/// +/// @return Unicode codepoint or byte value. +int utf_ptr2char(const char_u *const p) + FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT { - uint8_t len; - - if (p[0] < 0x80) /* be quick for ASCII */ + if (p[0] < 0x80) { // Be quick for ASCII. return p[0]; + } - len = utf8len_tab_zero[p[0]]; + const uint8_t len = utf8len_tab_zero[p[0]]; if (len > 1 && (p[1] & 0xc0) == 0x80) { - if (len == 2) + if (len == 2) { return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); + } if ((p[2] & 0xc0) == 0x80) { - if (len == 3) - return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) - + (p[2] & 0x3f); + if (len == 3) { + return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + + (p[2] & 0x3f)); + } if ((p[3] & 0xc0) == 0x80) { - if (len == 4) - return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) - + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f); + if (len == 4) { + return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) + + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); + } if ((p[4] & 0xc0) == 0x80) { - if (len == 5) - return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18) - + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6) - + (p[4] & 0x3f); - if ((p[5] & 0xc0) == 0x80 && len == 6) - return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24) - + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12) - + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f); + if (len == 5) { + return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18) + + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6) + + (p[4] & 0x3f)); + } + if ((p[5] & 0xc0) == 0x80 && len == 6) { + return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24) + + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12) + + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f)); + } } } } } - /* Illegal value, just return the first byte */ + // Illegal value: just return the first byte. return p[0]; } @@ -767,23 +796,24 @@ int utfc_char2bytes(int off, char_u *buf) return len; } -/* - * Get the length of a UTF-8 byte sequence, not including any following - * composing characters. - * Returns 0 for "". - * Returns 1 for an illegal byte sequence. - */ -int utf_ptr2len(const char_u *p) +/// Get the length of a UTF-8 byte sequence representing a single codepoint +/// +/// @param[in] p UTF-8 string. +/// +/// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte +/// sequence. +int utf_ptr2len(const char_u *const p) + FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL { - int len; - int i; - - if (*p == NUL) + if (*p == NUL) { return 0; - len = utf8len_tab[*p]; - for (i = 1; i < len; ++i) - if ((p[i] & 0xc0) != 0x80) + } + const int len = utf8len_tab[*p]; + for (int i = 1; i < len; i++) { + if ((p[i] & 0xc0) != 0x80) { return 1; + } + } return len; } @@ -824,38 +854,38 @@ int utf_ptr2len_len(const char_u *p, int size) return len; } -/* - * Return the number of bytes the UTF-8 encoding of the character at "p" takes. - * This includes following composing characters. - */ -int utfc_ptr2len(const char_u *p) +/// Return the number of bytes occupied by a UTF-8 character in a string +/// +/// This includes following composing characters. +int utfc_ptr2len(const char_u *const p) + FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL { - int len; - int b0 = *p; - int prevlen; + uint8_t b0 = (uint8_t)(*p); - if (b0 == NUL) + if (b0 == NUL) { return 0; - if (b0 < 0x80 && p[1] < 0x80) /* be quick for ASCII */ + } + if (b0 < 0x80 && p[1] < 0x80) { // be quick for ASCII return 1; + } - /* Skip over first UTF-8 char, stopping at a NUL byte. */ - len = utf_ptr2len(p); + // Skip over first UTF-8 char, stopping at a NUL byte. + int len = utf_ptr2len(p); - /* Check for illegal byte. */ - if (len == 1 && b0 >= 0x80) + // Check for illegal byte. + if (len == 1 && b0 >= 0x80) { return 1; + } - /* - * Check for composing characters. We can handle only the first six, but - * skip all of them (otherwise the cursor would get stuck). - */ - prevlen = 0; - for (;; ) { - if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) + // Check for composing characters. We can handle only the first six, but + // skip all of them (otherwise the cursor would get stuck). + int prevlen = 0; + for (;;) { + if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) { return len; + } - /* Skip over composing char */ + // Skip over composing char. prevlen = len; len += utf_ptr2len(p + len); } @@ -913,23 +943,22 @@ int utfc_ptr2len_len(const char_u *p, int size) return len; } -/* - * Return the number of bytes the UTF-8 encoding of character "c" takes. - * This does not include composing characters. - */ -int utf_char2len(int c) +/// Determine how many bytes certain unicode codepoint will occupy +int utf_char2len(const int c) { - if (c < 0x80) + if (c < 0x80) { return 1; - if (c < 0x800) + } else if (c < 0x800) { return 2; - if (c < 0x10000) + } else if (c < 0x10000) { return 3; - if (c < 0x200000) + } else if (c < 0x200000) { return 4; - if (c < 0x4000000) + } else if (c < 0x4000000) { return 5; - return 6; + } else { + return 6; + } } /// Convert Unicode character to UTF-8 string @@ -937,46 +966,42 @@ int utf_char2len(int c) /// @param c character to convert to \p buf /// @param[out] buf UTF-8 string generated from \p c, does not add \0 /// @return Number of bytes (1-6). Does not include composing characters. -int utf_char2bytes(int c, char_u *const buf) +int utf_char2bytes(const int c, char_u *const buf) { - if (c < 0x80) { /* 7 bits */ + if (c < 0x80) { // 7 bits buf[0] = c; return 1; - } - if (c < 0x800) { /* 11 bits */ + } else if (c < 0x800) { // 11 bits buf[0] = 0xc0 + ((unsigned)c >> 6); buf[1] = 0x80 + (c & 0x3f); return 2; - } - if (c < 0x10000) { /* 16 bits */ + } else if (c < 0x10000) { // 16 bits buf[0] = 0xe0 + ((unsigned)c >> 12); buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f); buf[2] = 0x80 + (c & 0x3f); return 3; - } - if (c < 0x200000) { /* 21 bits */ + } else if (c < 0x200000) { // 21 bits buf[0] = 0xf0 + ((unsigned)c >> 18); buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f); buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f); buf[3] = 0x80 + (c & 0x3f); return 4; - } - if (c < 0x4000000) { /* 26 bits */ + } else if (c < 0x4000000) { // 26 bits buf[0] = 0xf8 + ((unsigned)c >> 24); buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f); buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f); buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f); buf[4] = 0x80 + (c & 0x3f); return 5; + } else { // 31 bits + buf[0] = 0xfc + ((unsigned)c >> 30); + buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f); + buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f); + buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f); + buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f); + buf[5] = 0x80 + (c & 0x3f); + return 6; } - /* 31 bits */ - buf[0] = 0xfc + ((unsigned)c >> 30); - buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f); - buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f); - buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f); - buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f); - buf[5] = 0x80 + (c & 0x3f); - return 6; } /* @@ -1513,14 +1538,15 @@ int utf_head_off(const char_u *base, const char_u *p) return (int)(p - q); } -/* - * Copy a character from "*fp" to "*tp" and advance the pointers. - */ -void mb_copy_char(const char_u **fp, char_u **tp) +/// Copy a character, advancing the pointers +/// +/// @param[in,out] fp Source of the character to copy. +/// @param[in,out] tp Destination to copy to. +void mb_copy_char(const char_u **const fp, char_u **const tp) { - int l = (*mb_ptr2len)(*fp); + const size_t l = (size_t)utfc_ptr2len(*fp); - memmove(*tp, *fp, (size_t)l); + memmove(*tp, *fp, l); *tp += l; *fp += l; } |