aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c69
1 files changed, 50 insertions, 19 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 62cc3b56ed..cc488d486f 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -346,7 +346,6 @@ static int enc_canon_search(const char_u *name)
}
-
/*
* Find canonical encoding "name" in the list and return its properties.
* Returns 0 if not found.
@@ -675,16 +674,16 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
}
if (k <= *n) {
- /* We have a multibyte sequence and it isn't truncated by buffer
- * limits so utf_ptr2char() is safe to use. Or the first byte is
- * illegal (k=0), and it's also safe to use utf_ptr2char(). */
+ // We have a multibyte sequence and it isn't truncated by buffer
+ // limits so utf_ptr2char() is safe to use. Or the first byte is
+ // illegal (k=0), and it's also safe to use utf_ptr2char().
c = utf_ptr2char(*s);
- /* On failure, utf_ptr2char() returns the first byte, so here we
- * check equality with the first byte. The only non-ASCII character
- * which equals the first byte of its own UTF-8 representation is
- * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
- * It's safe even if n=1, else we would have k=2 > n. */
+ // On failure, utf_ptr2char() returns the first byte, so here we
+ // check equality with the first byte. The only non-ASCII character
+ // which equals the first byte of its own UTF-8 representation is
+ // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
+ // It's safe even if n=1, else we would have k=2 > n.
if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
// byte sequence was successfully decoded
*s += k;
@@ -1582,12 +1581,12 @@ void show_utf8(void)
int clen;
int i;
- /* Get the byte length of the char under the cursor, including composing
- * characters. */
+ // Get the byte length of the char under the cursor, including composing
+ // characters.
line = get_cursor_pos_ptr();
len = utfc_ptr2len(line);
if (len == 0) {
- MSG("NUL");
+ msg("NUL");
return;
}
@@ -1610,7 +1609,7 @@ void show_utf8(void)
}
}
- msg(IObuff);
+ msg((char *)IObuff);
}
/// Return offset from "p" to the first byte of the character it points into.
@@ -1625,8 +1624,8 @@ int utf_head_off(const char_u *base, const char_u *p)
return 0;
}
- /* Skip backwards over trailing bytes: 10xx.xxxx
- * Skip backwards again if on a composing char. */
+ // Skip backwards over trailing bytes: 10xx.xxxx
+ // Skip backwards again if on a composing char.
const char_u *q;
for (q = p;; --q) {
// Move s to the last byte of this char.
@@ -1883,6 +1882,40 @@ int mb_tail_off(char_u *base, char_u *p)
return i;
}
+
+/// Return the offset from "p" to the first byte of the character it points
+/// into. Can start anywhere in a stream of bytes.
+///
+/// @param[in] base Pointer to start of string
+/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
+//
+/// @return 0 if invalid sequence, else offset to previous codepoint
+int mb_head_off(char_u *base, char_u *p)
+{
+ int i;
+ int j;
+
+ if (*p == NUL) {
+ return 0;
+ }
+
+ // Find the first character that is not 10xx.xxxx
+ for (i = 0; p - i > base; i--) {
+ if ((p[i] & 0xc0) != 0x80) {
+ break;
+ }
+ }
+
+ // Find the last character that is 10xx.xxxx
+ for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {}
+
+ // Check for illegal sequence.
+ if (utf8len_tab[p[i]] == 1) {
+ return 0;
+ }
+ return i;
+}
+
/*
* Find the next illegal byte sequence.
*/
@@ -1915,8 +1948,8 @@ void utf_find_illegal(void)
}
while (*p != NUL) {
- /* Illegal means that there are not enough trail bytes (checked by
- * utf_ptr2len()) or too many of them (overlong sequence). */
+ // Illegal means that there are not enough trail bytes (checked by
+ // utf_ptr2len()) or too many of them (overlong sequence).
len = utf_ptr2len(p);
if (*p >= 0x80 && (len == 1
|| utf_char2len(utf_ptr2char(p)) != len)) {
@@ -2398,8 +2431,6 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen
#endif // HAVE_ICONV
-
-
/*
* Setup "vcp" for conversion from "from" to "to".
* The names must have been made canonical with enc_canonize().