aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c123
1 files changed, 57 insertions, 66 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0d468889a4..3a13aeddb8 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -48,6 +48,7 @@
#include "nvim/getchar.h"
#include "nvim/gettext.h"
#include "nvim/globals.h"
+#include "nvim/grid.h"
#include "nvim/grid_defs.h"
#include "nvim/iconv.h"
#include "nvim/keycodes.h"
@@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
return arabic_combine(utf_ptr2char(p1), c2);
}
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
///
-/// Also gets up to #MAX_MCO composing characters.
+/// Caller is expected to check for things like unprintable chars etc
+/// If first char in string is a composing char, prepend a space to display it correctly.
///
-/// @param[out] pcc Location where to store composing characters. Must have
-/// space at least for #MAX_MCO + 1 elements.
+/// If "p" starts with an invalid sequence, zero is returned.
///
-/// @return leading character.
-int utfc_ptr2char(const char *p, int *pcc)
+/// @param[out] firstc (required) The first codepoint of the screen char,
+/// or the first byte of an invalid sequence
+///
+/// @return the char
+schar_T utfc_ptr2schar(const char *p, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
- int i = 0;
-
int c = utf_ptr2char(p);
- int len = utf_ptr2len(p);
-
- // Only accept a composing char when the first char isn't illegal.
- if ((len > 1 || (uint8_t)(*p) < 0x80)
- && (uint8_t)p[len] >= 0x80
- && utf_composinglike(p, p + len)) {
- int cc = utf_ptr2char(p + len);
- while (true) {
- pcc[i++] = cc;
- if (i == MAX_MCO) {
- break;
- }
- len += utf_ptr2len(p + len);
- if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
- break;
- }
- }
- }
+ *firstc = c; // NOT optional, you are gonna need it
+ bool first_compose = utf_iscomposing(c);
+ size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+ size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
- if (i < MAX_MCO) { // last composing char must be 0
- pcc[i] = 0;
+ if (len == 1 && (uint8_t)(*p) >= 0x80) {
+ return 0; // invalid sequence
}
- return c;
+ return schar_from_buf_first(p, len, first_compose);
}
-// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
-// composing characters. Use no more than p[maxlen].
-//
-// @param [out] pcc: composing chars, last one is 0
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+/// Get the screen char at the beginning of a string with length
+///
+/// Like utfc_ptr2schar but use no more than p[maxlen].
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
assert(maxlen > 0);
- int i = 0;
+ size_t len = (size_t)utf_ptr2len_len(p, maxlen);
+ if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+ // invalid or truncated sequence
+ *firstc = (uint8_t)(*p);
+ return 0;
+ }
- int len = utf_ptr2len_len(p, maxlen);
- // Is it safe to use utf_ptr2char()?
- bool safe = len > 1 && len <= maxlen;
- int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
+ int c = utf_ptr2char(p);
+ *firstc = c;
+ bool first_compose = utf_iscomposing(c);
+ maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
+ len = (size_t)utfc_ptr2len_len(p, maxlen);
- // Only accept a composing char when the first char isn't illegal.
- if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
- for (; i < MAX_MCO; i++) {
- int len_cc = utf_ptr2len_len(p + len, maxlen - len);
- safe = len_cc > 1 && len_cc <= maxlen - len;
- if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
- || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
- break;
- }
- len += len_cc;
- }
- }
+ return schar_from_buf_first(p, len, first_compose);
+}
- if (i < MAX_MCO) {
- // last composing char must be 0
- pcc[i] = 0;
+/// Caller must ensure there is space for `first_compose`
+static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
+{
+ if (first_compose) {
+ char cbuf[MAX_SCHAR_SIZE];
+ cbuf[0] = ' ';
+ memcpy(cbuf + 1, buf, len);
+ return schar_from_buf(cbuf, len + 1);
+ } else {
+ return schar_from_buf(buf, len);
}
-
- return c;
-#undef ISCOMPOSING
}
/// Get the length of a UTF-8 byte sequence representing a single codepoint
@@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
return 1;
}
- // Check for composing characters. We can handle only the first six, but
- // skip all of them (otherwise the cursor would get stuck).
+ // Check for composing characters.
int prevlen = 0;
while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
/// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
///
/// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
//
-/// @return 0 if invalid sequence, else offset to previous codepoint
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
int utf_cp_head_off(const char *base, const char *p)
{
int i;
@@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
}
// Find the first character that is not 10xx.xxxx
- for (i = 0; p - i > base; i--) {
- if (((uint8_t)p[i] & 0xc0) != 0x80) {
+ for (i = 0; p - i >= base; i++) {
+ if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break;
}
}
- // Find the last character that is 10xx.xxxx
- for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {}
+ // Find the last character that is 10xx.xxxx (condition terminates on NUL)
+ int j = 1;
+ while (((uint8_t)p[j] & 0xc0) == 0x80) {
+ j++;
+ }
// Check for illegal sequence.
- if (utf8len_tab[(uint8_t)p[i]] == 1) {
+ if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0;
}
return i;