refactor(grid): make screen rendering more multibyte than ever before

Problem: buffer text with composing chars are converted from UTF-8 to an array of up to seven UTF-32 values and then converted back to UTF-8 strings. Solution: Convert buffer text directly to UTF-8 based schar_T values. The limit of the text size is now in schar_T bytes, which is currently 31+1 but easily could be raised as it no longer multiplies the size of the entire screen grid when not used, the full size is only required for temporary scratch buffers. Also does some general cleanup to win_line text handling, which was unnecessarily complicated due to multibyte rendering being an "opt-in" feature long ago. Nowadays, a char is just a char, regardless if it consists of one ASCII byte or multiple bytes.
author: bfredl <bjorn.linse@gmail.com> 2023-11-06 14:52:27 +0100
committer: bfredl <bjorn.linse@gmail.com> 2023-11-17 12:58:57 +0100
commit: b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254 (patch)
tree: 434ec27e069ba57406ce9f6d194627e95c3d315c /src/nvim/mbyte.c
parent: 20ec4c776a07492c2e3b995e10b40b1cdb52bc7a (diff)
download: rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.gz
rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.bz2
rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.zip
1 files changed, 57 insertions, 66 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0d468889a4..3a13aeddb8 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -48,6 +48,7 @@
 #include "nvim/getchar.h"
 #include "nvim/gettext.h"
 #include "nvim/globals.h"
+#include "nvim/grid.h"
 #include "nvim/grid_defs.h"
 #include "nvim/iconv.h"
 #include "nvim/keycodes.h"
@@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
   return arabic_combine(utf_ptr2char(p1), c2);
 }
 
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
 ///
-/// Also gets up to #MAX_MCO composing characters.
+/// Caller is expected to check for things like unprintable chars etc
+/// If first char in string is a composing char, prepend a space to display it correctly.
 ///
-/// @param[out]  pcc  Location where to store composing characters. Must have
-///                   space at least for #MAX_MCO + 1 elements.
+/// If "p" starts with an invalid sequence, zero is returned.
 ///
-/// @return leading character.
-int utfc_ptr2char(const char *p, int *pcc)
+/// @param[out] firstc (required) The first codepoint of the screen char,
+///                    or the first byte of an invalid sequence
+///
+/// @return the char
+schar_T utfc_ptr2schar(const char *p, int *firstc)
+  FUNC_ATTR_NONNULL_ALL
 {
-  int i = 0;
-
   int c = utf_ptr2char(p);
-  int len = utf_ptr2len(p);
-
-  // Only accept a composing char when the first char isn't illegal.
-  if ((len > 1 || (uint8_t)(*p) < 0x80)
-      && (uint8_t)p[len] >= 0x80
-      && utf_composinglike(p, p + len)) {
-    int cc = utf_ptr2char(p + len);
-    while (true) {
-      pcc[i++] = cc;
-      if (i == MAX_MCO) {
-        break;
-      }
-      len += utf_ptr2len(p + len);
-      if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
-        break;
-      }
-    }
-  }
+  *firstc = c;  // NOT optional, you are gonna need it
+  bool first_compose = utf_iscomposing(c);
+  size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+  size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
 
-  if (i < MAX_MCO) {    // last composing char must be 0
-    pcc[i] = 0;
+  if (len == 1 && (uint8_t)(*p) >= 0x80) {
+    return 0;  // invalid sequence
   }
 
-  return c;
+  return schar_from_buf_first(p, len, first_compose);
 }
 
-// Convert a UTF-8 byte string to a wide character.  Also get up to MAX_MCO
-// composing characters.  Use no more than p[maxlen].
-//
-// @param [out] pcc: composing chars, last one is 0
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+/// Get the screen char at the beginning of a string with length
+///
+/// Like utfc_ptr2schar but use no more than p[maxlen].
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+  FUNC_ATTR_NONNULL_ALL
 {
   assert(maxlen > 0);
 
-  int i = 0;
+  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
+  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+    // invalid or truncated sequence
+    *firstc = (uint8_t)(*p);
+    return 0;
+  }
 
-  int len = utf_ptr2len_len(p, maxlen);
-  // Is it safe to use utf_ptr2char()?
-  bool safe = len > 1 && len <= maxlen;
-  int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
+  int c = utf_ptr2char(p);
+  *firstc = c;
+  bool first_compose = utf_iscomposing(c);
+  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
+  len = (size_t)utfc_ptr2len_len(p, maxlen);
 
-  // Only accept a composing char when the first char isn't illegal.
-  if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
-    for (; i < MAX_MCO; i++) {
-      int len_cc = utf_ptr2len_len(p + len, maxlen - len);
-      safe = len_cc > 1 && len_cc <= maxlen - len;
-      if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
-          || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
-        break;
-      }
-      len += len_cc;
-    }
-  }
+  return schar_from_buf_first(p, len, first_compose);
+}
 
-  if (i < MAX_MCO) {
-    // last composing char must be 0
-    pcc[i] = 0;
+/// Caller must ensure there is space for `first_compose`
+static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
+{
+  if (first_compose) {
+    char cbuf[MAX_SCHAR_SIZE];
+    cbuf[0] = ' ';
+    memcpy(cbuf + 1, buf, len);
+    return schar_from_buf(cbuf, len + 1);
+  } else {
+    return schar_from_buf(buf, len);
   }
-
-  return c;
-#undef ISCOMPOSING
 }
 
 /// Get the length of a UTF-8 byte sequence representing a single codepoint
@@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
     return 1;
   }
 
-  // Check for composing characters.  We can handle only the first six, but
-  // skip all of them (otherwise the cursor would get stuck).
+  // Check for composing characters.
   int prevlen = 0;
   while (true) {
     if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
 /// Return the offset from "p" to the first byte of the codepoint it points
 /// to. Can start anywhere in a stream of bytes.
 /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
 ///
 /// @param[in] base  Pointer to start of string
 /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
 //
-/// @return 0 if invalid sequence, else offset to previous codepoint
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
 int utf_cp_head_off(const char *base, const char *p)
 {
   int i;
@@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
   }
 
   // Find the first character that is not 10xx.xxxx
-  for (i = 0; p - i > base; i--) {
-    if (((uint8_t)p[i] & 0xc0) != 0x80) {
+  for (i = 0; p - i >= base; i++) {
+    if (((uint8_t)p[-i] & 0xc0) != 0x80) {
       break;
     }
   }
 
-  // Find the last character that is 10xx.xxxx
-  for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {}
+  // Find the last character that is 10xx.xxxx (condition terminates on NUL)
+  int j = 1;
+  while (((uint8_t)p[j] & 0xc0) == 0x80) {
+    j++;
+  }
 
   // Check for illegal sequence.
-  if (utf8len_tab[(uint8_t)p[i]] == 1) {
+  if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
     return 0;
   }
   return i;
author	bfredl <bjorn.linse@gmail.com>	2023-11-06 14:52:27 +0100
committer	bfredl <bjorn.linse@gmail.com>	2023-11-17 12:58:57 +0100
commit	b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254 (patch)
tree	434ec27e069ba57406ce9f6d194627e95c3d315c /src/nvim/mbyte.c
parent	20ec4c776a07492c2e3b995e10b40b1cdb52bc7a (diff)
download	rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.gz rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.bz2 rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.zip