Merge pull request #30042 from bfredl/bobbytables

refactor(multibyte): replace generated unicode tables with utf8proc
author: bfredl <bjorn.linse@gmail.com> 2024-08-31 18:55:09 +0200
committer: GitHub <noreply@github.com> 2024-08-31 18:55:09 +0200
commit: e1937286f04863cf1aa984c4b27a7502576e6c88 (patch)
tree: 117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim
parent: a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff)
parent: 26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (diff)
download: rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.gz
rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.bz2
rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.zip
3 files changed, 50 insertions, 312 deletions
diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt
index 73aa508d9d..1b229c1d87 100644
--- a/src/nvim/CMakeLists.txt
+++ b/src/nvim/CMakeLists.txt
@@ -301,7 +301,6 @@ set(GENERATOR_DIR ${CMAKE_CURRENT_LIST_DIR}/generators)
 set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval)
 set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit)
 set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime)
-set(UNICODE_DIR ${PROJECT_SOURCE_DIR}/src/unicode)
 
 # GENERATOR_DIR
 set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua)
@@ -316,7 +315,6 @@ set(GENERATOR_PRELOAD ${GENERATOR_DIR}/preload.lua)
 set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua)
 set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua)
 set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua)
-set(UNICODE_TABLES_GENERATOR ${GENERATOR_DIR}/gen_unicode_tables.lua)
 
 # GENERATED_DIR and GENERATED_INCLUDES_DIR
 set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h)
@@ -333,7 +331,6 @@ set(GENERATED_OPTIONS_MAP ${GENERATED_DIR}/options_map.generated.h)
 set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h)
 set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h)
 set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h)
-set(GENERATED_UNICODE_TABLES ${GENERATED_DIR}/unicode_tables.generated.h)
 set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h)
 set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h)
 
@@ -350,7 +347,6 @@ set(LUA_LOADER_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/loader.lua)
 set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua)
 set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua)
 
-file(GLOB UNICODE_FILES CONFIGURE_DEPENDS ${UNICODE_DIR}/*.txt)
 file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h)
 list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h)
 file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h)
@@ -587,15 +583,6 @@ foreach(sfile ${NVIM_SOURCES}
   endif()
 endforeach()
 
-add_custom_command(OUTPUT ${GENERATED_UNICODE_TABLES}
-  COMMAND ${LUA_PRG} ${UNICODE_TABLES_GENERATOR}
-                     ${UNICODE_DIR}
-                     ${GENERATED_UNICODE_TABLES}
-  DEPENDS
-    ${UNICODE_TABLES_GENERATOR}
-    ${UNICODE_FILES}
-)
-
 set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua)
 configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA})
 
@@ -687,7 +674,6 @@ list(APPEND NVIM_GENERATED_FOR_SOURCES
   "${GENERATED_EVENTS_NAMES_MAP}"
   "${GENERATED_OPTIONS}"
   "${GENERATED_OPTIONS_MAP}"
-  "${GENERATED_UNICODE_TABLES}"
   "${VIM_MODULE_FILE}"
   "${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h"
 )
diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua
deleted file mode 100644
index 01eb34be88..0000000000
--- a/src/nvim/generators/gen_unicode_tables.lua
+++ /dev/null
@@ -1,264 +0,0 @@
--- Script creates the following tables in unicode_tables.generated.h:
---
--- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed
---    intervals. Codepoints in these intervals have double (W or F) or ambiguous
---    (A) east asian width respectively.
--- 2. combining table: same as the above, but characters inside are combining
---    characters (i.e. have general categories equal to Mn, Mc or Me).
--- 3. foldCase table used to convert characters to
---    folded variants. In this table first two values are
---    character ranges: like in previous tables they are sorted and must be
---    non-overlapping. Third value means step inside the range: e.g. if it is
---    2 then interval applies only to first, third, fifth, … character in range.
---    Fourth value is number that should be added to the codepoint to yield
---    folded codepoint.
--- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed
---    intervals of Emoji characters.  emoji_wide contains all the characters
---    which don't have ambiguous or double width, and emoji_all has all Emojis.
-if arg[1] == '--help' then
-  print('Usage:')
-  print('  gen_unicode_tables.lua unicode/ unicode_tables.generated.h')
-  os.exit(0)
-end
-
-local basedir = arg[1]
-local pathsep = package.config:sub(1, 1)
-local get_path = function(fname)
-  return basedir .. pathsep .. fname
-end
-
-local unicodedata_fname = get_path('UnicodeData.txt')
-local eastasianwidth_fname = get_path('EastAsianWidth.txt')
-local emoji_fname = get_path('emoji-data.txt')
-
-local utf_tables_fname = arg[2]
-
-local split_on_semicolons = function(s)
-  local ret = {}
-  local idx = 1
-  while idx <= #s + 1 do
-    local item = s:match('^[^;]*', idx)
-    idx = idx + #item + 1
-    if idx <= #s + 1 then
-      assert(s:sub(idx - 1, idx - 1) == ';')
-    end
-    item = item:gsub('^%s*', '')
-    item = item:gsub('%s*$', '')
-    table.insert(ret, item)
-  end
-  return ret
-end
-
-local fp_lines_to_lists = function(fp, n, has_comments)
-  local ret = {}
-  local line
-  local i = 0
-  while true do
-    i = i + 1
-    line = fp:read('*l')
-    if not line then
-      break
-    end
-    if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then
-      local l = split_on_semicolons(line)
-      if #l ~= n then
-        io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n))
-        io.stderr:write('Line: ' .. line .. '\n')
-        return nil
-      end
-      table.insert(ret, l)
-    end
-  end
-  return ret
-end
-
-local parse_data_to_props = function(ud_fp)
-  return fp_lines_to_lists(ud_fp, 15, false)
-end
-
-local parse_width_props = function(eaw_fp)
-  return fp_lines_to_lists(eaw_fp, 2, true)
-end
-
-local parse_emoji_props = function(emoji_fp)
-  return fp_lines_to_lists(emoji_fp, 2, true)
-end
-
-local make_range = function(start, end_, step, add)
-  if step and add then
-    return ('  {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add)
-  else
-    return ('  {0x%04x, 0x%04x},\n'):format(start, end_)
-  end
-end
-
-local build_combining_table = function(ut_fp, dataprops)
-  ut_fp:write('static const struct interval combining[] = {\n')
-  local start = -1
-  local end_ = -1
-  for _, p in ipairs(dataprops) do
-    -- The 'Mc' property was removed, it does take up space.
-    if ({ Mn = true, Me = true })[p[3]] then
-      local n = tonumber(p[1], 16)
-      if start >= 0 and end_ + 1 == n then
-        -- Continue with the same range.
-        end_ = n
-      else
-        if start >= 0 then
-          -- Produce previous range.
-          ut_fp:write(make_range(start, end_))
-        end
-        start = n
-        end_ = n
-      end
-    end
-  end
-  if start >= 0 then
-    ut_fp:write(make_range(start, end_))
-  end
-  ut_fp:write('};\n')
-end
-
-local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name)
-  ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
-  local start = -1
-  local end_ = -1
-  local dataidx = 1
-  local ret = {}
-  for _, p in ipairs(widthprops) do
-    if widths[p[2]:sub(1, 1)] then
-      local rng_start, rng_end = p[1]:find('%.%.')
-      local n, n_last
-      if rng_start then
-        -- It is a range. We don’t check for composing char then.
-        n = tonumber(p[1]:sub(1, rng_start - 1), 16)
-        n_last = tonumber(p[1]:sub(rng_end + 1), 16)
-      else
-        n = tonumber(p[1], 16)
-        n_last = n
-      end
-      local dn
-      while true do
-        dn = tonumber(dataprops[dataidx][1], 16)
-        if dn >= n then
-          break
-        end
-        dataidx = dataidx + 1
-      end
-      if dn ~= n and n_last == n then
-        io.stderr:write('Cannot find character ' .. n .. ' in data table.\n')
-      end
-      -- Only use the char when it’s not a composing char.
-      -- But use all chars from a range.
-      local dp = dataprops[dataidx]
-      if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then
-        if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542
-          -- Continue with the same range.
-        else
-          if start >= 0 then
-            ut_fp:write(make_range(start, end_))
-            table.insert(ret, { start, end_ })
-          end
-          start = n
-        end
-        end_ = n_last
-      end
-    end
-  end
-  if start >= 0 then
-    ut_fp:write(make_range(start, end_))
-    table.insert(ret, { start, end_ })
-  end
-  ut_fp:write('};\n')
-  return ret
-end
-
-local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth)
-  local emojiwidth = {}
-  local emoji = {}
-  for _, p in ipairs(emojiprops) do
-    if p[2]:match('Emoji%s+#') then
-      local rng_start, rng_end = p[1]:find('%.%.')
-      local n
-      local n_last
-      if rng_start then
-        n = tonumber(p[1]:sub(1, rng_start - 1), 16)
-        n_last = tonumber(p[1]:sub(rng_end + 1), 16)
-      else
-        n = tonumber(p[1], 16)
-        n_last = n
-      end
-      if #emoji > 0 and n - 1 == emoji[#emoji][2] then
-        emoji[#emoji][2] = n_last
-      else
-        table.insert(emoji, { n, n_last })
-      end
-
-      -- Characters below 1F000 may be considered single width traditionally,
-      -- making them double width causes problems.
-      if n >= 0x1f000 then
-        -- exclude characters that are in the ambiguous/doublewidth table
-        for _, ambi in ipairs(ambiwidth) do
-          if n >= ambi[1] and n <= ambi[2] then
-            n = ambi[2] + 1
-          end
-          if n_last >= ambi[1] and n_last <= ambi[2] then
-            n_last = ambi[1] - 1
-          end
-        end
-        for _, double in ipairs(doublewidth) do
-          if n >= double[1] and n <= double[2] then
-            n = double[2] + 1
-          end
-          if n_last >= double[1] and n_last <= double[2] then
-            n_last = double[1] - 1
-          end
-        end
-
-        if n <= n_last then
-          if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then
-            emojiwidth[#emojiwidth][2] = n_last
-          else
-            table.insert(emojiwidth, { n, n_last })
-          end
-        end
-      end
-    end
-  end
-
-  ut_fp:write('static const struct interval emoji_all[] = {\n')
-  for _, p in ipairs(emoji) do
-    ut_fp:write(make_range(p[1], p[2]))
-  end
-  ut_fp:write('};\n')
-
-  ut_fp:write('static const struct interval emoji_wide[] = {\n')
-  for _, p in ipairs(emojiwidth) do
-    ut_fp:write(make_range(p[1], p[2]))
-  end
-  ut_fp:write('};\n')
-end
-
-local ud_fp = io.open(unicodedata_fname, 'r')
-local dataprops = parse_data_to_props(ud_fp)
-ud_fp:close()
-
-local ut_fp = io.open(utf_tables_fname, 'w')
-
-build_combining_table(ut_fp, dataprops)
-
-local eaw_fp = io.open(eastasianwidth_fname, 'r')
-local widthprops = parse_width_props(eaw_fp)
-eaw_fp:close()
-
-local doublewidth =
-  build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth')
-local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous')
-
-local emoji_fp = io.open(emoji_fname, 'r')
-local emojiprops = parse_emoji_props(emoji_fp)
-emoji_fp:close()
-
-build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth)
-
-ut_fp:close()
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 666a904fc5..db4730408b 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -85,7 +85,6 @@ struct interval {
 // uncrustify:off
 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "mbyte.c.generated.h"
-# include "unicode_tables.generated.h"
 #endif
 // uncrustify:on
 
@@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
   return utf_class_tab(utf_ptr2char(p), chartab);
 }
 
-// Return true if "c" is in "table".
-static bool intable(const struct interval *table, size_t n_items, int c)
-  FUNC_ATTR_PURE
+static bool prop_is_emojilike(const utf8proc_property_t *prop)
 {
-  assert(n_items > 0);
-  // first quick check for Latin1 etc. characters
-  if (c < table[0].first) {
-    return false;
-  }
-
-  assert(n_items <= SIZE_MAX / 2);
-  // binary search in table
-  size_t bot = 0;
-  size_t top = n_items;
-  do {
-    size_t mid = (bot + top) >> 1;
-    if (table[mid].last < c) {
-      bot = mid + 1;
-    } else if (table[mid].first > c) {
-      top = mid;
-    } else {
-      return true;
-    }
-  } while (top > bot);
-  return false;
+  return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+         || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
 }
 
 /// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
@@ -496,13 +474,18 @@ int utf_char2cells(int c)
     return n;
   }
 
-  if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+
+  if (prop->charwidth == 2) {
     return 2;
   }
-  if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+  if (*p_ambw == 'd' && prop->ambiguous_width) {
     return 2;
   }
-  if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+
+  // Characters below 1F000 may be considered single width traditionally,
+  // making them double width causes problems.
+  if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
     return 2;
   }
 
@@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in)
     }
     int cells = utf_char2cells(c);
     if (cells == 1 && p_emoji
-        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+        && prop_is_emojilike(utf8proc_get_property(c))) {
       int c2 = utf_ptr2char(p_in + len);
       if (c2 == 0xFE0F) {
         return 2;  // emoji presentation
@@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size)
     }
     int cells = utf_char2cells(c);
     if (cells == 1 && p_emoji && size > len
-        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+        && prop_is_emojilike(utf8proc_get_property(c))
         && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
       int c2 = utf_ptr2char(p + len);
       if (c2 == 0xFE0F) {
@@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf)
 /// Returns false for negative values.
 bool utf_iscomposing_legacy(int c)
 {
-  return intable(combining, ARRAY_SIZE(combining), c);
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+  return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
 }
 
 #ifdef __SSE2__
@@ -1182,6 +1166,33 @@ bool utf_printable(int c)
 
 #else
 
+// Return true if "c" is in "table".
+static bool intable(const struct interval *table, size_t n_items, int c)
+  FUNC_ATTR_PURE
+{
+  assert(n_items > 0);
+  // first quick check for Latin1 etc. characters
+  if (c < table[0].first) {
+    return false;
+  }
+
+  assert(n_items <= SIZE_MAX / 2);
+  // binary search in table
+  size_t bot = 0;
+  size_t top = n_items;
+  do {
+    size_t mid = (bot + top) >> 1;
+    if (table[mid].last < c) {
+      bot = mid + 1;
+    } else if (table[mid].first > c) {
+      top = mid;
+    } else {
+      return true;
+    }
+  } while (top > bot);
+  return false;
+}
+
 // Return true for characters that can be displayed in a normal way.
 // Only for characters of 0x100 and above!
 bool utf_printable(int c)
@@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
     return 1;               // punctuation
   }
 
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
   // emoji
-  if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+  if (prop_is_emojilike(prop)) {
     return 3;
   }
 
@@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
 bool utf_ambiguous_width(const char *p)
 {
   int c = utf_ptr2char(p);
-  return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
-                       || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
+  if (c < 0x80) {
+    return false;
+  }
+
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+  return prop->ambiguous_width || prop_is_emojilike(prop);
 }
 
 // Return the folded-case equivalent of "a", which is a UCS-4 character.  Uses
author	bfredl <bjorn.linse@gmail.com>	2024-08-31 18:55:09 +0200
committer	GitHub <noreply@github.com>	2024-08-31 18:55:09 +0200
commit	e1937286f04863cf1aa984c4b27a7502576e6c88 (patch)
tree	117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim
parent	a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff)
parent	26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (diff)
download	rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.gz rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.bz2 rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.zip