aboutsummaryrefslogtreecommitdiff
path: root/src/nvim
diff options
context:
space:
mode:
author: bfredl <bjorn.linse@gmail.com>, 2024-08-31 18:55:09 +0200
committer: GitHub <noreply@github.com>, 2024-08-31 18:55:09 +0200
commit: e1937286f04863cf1aa984c4b27a7502576e6c88 (patch)
tree: 117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim
parent: a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff)
parent: 26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (diff)
download: rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.gz
rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.bz2
rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.zip
Merge pull request #30042 from bfredl/bobbytables
refactor(multibyte): replace generated unicode tables with utf8proc
Diffstat (limited to 'src/nvim')
-rw-r--r-- src/nvim/CMakeLists.txt 14
-rw-r--r-- src/nvim/generators/gen_unicode_tables.lua 264
-rw-r--r-- src/nvim/mbyte.c 84
3 files changed, 50 insertions, 312 deletions
diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt
index 73aa508d9d..1b229c1d87 100644
--- a/src/nvim/CMakeLists.txt
+++ b/src/nvim/CMakeLists.txt
@@ -301,7 +301,6 @@ set(GENERATOR_DIR ${CMAKE_CURRENT_LIST_DIR}/generators)
set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval)
set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit)
set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime)
-set(UNICODE_DIR ${PROJECT_SOURCE_DIR}/src/unicode)
# GENERATOR_DIR
set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua)
@@ -316,7 +315,6 @@ set(GENERATOR_PRELOAD ${GENERATOR_DIR}/preload.lua)
set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua)
set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua)
set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua)
-set(UNICODE_TABLES_GENERATOR ${GENERATOR_DIR}/gen_unicode_tables.lua)
# GENERATED_DIR and GENERATED_INCLUDES_DIR
set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h)
@@ -333,7 +331,6 @@ set(GENERATED_OPTIONS_MAP ${GENERATED_DIR}/options_map.generated.h)
set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h)
set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h)
set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h)
-set(GENERATED_UNICODE_TABLES ${GENERATED_DIR}/unicode_tables.generated.h)
set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h)
set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h)
@@ -350,7 +347,6 @@ set(LUA_LOADER_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/loader.lua)
set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua)
set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua)
-file(GLOB UNICODE_FILES CONFIGURE_DEPENDS ${UNICODE_DIR}/*.txt)
file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h)
list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h)
file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h)
@@ -587,15 +583,6 @@ foreach(sfile ${NVIM_SOURCES}
endif()
endforeach()
-add_custom_command(OUTPUT ${GENERATED_UNICODE_TABLES}
- COMMAND ${LUA_PRG} ${UNICODE_TABLES_GENERATOR}
- ${UNICODE_DIR}
- ${GENERATED_UNICODE_TABLES}
- DEPENDS
- ${UNICODE_TABLES_GENERATOR}
- ${UNICODE_FILES}
-)
-
set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua)
configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA})
@@ -687,7 +674,6 @@ list(APPEND NVIM_GENERATED_FOR_SOURCES
"${GENERATED_EVENTS_NAMES_MAP}"
"${GENERATED_OPTIONS}"
"${GENERATED_OPTIONS_MAP}"
- "${GENERATED_UNICODE_TABLES}"
"${VIM_MODULE_FILE}"
"${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h"
)
diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua
deleted file mode 100644
index 01eb34be88..0000000000
--- a/src/nvim/generators/gen_unicode_tables.lua
+++ /dev/null
@@ -1,264 +0,0 @@
--- Script creates the following tables in unicode_tables.generated.h:
---
--- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed
--- intervals. Codepoints in these intervals have double (W or F) or ambiguous
--- (A) east asian width respectively.
--- 2. combining table: same as the above, but characters inside are combining
--- characters (i.e. have general category Mn or Me; Mc is intentionally left out).
--- 3. foldCase table used to convert characters to folded variants. (No code
--- in this script emits this table; only make_range accepts its step/add
--- format.) The first two values are a character range: as in the previous
--- tables, ranges are sorted and must be non-overlapping. The third value is
--- the step inside the range: e.g. if it is 2, the interval applies only to
--- the first, third, fifth, … character in the range. The fourth value is
--- the number that is added to a codepoint to yield its folded codepoint.
--- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed
--- intervals of Emoji characters. emoji_all has all Emojis; emoji_wide has those
--- from U+1F000 up, minus range ends lying in the ambiguous/doublewidth tables.
-if arg[1] == '--help' then
- print('Usage:')
- print(' gen_unicode_tables.lua unicode/ unicode_tables.generated.h')
- os.exit(0)
-end
-
-local basedir = arg[1]
-local pathsep = package.config:sub(1, 1)
-local get_path = function(fname)
- return basedir .. pathsep .. fname
-end
-
-local unicodedata_fname = get_path('UnicodeData.txt')
-local eastasianwidth_fname = get_path('EastAsianWidth.txt')
-local emoji_fname = get_path('emoji-data.txt')
-
-local utf_tables_fname = arg[2]
-
-local split_on_semicolons = function(s)
- local ret = {}
- local idx = 1
- while idx <= #s + 1 do
- local item = s:match('^[^;]*', idx)
- idx = idx + #item + 1
- if idx <= #s + 1 then
- assert(s:sub(idx - 1, idx - 1) == ';')
- end
- item = item:gsub('^%s*', '')
- item = item:gsub('%s*$', '')
- table.insert(ret, item)
- end
- return ret
-end
-
-local fp_lines_to_lists = function(fp, n, has_comments)
- local ret = {}
- local line
- local i = 0
- while true do
- i = i + 1
- line = fp:read('*l')
- if not line then
- break
- end
- if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then
- local l = split_on_semicolons(line)
- if #l ~= n then
- io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n))
- io.stderr:write('Line: ' .. line .. '\n')
- return nil
- end
- table.insert(ret, l)
- end
- end
- return ret
-end
-
-local parse_data_to_props = function(ud_fp)
- return fp_lines_to_lists(ud_fp, 15, false)
-end
-
-local parse_width_props = function(eaw_fp)
- return fp_lines_to_lists(eaw_fp, 2, true)
-end
-
-local parse_emoji_props = function(emoji_fp)
- return fp_lines_to_lists(emoji_fp, 2, true)
-end
-
-local make_range = function(start, end_, step, add)
- if step and add then
- return (' {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add)
- else
- return (' {0x%04x, 0x%04x},\n'):format(start, end_)
- end
-end
-
-local build_combining_table = function(ut_fp, dataprops)
- ut_fp:write('static const struct interval combining[] = {\n')
- local start = -1
- local end_ = -1
- for _, p in ipairs(dataprops) do
- -- The 'Mc' property was removed, it does take up space.
- if ({ Mn = true, Me = true })[p[3]] then
- local n = tonumber(p[1], 16)
- if start >= 0 and end_ + 1 == n then
- -- Continue with the same range.
- end_ = n
- else
- if start >= 0 then
- -- Produce previous range.
- ut_fp:write(make_range(start, end_))
- end
- start = n
- end_ = n
- end
- end
- end
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- end
- ut_fp:write('};\n')
-end
-
-local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name)
- ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
- local start = -1
- local end_ = -1
- local dataidx = 1
- local ret = {}
- for _, p in ipairs(widthprops) do
- if widths[p[2]:sub(1, 1)] then
- local rng_start, rng_end = p[1]:find('%.%.')
- local n, n_last
- if rng_start then
- -- It is a range. We don’t check for composing char then.
- n = tonumber(p[1]:sub(1, rng_start - 1), 16)
- n_last = tonumber(p[1]:sub(rng_end + 1), 16)
- else
- n = tonumber(p[1], 16)
- n_last = n
- end
- local dn
- while true do
- dn = tonumber(dataprops[dataidx][1], 16)
- if dn >= n then
- break
- end
- dataidx = dataidx + 1
- end
- if dn ~= n and n_last == n then
- io.stderr:write('Cannot find character ' .. n .. ' in data table.\n')
- end
- -- Only use the char when it’s not a composing char.
- -- But use all chars from a range.
- local dp = dataprops[dataidx]
- if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then
- if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542
- -- Continue with the same range.
- else
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- table.insert(ret, { start, end_ })
- end
- start = n
- end
- end_ = n_last
- end
- end
- end
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- table.insert(ret, { start, end_ })
- end
- ut_fp:write('};\n')
- return ret
-end
-
-local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth)
- local emojiwidth = {}
- local emoji = {}
- for _, p in ipairs(emojiprops) do
- if p[2]:match('Emoji%s+#') then
- local rng_start, rng_end = p[1]:find('%.%.')
- local n
- local n_last
- if rng_start then
- n = tonumber(p[1]:sub(1, rng_start - 1), 16)
- n_last = tonumber(p[1]:sub(rng_end + 1), 16)
- else
- n = tonumber(p[1], 16)
- n_last = n
- end
- if #emoji > 0 and n - 1 == emoji[#emoji][2] then
- emoji[#emoji][2] = n_last
- else
- table.insert(emoji, { n, n_last })
- end
-
- -- Characters below 1F000 may be considered single width traditionally,
- -- making them double width causes problems.
- if n >= 0x1f000 then
- -- exclude characters that are in the ambiguous/doublewidth table
- for _, ambi in ipairs(ambiwidth) do
- if n >= ambi[1] and n <= ambi[2] then
- n = ambi[2] + 1
- end
- if n_last >= ambi[1] and n_last <= ambi[2] then
- n_last = ambi[1] - 1
- end
- end
- for _, double in ipairs(doublewidth) do
- if n >= double[1] and n <= double[2] then
- n = double[2] + 1
- end
- if n_last >= double[1] and n_last <= double[2] then
- n_last = double[1] - 1
- end
- end
-
- if n <= n_last then
- if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then
- emojiwidth[#emojiwidth][2] = n_last
- else
- table.insert(emojiwidth, { n, n_last })
- end
- end
- end
- end
- end
-
- ut_fp:write('static const struct interval emoji_all[] = {\n')
- for _, p in ipairs(emoji) do
- ut_fp:write(make_range(p[1], p[2]))
- end
- ut_fp:write('};\n')
-
- ut_fp:write('static const struct interval emoji_wide[] = {\n')
- for _, p in ipairs(emojiwidth) do
- ut_fp:write(make_range(p[1], p[2]))
- end
- ut_fp:write('};\n')
-end
-
-local ud_fp = io.open(unicodedata_fname, 'r')
-local dataprops = parse_data_to_props(ud_fp)
-ud_fp:close()
-
-local ut_fp = io.open(utf_tables_fname, 'w')
-
-build_combining_table(ut_fp, dataprops)
-
-local eaw_fp = io.open(eastasianwidth_fname, 'r')
-local widthprops = parse_width_props(eaw_fp)
-eaw_fp:close()
-
-local doublewidth =
- build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth')
-local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous')
-
-local emoji_fp = io.open(emoji_fname, 'r')
-local emojiprops = parse_emoji_props(emoji_fp)
-emoji_fp:close()
-
-build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth)
-
-ut_fp:close()
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 666a904fc5..db4730408b 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -85,7 +85,6 @@ struct interval {
// uncrustify:off
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.c.generated.h"
-# include "unicode_tables.generated.h"
#endif
// uncrustify:on
@@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
return utf_class_tab(utf_ptr2char(p), chartab);
}
-// Return true if "c" is in "table".
-static bool intable(const struct interval *table, size_t n_items, int c)
- FUNC_ATTR_PURE
+static bool prop_is_emojilike(const utf8proc_property_t *prop)
{
- assert(n_items > 0);
- // first quick check for Latin1 etc. characters
- if (c < table[0].first) {
- return false;
- }
-
- assert(n_items <= SIZE_MAX / 2);
- // binary search in table
- size_t bot = 0;
- size_t top = n_items;
- do {
- size_t mid = (bot + top) >> 1;
- if (table[mid].last < c) {
- bot = mid + 1;
- } else if (table[mid].first > c) {
- top = mid;
- } else {
- return true;
- }
- } while (top > bot);
- return false;
+ return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+ || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
}
/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
@@ -496,13 +474,18 @@ int utf_char2cells(int c)
return n;
}
- if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+
+ if (prop->charwidth == 2) {
return 2;
}
- if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+ if (*p_ambw == 'd' && prop->ambiguous_width) {
return 2;
}
- if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+
+ // Characters below 1F000 may be considered single width traditionally,
+ // making them double width causes problems.
+ if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
return 2;
}
@@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in)
}
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji
- && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ && prop_is_emojilike(utf8proc_get_property(c))) {
int c2 = utf_ptr2char(p_in + len);
if (c2 == 0xFE0F) {
return 2; // emoji presentation
@@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size)
}
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji && size > len
- && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+ && prop_is_emojilike(utf8proc_get_property(c))
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
int c2 = utf_ptr2char(p + len);
if (c2 == 0xFE0F) {
@@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf)
/// Returns false for negative values.
bool utf_iscomposing_legacy(int c)
{
- return intable(combining, ARRAY_SIZE(combining), c);
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+ return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
}
#ifdef __SSE2__
@@ -1182,6 +1166,33 @@ bool utf_printable(int c)
#else
+// Return true if "c" is in "table".
+static bool intable(const struct interval *table, size_t n_items, int c)
+ FUNC_ATTR_PURE
+{
+ assert(n_items > 0);
+ // first quick check for Latin1 etc. characters
+ if (c < table[0].first) {
+ return false;
+ }
+
+ assert(n_items <= SIZE_MAX / 2);
+ // binary search in table
+ size_t bot = 0;
+ size_t top = n_items;
+ do {
+ size_t mid = (bot + top) >> 1;
+ if (table[mid].last < c) {
+ bot = mid + 1;
+ } else if (table[mid].first > c) {
+ top = mid;
+ } else {
+ return true;
+ }
+ } while (top > bot);
+ return false;
+}
+
// Return true for characters that can be displayed in a normal way.
// Only for characters of 0x100 and above!
bool utf_printable(int c)
@@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 1; // punctuation
}
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
// emoji
- if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ if (prop_is_emojilike(prop)) {
return 3;
}
@@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
bool utf_ambiguous_width(const char *p)
{
int c = utf_ptr2char(p);
- return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
- || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
+ if (c < 0x80) {
+ return false;
+ }
+
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+ return prop->ambiguous_width || prop_is_emojilike(prop);
}
// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses