diff options
| author | bfredl <bjorn.linse@gmail.com> | 2024-08-31 18:55:09 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-08-31 18:55:09 +0200 |
| commit | e1937286f04863cf1aa984c4b27a7502576e6c88 (patch) | |
| tree | 117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim | |
| parent | a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff) | |
| parent | 26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (diff) | |
| download | rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.gz rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.tar.bz2 rneovim-e1937286f04863cf1aa984c4b27a7502576e6c88.zip | |
Merge pull request #30042 from bfredl/bobbytables
refactor(multibyte): replace generated unicode tables with utf8proc
Diffstat (limited to 'src/nvim')
| -rw-r--r-- | src/nvim/CMakeLists.txt | 14 | ||||
| -rw-r--r-- | src/nvim/generators/gen_unicode_tables.lua | 264 | ||||
| -rw-r--r-- | src/nvim/mbyte.c | 84 |
3 files changed, 50 insertions, 312 deletions
diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt index 73aa508d9d..1b229c1d87 100644 --- a/src/nvim/CMakeLists.txt +++ b/src/nvim/CMakeLists.txt @@ -301,7 +301,6 @@ set(GENERATOR_DIR ${CMAKE_CURRENT_LIST_DIR}/generators) set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval) set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit) set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime) -set(UNICODE_DIR ${PROJECT_SOURCE_DIR}/src/unicode) # GENERATOR_DIR set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua) @@ -316,7 +315,6 @@ set(GENERATOR_PRELOAD ${GENERATOR_DIR}/preload.lua) set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua) set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua) set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua) -set(UNICODE_TABLES_GENERATOR ${GENERATOR_DIR}/gen_unicode_tables.lua) # GENERATED_DIR and GENERATED_INCLUDES_DIR set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h) @@ -333,7 +331,6 @@ set(GENERATED_OPTIONS_MAP ${GENERATED_DIR}/options_map.generated.h) set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h) set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h) set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h) -set(GENERATED_UNICODE_TABLES ${GENERATED_DIR}/unicode_tables.generated.h) set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h) set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h) @@ -350,7 +347,6 @@ set(LUA_LOADER_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/loader.lua) set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua) set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua) -file(GLOB UNICODE_FILES CONFIGURE_DEPENDS ${UNICODE_DIR}/*.txt) file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h) list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h) file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h) @@ -587,15 +583,6 @@ foreach(sfile ${NVIM_SOURCES} endif() endforeach() -add_custom_command(OUTPUT ${GENERATED_UNICODE_TABLES} - COMMAND ${LUA_PRG} ${UNICODE_TABLES_GENERATOR} - ${UNICODE_DIR} - ${GENERATED_UNICODE_TABLES} - DEPENDS - ${UNICODE_TABLES_GENERATOR} - ${UNICODE_FILES} -) - set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua) configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA}) @@ -687,7 +674,6 @@ list(APPEND NVIM_GENERATED_FOR_SOURCES "${GENERATED_EVENTS_NAMES_MAP}" "${GENERATED_OPTIONS}" "${GENERATED_OPTIONS_MAP}" - "${GENERATED_UNICODE_TABLES}" "${VIM_MODULE_FILE}" "${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h" ) diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua deleted file mode 100644 index 01eb34be88..0000000000 --- a/src/nvim/generators/gen_unicode_tables.lua +++ /dev/null @@ -1,264 +0,0 @@ --- Script creates the following tables in unicode_tables.generated.h: --- --- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed --- intervals. Codepoints in these intervals have double (W or F) or ambiguous --- (A) east asian width respectively. --- 2. combining table: same as the above, but characters inside are combining --- characters (i.e. have general categories equal to Mn, Mc or Me). --- 3. foldCase table used to convert characters to --- folded variants. In this table first two values are --- character ranges: like in previous tables they are sorted and must be --- non-overlapping. Third value means step inside the range: e.g. if it is --- 2 then interval applies only to first, third, fifth, … character in range. --- Fourth value is number that should be added to the codepoint to yield --- folded codepoint. --- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed --- intervals of Emoji characters. emoji_wide contains all the characters --- which don't have ambiguous or double width, and emoji_all has all Emojis. -if arg[1] == '--help' then - print('Usage:') - print(' gen_unicode_tables.lua unicode/ unicode_tables.generated.h') - os.exit(0) -end - -local basedir = arg[1] -local pathsep = package.config:sub(1, 1) -local get_path = function(fname) - return basedir .. pathsep .. fname -end - -local unicodedata_fname = get_path('UnicodeData.txt') -local eastasianwidth_fname = get_path('EastAsianWidth.txt') -local emoji_fname = get_path('emoji-data.txt') - -local utf_tables_fname = arg[2] - -local split_on_semicolons = function(s) - local ret = {} - local idx = 1 - while idx <= #s + 1 do - local item = s:match('^[^;]*', idx) - idx = idx + #item + 1 - if idx <= #s + 1 then - assert(s:sub(idx - 1, idx - 1) == ';') - end - item = item:gsub('^%s*', '') - item = item:gsub('%s*$', '') - table.insert(ret, item) - end - return ret -end - -local fp_lines_to_lists = function(fp, n, has_comments) - local ret = {} - local line - local i = 0 - while true do - i = i + 1 - line = fp:read('*l') - if not line then - break - end - if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then - local l = split_on_semicolons(line) - if #l ~= n then - io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n)) - io.stderr:write('Line: ' .. line .. '\n') - return nil - end - table.insert(ret, l) - end - end - return ret -end - -local parse_data_to_props = function(ud_fp) - return fp_lines_to_lists(ud_fp, 15, false) -end - -local parse_width_props = function(eaw_fp) - return fp_lines_to_lists(eaw_fp, 2, true) -end - -local parse_emoji_props = function(emoji_fp) - return fp_lines_to_lists(emoji_fp, 2, true) -end - -local make_range = function(start, end_, step, add) - if step and add then - return (' {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add) - else - return (' {0x%04x, 0x%04x},\n'):format(start, end_) - end -end - -local build_combining_table = function(ut_fp, dataprops) - ut_fp:write('static const struct interval combining[] = {\n') - local start = -1 - local end_ = -1 - for _, p in ipairs(dataprops) do - -- The 'Mc' property was removed, it does take up space. - if ({ Mn = true, Me = true })[p[3]] then - local n = tonumber(p[1], 16) - if start >= 0 and end_ + 1 == n then - -- Continue with the same range. - end_ = n - else - if start >= 0 then - -- Produce previous range. - ut_fp:write(make_range(start, end_)) - end - start = n - end_ = n - end - end - end - if start >= 0 then - ut_fp:write(make_range(start, end_)) - end - ut_fp:write('};\n') -end - -local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name) - ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n') - local start = -1 - local end_ = -1 - local dataidx = 1 - local ret = {} - for _, p in ipairs(widthprops) do - if widths[p[2]:sub(1, 1)] then - local rng_start, rng_end = p[1]:find('%.%.') - local n, n_last - if rng_start then - -- It is a range. We don’t check for composing char then. - n = tonumber(p[1]:sub(1, rng_start - 1), 16) - n_last = tonumber(p[1]:sub(rng_end + 1), 16) - else - n = tonumber(p[1], 16) - n_last = n - end - local dn - while true do - dn = tonumber(dataprops[dataidx][1], 16) - if dn >= n then - break - end - dataidx = dataidx + 1 - end - if dn ~= n and n_last == n then - io.stderr:write('Cannot find character ' .. n .. ' in data table.\n') - end - -- Only use the char when it’s not a composing char. - -- But use all chars from a range. - local dp = dataprops[dataidx] - if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then - if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542 - -- Continue with the same range. - else - if start >= 0 then - ut_fp:write(make_range(start, end_)) - table.insert(ret, { start, end_ }) - end - start = n - end - end_ = n_last - end - end - end - if start >= 0 then - ut_fp:write(make_range(start, end_)) - table.insert(ret, { start, end_ }) - end - ut_fp:write('};\n') - return ret -end - -local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth) - local emojiwidth = {} - local emoji = {} - for _, p in ipairs(emojiprops) do - if p[2]:match('Emoji%s+#') then - local rng_start, rng_end = p[1]:find('%.%.') - local n - local n_last - if rng_start then - n = tonumber(p[1]:sub(1, rng_start - 1), 16) - n_last = tonumber(p[1]:sub(rng_end + 1), 16) - else - n = tonumber(p[1], 16) - n_last = n - end - if #emoji > 0 and n - 1 == emoji[#emoji][2] then - emoji[#emoji][2] = n_last - else - table.insert(emoji, { n, n_last }) - end - - -- Characters below 1F000 may be considered single width traditionally, - -- making them double width causes problems. - if n >= 0x1f000 then - -- exclude characters that are in the ambiguous/doublewidth table - for _, ambi in ipairs(ambiwidth) do - if n >= ambi[1] and n <= ambi[2] then - n = ambi[2] + 1 - end - if n_last >= ambi[1] and n_last <= ambi[2] then - n_last = ambi[1] - 1 - end - end - for _, double in ipairs(doublewidth) do - if n >= double[1] and n <= double[2] then - n = double[2] + 1 - end - if n_last >= double[1] and n_last <= double[2] then - n_last = double[1] - 1 - end - end - - if n <= n_last then - if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then - emojiwidth[#emojiwidth][2] = n_last - else - table.insert(emojiwidth, { n, n_last }) - end - end - end - end - end - - ut_fp:write('static const struct interval emoji_all[] = {\n') - for _, p in ipairs(emoji) do - ut_fp:write(make_range(p[1], p[2])) - end - ut_fp:write('};\n') - - ut_fp:write('static const struct interval emoji_wide[] = {\n') - for _, p in ipairs(emojiwidth) do - ut_fp:write(make_range(p[1], p[2])) - end - ut_fp:write('};\n') -end - -local ud_fp = io.open(unicodedata_fname, 'r') -local dataprops = parse_data_to_props(ud_fp) -ud_fp:close() - -local ut_fp = io.open(utf_tables_fname, 'w') - -build_combining_table(ut_fp, dataprops) - -local eaw_fp = io.open(eastasianwidth_fname, 'r') -local widthprops = parse_width_props(eaw_fp) -eaw_fp:close() - -local doublewidth = - build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth') -local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous') - -local emoji_fp = io.open(emoji_fname, 'r') -local emojiprops = parse_emoji_props(emoji_fp) -emoji_fp:close() - -build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth) - -ut_fp:close() diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 666a904fc5..db4730408b 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -85,7 +85,6 @@ struct interval { // uncrustify:off #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.c.generated.h" -# include "unicode_tables.generated.h" #endif // uncrustify:on @@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab) return utf_class_tab(utf_ptr2char(p), chartab); } -// Return true if "c" is in "table". -static bool intable(const struct interval *table, size_t n_items, int c) - FUNC_ATTR_PURE +static bool prop_is_emojilike(const utf8proc_property_t *prop) { - assert(n_items > 0); - // first quick check for Latin1 etc. characters - if (c < table[0].first) { - return false; - } - - assert(n_items <= SIZE_MAX / 2); - // binary search in table - size_t bot = 0; - size_t top = n_items; - do { - size_t mid = (bot + top) >> 1; - if (table[mid].last < c) { - bot = mid + 1; - } else if (table[mid].first > c) { - top = mid; - } else { - return true; - } - } while (top > bot); - return false; + return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR; } /// For UTF-8 character "c" return 2 for a double-width character, 1 for others. @@ -496,13 +474,18 @@ int utf_char2cells(int c) return n; } - if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) { + const utf8proc_property_t *prop = utf8proc_get_property(c); + + if (prop->charwidth == 2) { return 2; } - if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) { + if (*p_ambw == 'd' && prop->ambiguous_width) { return 2; } - if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) { + + // Characters below 1F000 may be considered single width traditionally, + // making them double width causes problems. + if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) { return 2; } @@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in) } int cells = utf_char2cells(c); if (cells == 1 && p_emoji - && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + && prop_is_emojilike(utf8proc_get_property(c))) { int c2 = utf_ptr2char(p_in + len); if (c2 == 0xFE0F) { return 2; // emoji presentation @@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size) } int cells = utf_char2cells(c); if (cells == 1 && p_emoji && size > len - && intable(emoji_all, ARRAY_SIZE(emoji_all), c) + && prop_is_emojilike(utf8proc_get_property(c)) && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { int c2 = utf_ptr2char(p + len); if (c2 == 0xFE0F) { @@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf) /// Returns false for negative values. bool utf_iscomposing_legacy(int c) { - return intable(combining, ARRAY_SIZE(combining), c); + const utf8proc_property_t *prop = utf8proc_get_property(c); + return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME; } #ifdef __SSE2__ @@ -1182,6 +1166,33 @@ bool utf_printable(int c) #else +// Return true if "c" is in "table". +static bool intable(const struct interval *table, size_t n_items, int c) + FUNC_ATTR_PURE +{ + assert(n_items > 0); + // first quick check for Latin1 etc. characters + if (c < table[0].first) { + return false; + } + + assert(n_items <= SIZE_MAX / 2); + // binary search in table + size_t bot = 0; + size_t top = n_items; + do { + size_t mid = (bot + top) >> 1; + if (table[mid].last < c) { + bot = mid + 1; + } else if (table[mid].first > c) { + top = mid; + } else { + return true; + } + } while (top > bot); + return false; +} + // Return true for characters that can be displayed in a normal way. // Only for characters of 0x100 and above! bool utf_printable(int c) @@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 1; // punctuation } + const utf8proc_property_t *prop = utf8proc_get_property(c); // emoji - if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + if (prop_is_emojilike(prop)) { return 3; } @@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab) bool utf_ambiguous_width(const char *p) { int c = utf_ptr2char(p); - return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) - || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); + if (c < 0x80) { + return false; + } + + const utf8proc_property_t *prop = utf8proc_get_property(c); + return prop->ambiguous_width || prop_is_emojilike(prop); } // Return the folded-case equivalent of "a", which is a UCS-4 character. Uses |