diff options
| author | bfredl <bjorn.linse@gmail.com> | 2024-08-14 10:10:54 +0200 |
|---|---|---|
| committer | bfredl <bjorn.linse@gmail.com> | 2024-08-31 18:09:38 +0200 |
| commit | 26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (patch) | |
| tree | 117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim | |
| parent | a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff) | |
| download | rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.gz rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.bz2 rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.zip | |
refactor(multibyte): replace generated unicode tables with utf8proc
This commit intentionally aims at preserving existing behavior as much
as possible while replacing our build step to convert unicode data
files into binary tables, with corresponding lookups in utf8proc.
Actual improvements in behavior will be a followup.
The only change in behavior is that 'emoji' option will turn some
more codepoints into double width. Nvim used the "Emoji" and
"Emoji_Presentation" properties to define emojis, while utf8proc
only exposes the Extended_Pictographic property from the emoji table.
This is a superset of the previous emoji properties. As only
codepoints above 0x1f000 are affected by the 'emoji' option, this means
that the following chars are now treated as double-width, instead of
single-width like in previous nvim versions:
๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ก ๐ข ๐ฃ ๐ค
๐ฅ ๐ฆ ๐ง ๐จ ๐ฉ ๐ช ๐ซ ๐ฐ ๐ฑ ๐ฒ ๐ณ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐บ ๐ป ๐ผ ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ก ๐ข ๐ฃ ๐ค ๐ฅ ๐ฆ ๐ง ๐จ ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ ๐ฎ ๐ฏ ๐ฐ
๐ฑ ๐ฒ ๐ณ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐บ ๐ป ๐ผ ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐
๐ก ๐ข ๐ฃ ๐ค ๐ฅ ๐ฆ ๐ง ๐จ ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ ๐ฎ ๐ฑ ๐ฒ ๐ณ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐บ ๐ป ๐ผ ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐ ๐ ๐
๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ก ๐ข ๐ฃ ๐ค ๐ฅ ๐ฆ ๐ง ๐จ ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ
๐ฎ ๐ฏ ๐ฐ ๐ฑ ๐ฒ ๐ณ ๐ด ๐ต ๐ ๐ ๐ ๐ฏ ๐
ฌ ๐
ญ ๐
ฎ ๐
ฏ ๐ญ ๐ข ๐ฃ ๐ ๐ ๐ ๐ ๐ ๐ฑ ๐ฒ ๐ถ ๐พ ๐ ๐ ๐ ๐ ๐จ ๐ฉ ๐ช ๐ซ
๐ฌ ๐ญ ๐ฎ ๐ฑ ๐ฒ ๐ป ๐ผ ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ก
๐ข ๐ฃ ๐ฆ ๐ง ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ ๐ฎ ๐ฏ ๐ฐ ๐ณ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐บ ๐ป ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ ๐ข ๐ค ๐ฅ ๐ฆ ๐ง ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ ๐ฎ ๐ฐ ๐ฑ ๐ฒ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐ ๐ ๐ ๐
๐ ๐ ๐ ๐ฆ ๐ง ๐จ ๐ช ๐ฑ ๐ฒ ๐ด ๐ต ๐ถ ๐ป ๐ผ ๐ฝ ๐พ ๐ฟ ๐ ๐ ๐ ๐ ๐ ๐ขฐ ๐ขฑ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ
๐จ ๐จ ๐จ ๐จ ๐จ ๐จ
๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จ ๐จก ๐จข ๐จฃ ๐จค ๐จฅ ๐จฆ ๐จง ๐จจ ๐จฉ ๐จช ๐จซ ๐จฌ ๐จญ ๐จฎ ๐จฏ
๐จฐ ๐จฑ ๐จฒ ๐จณ ๐จด ๐จต ๐จถ ๐จท ๐จธ ๐จน ๐จบ ๐จป ๐จผ ๐จฝ ๐จพ ๐จฟ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ
๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ ๐ฉ
๐ฉ ๐ฉก ๐ฉข ๐ฉฃ ๐ฉค ๐ฉฅ ๐ฉฆ ๐ฉง ๐ฉจ ๐ฉฉ ๐ฉช ๐ฉซ ๐ฉฌ ๐ฉญ
Diffstat (limited to 'src/nvim')
| -rw-r--r-- | src/nvim/CMakeLists.txt | 14 | ||||
| -rw-r--r-- | src/nvim/generators/gen_unicode_tables.lua | 264 | ||||
| -rw-r--r-- | src/nvim/mbyte.c | 84 |
3 files changed, 50 insertions, 312 deletions
diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt index 73aa508d9d..1b229c1d87 100644 --- a/src/nvim/CMakeLists.txt +++ b/src/nvim/CMakeLists.txt @@ -301,7 +301,6 @@ set(GENERATOR_DIR ${CMAKE_CURRENT_LIST_DIR}/generators) set(GEN_EVAL_TOUCH ${TOUCHES_DIR}/gen_doc_eval) set(LUAJIT_RUNTIME_DIR ${DEPS_PREFIX}/share/luajit-2.1/jit) set(NVIM_RUNTIME_DIR ${PROJECT_SOURCE_DIR}/runtime) -set(UNICODE_DIR ${PROJECT_SOURCE_DIR}/src/unicode) # GENERATOR_DIR set(API_DISPATCH_GENERATOR ${GENERATOR_DIR}/gen_api_dispatch.lua) @@ -316,7 +315,6 @@ set(GENERATOR_PRELOAD ${GENERATOR_DIR}/preload.lua) set(HEADER_GENERATOR ${GENERATOR_DIR}/gen_declarations.lua) set(OPTIONS_ENUM_GENERATOR ${GENERATOR_DIR}/gen_options_enum.lua) set(OPTIONS_GENERATOR ${GENERATOR_DIR}/gen_options.lua) -set(UNICODE_TABLES_GENERATOR ${GENERATOR_DIR}/gen_unicode_tables.lua) # GENERATED_DIR and GENERATED_INCLUDES_DIR set(GENERATED_API_DISPATCH ${GENERATED_DIR}/api/private/dispatch_wrappers.generated.h) @@ -333,7 +331,6 @@ set(GENERATED_OPTIONS_MAP ${GENERATED_DIR}/options_map.generated.h) set(GENERATED_UI_EVENTS_CALL ${GENERATED_DIR}/ui_events_call.generated.h) set(GENERATED_UI_EVENTS_CLIENT ${GENERATED_DIR}/ui_events_client.generated.h) set(GENERATED_UI_EVENTS_REMOTE ${GENERATED_DIR}/ui_events_remote.generated.h) -set(GENERATED_UNICODE_TABLES ${GENERATED_DIR}/unicode_tables.generated.h) set(LUA_API_C_BINDINGS ${GENERATED_DIR}/lua_api_c_bindings.generated.h) set(VIM_MODULE_FILE ${GENERATED_DIR}/lua/vim_module.generated.h) @@ -350,7 +347,6 @@ set(LUA_LOADER_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/loader.lua) set(LUA_OPTIONS_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/_options.lua) set(LUA_SHARED_MODULE_SOURCE ${NVIM_RUNTIME_DIR}/lua/vim/shared.lua) -file(GLOB UNICODE_FILES CONFIGURE_DEPENDS ${UNICODE_DIR}/*.txt) file(GLOB API_HEADERS CONFIGURE_DEPENDS api/*.h) list(REMOVE_ITEM API_HEADERS ${CMAKE_CURRENT_LIST_DIR}/api/ui_events.in.h) file(GLOB MSGPACK_RPC_HEADERS CONFIGURE_DEPENDS msgpack_rpc/*.h) 
@@ -587,15 +583,6 @@ foreach(sfile ${NVIM_SOURCES} endif() endforeach() -add_custom_command(OUTPUT ${GENERATED_UNICODE_TABLES} - COMMAND ${LUA_PRG} ${UNICODE_TABLES_GENERATOR} - ${UNICODE_DIR} - ${GENERATED_UNICODE_TABLES} - DEPENDS - ${UNICODE_TABLES_GENERATOR} - ${UNICODE_FILES} -) - set(NVIM_VERSION_LUA ${PROJECT_BINARY_DIR}/nvim_version.lua) configure_file(${GENERATOR_DIR}/nvim_version.lua.in ${NVIM_VERSION_LUA}) @@ -687,7 +674,6 @@ list(APPEND NVIM_GENERATED_FOR_SOURCES "${GENERATED_EVENTS_NAMES_MAP}" "${GENERATED_OPTIONS}" "${GENERATED_OPTIONS_MAP}" - "${GENERATED_UNICODE_TABLES}" "${VIM_MODULE_FILE}" "${PROJECT_BINARY_DIR}/cmake.config/auto/pathdef.h" ) diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua deleted file mode 100644 index 01eb34be88..0000000000 --- a/src/nvim/generators/gen_unicode_tables.lua +++ /dev/null @@ -1,264 +0,0 @@ --- Script creates the following tables in unicode_tables.generated.h: --- --- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed --- intervals. Codepoints in these intervals have double (W or F) or ambiguous --- (A) east asian width respectively. --- 2. combining table: same as the above, but characters inside are combining --- characters (i.e. have general categories equal to Mn, Mc or Me). --- 3. foldCase table used to convert characters to --- folded variants. In this table first two values are --- character ranges: like in previous tables they are sorted and must be --- non-overlapping. Third value means step inside the range: e.g. if it is --- 2 then interval applies only to first, third, fifth, โฆ character in range. --- Fourth value is number that should be added to the codepoint to yield --- folded codepoint. --- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed --- intervals of Emoji characters. emoji_wide contains all the characters --- which don't have ambiguous or double width, and emoji_all has all Emojis. 
-if arg[1] == '--help' then - print('Usage:') - print(' gen_unicode_tables.lua unicode/ unicode_tables.generated.h') - os.exit(0) -end - -local basedir = arg[1] -local pathsep = package.config:sub(1, 1) -local get_path = function(fname) - return basedir .. pathsep .. fname -end - -local unicodedata_fname = get_path('UnicodeData.txt') -local eastasianwidth_fname = get_path('EastAsianWidth.txt') -local emoji_fname = get_path('emoji-data.txt') - -local utf_tables_fname = arg[2] - -local split_on_semicolons = function(s) - local ret = {} - local idx = 1 - while idx <= #s + 1 do - local item = s:match('^[^;]*', idx) - idx = idx + #item + 1 - if idx <= #s + 1 then - assert(s:sub(idx - 1, idx - 1) == ';') - end - item = item:gsub('^%s*', '') - item = item:gsub('%s*$', '') - table.insert(ret, item) - end - return ret -end - -local fp_lines_to_lists = function(fp, n, has_comments) - local ret = {} - local line - local i = 0 - while true do - i = i + 1 - line = fp:read('*l') - if not line then - break - end - if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then - local l = split_on_semicolons(line) - if #l ~= n then - io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n)) - io.stderr:write('Line: ' .. line .. 
'\n') - return nil - end - table.insert(ret, l) - end - end - return ret -end - -local parse_data_to_props = function(ud_fp) - return fp_lines_to_lists(ud_fp, 15, false) -end - -local parse_width_props = function(eaw_fp) - return fp_lines_to_lists(eaw_fp, 2, true) -end - -local parse_emoji_props = function(emoji_fp) - return fp_lines_to_lists(emoji_fp, 2, true) -end - -local make_range = function(start, end_, step, add) - if step and add then - return (' {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add) - else - return (' {0x%04x, 0x%04x},\n'):format(start, end_) - end -end - -local build_combining_table = function(ut_fp, dataprops) - ut_fp:write('static const struct interval combining[] = {\n') - local start = -1 - local end_ = -1 - for _, p in ipairs(dataprops) do - -- The 'Mc' property was removed, it does take up space. - if ({ Mn = true, Me = true })[p[3]] then - local n = tonumber(p[1], 16) - if start >= 0 and end_ + 1 == n then - -- Continue with the same range. - end_ = n - else - if start >= 0 then - -- Produce previous range. - ut_fp:write(make_range(start, end_)) - end - start = n - end_ = n - end - end - end - if start >= 0 then - ut_fp:write(make_range(start, end_)) - end - ut_fp:write('};\n') -end - -local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name) - ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n') - local start = -1 - local end_ = -1 - local dataidx = 1 - local ret = {} - for _, p in ipairs(widthprops) do - if widths[p[2]:sub(1, 1)] then - local rng_start, rng_end = p[1]:find('%.%.') - local n, n_last - if rng_start then - -- It is a range. We donโt check for composing char then. 
- n = tonumber(p[1]:sub(1, rng_start - 1), 16) - n_last = tonumber(p[1]:sub(rng_end + 1), 16) - else - n = tonumber(p[1], 16) - n_last = n - end - local dn - while true do - dn = tonumber(dataprops[dataidx][1], 16) - if dn >= n then - break - end - dataidx = dataidx + 1 - end - if dn ~= n and n_last == n then - io.stderr:write('Cannot find character ' .. n .. ' in data table.\n') - end - -- Only use the char when itโs not a composing char. - -- But use all chars from a range. - local dp = dataprops[dataidx] - if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then - if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542 - -- Continue with the same range. - else - if start >= 0 then - ut_fp:write(make_range(start, end_)) - table.insert(ret, { start, end_ }) - end - start = n - end - end_ = n_last - end - end - end - if start >= 0 then - ut_fp:write(make_range(start, end_)) - table.insert(ret, { start, end_ }) - end - ut_fp:write('};\n') - return ret -end - -local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth) - local emojiwidth = {} - local emoji = {} - for _, p in ipairs(emojiprops) do - if p[2]:match('Emoji%s+#') then - local rng_start, rng_end = p[1]:find('%.%.') - local n - local n_last - if rng_start then - n = tonumber(p[1]:sub(1, rng_start - 1), 16) - n_last = tonumber(p[1]:sub(rng_end + 1), 16) - else - n = tonumber(p[1], 16) - n_last = n - end - if #emoji > 0 and n - 1 == emoji[#emoji][2] then - emoji[#emoji][2] = n_last - else - table.insert(emoji, { n, n_last }) - end - - -- Characters below 1F000 may be considered single width traditionally, - -- making them double width causes problems. 
- if n >= 0x1f000 then - -- exclude characters that are in the ambiguous/doublewidth table - for _, ambi in ipairs(ambiwidth) do - if n >= ambi[1] and n <= ambi[2] then - n = ambi[2] + 1 - end - if n_last >= ambi[1] and n_last <= ambi[2] then - n_last = ambi[1] - 1 - end - end - for _, double in ipairs(doublewidth) do - if n >= double[1] and n <= double[2] then - n = double[2] + 1 - end - if n_last >= double[1] and n_last <= double[2] then - n_last = double[1] - 1 - end - end - - if n <= n_last then - if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then - emojiwidth[#emojiwidth][2] = n_last - else - table.insert(emojiwidth, { n, n_last }) - end - end - end - end - end - - ut_fp:write('static const struct interval emoji_all[] = {\n') - for _, p in ipairs(emoji) do - ut_fp:write(make_range(p[1], p[2])) - end - ut_fp:write('};\n') - - ut_fp:write('static const struct interval emoji_wide[] = {\n') - for _, p in ipairs(emojiwidth) do - ut_fp:write(make_range(p[1], p[2])) - end - ut_fp:write('};\n') -end - -local ud_fp = io.open(unicodedata_fname, 'r') -local dataprops = parse_data_to_props(ud_fp) -ud_fp:close() - -local ut_fp = io.open(utf_tables_fname, 'w') - -build_combining_table(ut_fp, dataprops) - -local eaw_fp = io.open(eastasianwidth_fname, 'r') -local widthprops = parse_width_props(eaw_fp) -eaw_fp:close() - -local doublewidth = - build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth') -local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous') - -local emoji_fp = io.open(emoji_fname, 'r') -local emojiprops = parse_emoji_props(emoji_fp) -emoji_fp:close() - -build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth) - -ut_fp:close() diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 666a904fc5..db4730408b 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -85,7 +85,6 @@ struct interval { // uncrustify:off #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.c.generated.h" -# 
include "unicode_tables.generated.h" #endif // uncrustify:on @@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab) return utf_class_tab(utf_ptr2char(p), chartab); } -// Return true if "c" is in "table". -static bool intable(const struct interval *table, size_t n_items, int c) - FUNC_ATTR_PURE +static bool prop_is_emojilike(const utf8proc_property_t *prop) { - assert(n_items > 0); - // first quick check for Latin1 etc. characters - if (c < table[0].first) { - return false; - } - - assert(n_items <= SIZE_MAX / 2); - // binary search in table - size_t bot = 0; - size_t top = n_items; - do { - size_t mid = (bot + top) >> 1; - if (table[mid].last < c) { - bot = mid + 1; - } else if (table[mid].first > c) { - top = mid; - } else { - return true; - } - } while (top > bot); - return false; + return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR; } /// For UTF-8 character "c" return 2 for a double-width character, 1 for others. @@ -496,13 +474,18 @@ int utf_char2cells(int c) return n; } - if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) { + const utf8proc_property_t *prop = utf8proc_get_property(c); + + if (prop->charwidth == 2) { return 2; } - if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) { + if (*p_ambw == 'd' && prop->ambiguous_width) { return 2; } - if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) { + + // Characters below 1F000 may be considered single width traditionally, + // making them double width causes problems. 
+ if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) { return 2; } @@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in) } int cells = utf_char2cells(c); if (cells == 1 && p_emoji - && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + && prop_is_emojilike(utf8proc_get_property(c))) { int c2 = utf_ptr2char(p_in + len); if (c2 == 0xFE0F) { return 2; // emoji presentation @@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size) } int cells = utf_char2cells(c); if (cells == 1 && p_emoji && size > len - && intable(emoji_all, ARRAY_SIZE(emoji_all), c) + && prop_is_emojilike(utf8proc_get_property(c)) && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { int c2 = utf_ptr2char(p + len); if (c2 == 0xFE0F) { @@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf) /// Returns false for negative values. bool utf_iscomposing_legacy(int c) { - return intable(combining, ARRAY_SIZE(combining), c); + const utf8proc_property_t *prop = utf8proc_get_property(c); + return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME; } #ifdef __SSE2__ @@ -1182,6 +1166,33 @@ bool utf_printable(int c) #else +// Return true if "c" is in "table". +static bool intable(const struct interval *table, size_t n_items, int c) + FUNC_ATTR_PURE +{ + assert(n_items > 0); + // first quick check for Latin1 etc. characters + if (c < table[0].first) { + return false; + } + + assert(n_items <= SIZE_MAX / 2); + // binary search in table + size_t bot = 0; + size_t top = n_items; + do { + size_t mid = (bot + top) >> 1; + if (table[mid].last < c) { + bot = mid + 1; + } else if (table[mid].first > c) { + top = mid; + } else { + return true; + } + } while (top > bot); + return false; +} + // Return true for characters that can be displayed in a normal way. // Only for characters of 0x100 and above! 
bool utf_printable(int c) @@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 1; // punctuation } + const utf8proc_property_t *prop = utf8proc_get_property(c); // emoji - if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + if (prop_is_emojilike(prop)) { return 3; } @@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab) bool utf_ambiguous_width(const char *p) { int c = utf_ptr2char(p); - return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) - || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); + if (c < 0x80) { + return false; + } + + const utf8proc_property_t *prop = utf8proc_get_property(c); + return prop->ambiguous_width || prop_is_emojilike(prop); } // Return the folded-case equivalent of "a", which is a UCS-4 character. Uses |