aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/generators
diff options
context:
space:
mode:
author: bfredl <bjorn.linse@gmail.com> 2024-08-14 10:10:54 +0200
committer: bfredl <bjorn.linse@gmail.com> 2024-08-31 18:09:38 +0200
commit: 26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (patch)
tree: 117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim/generators
parent: a6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff)
download: rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.gz
rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.bz2
rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.zip
refactor(multibyte): replace generated unicode tables with utf8proc
This commit intentionally aims at preserving existing behavior as much as possible while replacing our build step to convert unicode data files into binary tables, with corresponding lookups in utf8proc. Actual improvements in behavior will be a followup. The only change in behavior is that 'emoji' option will turn some more codepoints into double width. Nvim used the "Emoji" and "Emoji_Presentation" properties to define emojis, while utf8proc only exposes the Extended_Pictographic property from the emoji table. This is a superset of the previous emoji properties. As only codepoints above 0x1f000 are affected by the 'emoji' option, this means that the following chars are now treated as double-width, instead of single-width like in previous nvim versions: 🀀 🀁 🀂 🀃 🀅 🀆 🀇 🀈 🀉 🀊 🀋 🀌 🀍 🀎 🀏 🀐 🀑 🀒 🀓 🀔 🀕 🀖 🀗 🀘 🀙 🀚 🀛 🀜 🀝 🀞 🀟 🀠 🀡 🀢 🀣 🀤 🀥 🀦 🀧 🀨 🀩 🀪 🀫 🀰 🀱 🀲 🀳 🀴 🀵 🀶 🀷 🀸 🀹 🀺 🀻 🀼 🀽 🀾 🀿 🁀 🁁 🁂 🁃 🁄 🁅 🁆 🁇 🁈 🁉 🁊 🁋 🁌 🁍 🁎 🁏 🁐 🁑 🁒 🁓 🁔 🁕 🁖 🁗 🁘 🁙 🁚 🁛 🁜 🁝 🁞 🁟 🁠 🁡 🁢 🁣 🁤 🁥 🁦 🁧 🁨 🁩 🁪 🁫 🁬 🁭 🁮 🁯 🁰 🁱 🁲 🁳 🁴 🁵 🁶 🁷 🁸 🁹 🁺 🁻 🁼 🁽 🁾 🁿 🂀 🂁 🂂 🂃 🂄 🂅 🂆 🂇 🂈 🂉 🂊 🂋 🂌 🂍 🂎 🂏 🂐 🂑 🂒 🂓 🂠 🂡 🂢 🂣 🂤 🂥 🂦 🂧 🂨 🂩 🂪 🂫 🂬 🂭 🂮 🂱 🂲 🂳 🂴 🂵 🂶 🂷 🂸 🂹 🂺 🂻 🂼 🂽 🂾 🂿 🃁 🃂 🃃 🃄 🃅 🃆 🃇 🃈 🃉 🃊 🃋 🃌 🃍 🃎 🃑 🃒 🃓 🃔 🃕 🃖 🃗 🃘 🃙 🃚 🃛 🃜 🃝 🃞 🃟 🃠 🃡 🃢 🃣 🃤 🃥 🃦 🃧 🃨 🃩 🃪 🃫 🃬 🃭 🃮 🃯 🃰 🃱 🃲 🃳 🃴 🃵 🄍 🄎 🄏 🄯 🅬 🅭 🅮 🅯 🆭 🌢 🌣 🎔 🎕 🎘 🎜 🎝 🏱 🏲 🏶 📾 🕆 🕇 🕈
🕏 🕨 🕩 🕪 🕫 🕬 🕭 🕮 🕱 🕲 🕻 🕼 🕽 🕾 🕿 🖀 🖁 🖂 🖃 🖄 🖅 🖆 🖈 🖉 🖎 🖏 🖑 🖒 🖓 🖔 🖗 🖘 🖙 🖚 🖛 🖜 🖝 🖞 🖟 🖠 🖡 🖢 🖣 🖦 🖧 🖩 🖪 🖫 🖬 🖭 🖮 🖯 🖰 🖳 🖴 🖵 🖶 🖷 🖸 🖹 🖺 🖻 🖽 🖾 🖿 🗀 🗁 🗅 🗆 🗇 🗈 🗉 🗊 🗋 🗌 🗍 🗎 🗏 🗐 🗔 🗕 🗖 🗗 🗘 🗙 🗚 🗛 🗟 🗠 🗢 🗤 🗥 🗦 🗧 🗩 🗪 🗫 🗬 🗭 🗮 🗰 🗱 🗲 🗴 🗵 🗶 🗷 🗸 🗹 🛆 🛇 🛈 🛉 🛊 🛓 🛔 🛦 🛧 🛨 🛪 🛱 🛲 🝴 🝵 🝶 🝻 🝼 🝽 🝾 🝿 🟕 🟖 🟗 🟘 🟙 🢰 🢱 🨀 🨁 🨂 🨃 🨄 🨅 🨆 🨇 🨈 🨉 🨊 🨋 🨌 🨍 🨎 🨏 🨐 🨑 🨒 🨓 🨔 🨕 🨖 🨗 🨘 🨙 🨚 🨛 🨜 🨝 🨞 🨟 🨠 🨡 🨢 🨣 🨤 🨥 🨦 🨧 🨨 🨩 🨪 🨫 🨬 🨭 🨮 🨯 🨰 🨱 🨲 🨳 🨴 🨵 🨶 🨷 🨸 🨹 🨺 🨻 🨼 🨽 🨾 🨿 🩀 🩁 🩂 🩃 🩄 🩅 🩆 🩇 🩈 🩉 🩊 🩋 🩌 🩍 🩎 🩏 🩐 🩑 🩒 🩓 🩠 🩡 🩢 🩣 🩤 🩥 🩦 🩧 🩨 🩩 🩪 🩫 🩬 🩭
Diffstat (limited to 'src/nvim/generators')
-rw-r--r-- src/nvim/generators/gen_unicode_tables.lua 264
1 files changed, 0 insertions, 264 deletions
diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua
deleted file mode 100644
index 01eb34be88..0000000000
--- a/src/nvim/generators/gen_unicode_tables.lua
+++ /dev/null
@@ -1,264 +0,0 @@
--- Script creates the following tables in unicode_tables.generated.h:
---
--- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed
--- intervals. Codepoints in these intervals have double (W or F) or ambiguous
--- (A) east asian width respectively.
--- 2. combining table: same as the above, but characters inside are combining
--- characters (i.e. have general categories equal to Mn, Mc or Me).
--- 3. foldCase table used to convert characters to
--- folded variants. In this table first two values are
--- character ranges: like in previous tables they are sorted and must be
--- non-overlapping. Third value means step inside the range: e.g. if it is
--- 2 then interval applies only to first, third, fifth, … character in range.
--- Fourth value is number that should be added to the codepoint to yield
--- folded codepoint.
--- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed
--- intervals of Emoji characters. emoji_wide contains all the characters
--- which don't have ambiguous or double width, and emoji_all has all Emojis.
-if arg[1] == '--help' then
- print('Usage:')
- print(' gen_unicode_tables.lua unicode/ unicode_tables.generated.h')
- os.exit(0)
-end
-
-local basedir = arg[1]
-local pathsep = package.config:sub(1, 1)
-local get_path = function(fname)
- return basedir .. pathsep .. fname
-end
-
-local unicodedata_fname = get_path('UnicodeData.txt')
-local eastasianwidth_fname = get_path('EastAsianWidth.txt')
-local emoji_fname = get_path('emoji-data.txt')
-
-local utf_tables_fname = arg[2]
-
-local split_on_semicolons = function(s)
- local ret = {}
- local idx = 1
- while idx <= #s + 1 do
- local item = s:match('^[^;]*', idx)
- idx = idx + #item + 1
- if idx <= #s + 1 then
- assert(s:sub(idx - 1, idx - 1) == ';')
- end
- item = item:gsub('^%s*', '')
- item = item:gsub('%s*$', '')
- table.insert(ret, item)
- end
- return ret
-end
-
-local fp_lines_to_lists = function(fp, n, has_comments)
- local ret = {}
- local line
- local i = 0
- while true do
- i = i + 1
- line = fp:read('*l')
- if not line then
- break
- end
- if not has_comments or (line:sub(1, 1) ~= '#' and not line:match('^%s*$')) then
- local l = split_on_semicolons(line)
- if #l ~= n then
- io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#l, i, n))
- io.stderr:write('Line: ' .. line .. '\n')
- return nil
- end
- table.insert(ret, l)
- end
- end
- return ret
-end
-
-local parse_data_to_props = function(ud_fp)
- return fp_lines_to_lists(ud_fp, 15, false)
-end
-
-local parse_width_props = function(eaw_fp)
- return fp_lines_to_lists(eaw_fp, 2, true)
-end
-
-local parse_emoji_props = function(emoji_fp)
- return fp_lines_to_lists(emoji_fp, 2, true)
-end
-
-local make_range = function(start, end_, step, add)
- if step and add then
- return (' {0x%x, 0x%x, %d, %d},\n'):format(start, end_, step == 0 and -1 or step, add)
- else
- return (' {0x%04x, 0x%04x},\n'):format(start, end_)
- end
-end
-
-local build_combining_table = function(ut_fp, dataprops)
- ut_fp:write('static const struct interval combining[] = {\n')
- local start = -1
- local end_ = -1
- for _, p in ipairs(dataprops) do
- -- The 'Mc' property was removed, it does take up space.
- if ({ Mn = true, Me = true })[p[3]] then
- local n = tonumber(p[1], 16)
- if start >= 0 and end_ + 1 == n then
- -- Continue with the same range.
- end_ = n
- else
- if start >= 0 then
- -- Produce previous range.
- ut_fp:write(make_range(start, end_))
- end
- start = n
- end_ = n
- end
- end
- end
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- end
- ut_fp:write('};\n')
-end
-
-local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name)
- ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
- local start = -1
- local end_ = -1
- local dataidx = 1
- local ret = {}
- for _, p in ipairs(widthprops) do
- if widths[p[2]:sub(1, 1)] then
- local rng_start, rng_end = p[1]:find('%.%.')
- local n, n_last
- if rng_start then
- -- It is a range. We don’t check for composing char then.
- n = tonumber(p[1]:sub(1, rng_start - 1), 16)
- n_last = tonumber(p[1]:sub(rng_end + 1), 16)
- else
- n = tonumber(p[1], 16)
- n_last = n
- end
- local dn
- while true do
- dn = tonumber(dataprops[dataidx][1], 16)
- if dn >= n then
- break
- end
- dataidx = dataidx + 1
- end
- if dn ~= n and n_last == n then
- io.stderr:write('Cannot find character ' .. n .. ' in data table.\n')
- end
- -- Only use the char when it’s not a composing char.
- -- But use all chars from a range.
- local dp = dataprops[dataidx]
- if (n_last > n) or not ({ Mn = true, Mc = true, Me = true })[dp[3]] then
- if start >= 0 and end_ + 1 == n then -- luacheck: ignore 542
- -- Continue with the same range.
- else
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- table.insert(ret, { start, end_ })
- end
- start = n
- end
- end_ = n_last
- end
- end
- end
- if start >= 0 then
- ut_fp:write(make_range(start, end_))
- table.insert(ret, { start, end_ })
- end
- ut_fp:write('};\n')
- return ret
-end
-
-local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth)
- local emojiwidth = {}
- local emoji = {}
- for _, p in ipairs(emojiprops) do
- if p[2]:match('Emoji%s+#') then
- local rng_start, rng_end = p[1]:find('%.%.')
- local n
- local n_last
- if rng_start then
- n = tonumber(p[1]:sub(1, rng_start - 1), 16)
- n_last = tonumber(p[1]:sub(rng_end + 1), 16)
- else
- n = tonumber(p[1], 16)
- n_last = n
- end
- if #emoji > 0 and n - 1 == emoji[#emoji][2] then
- emoji[#emoji][2] = n_last
- else
- table.insert(emoji, { n, n_last })
- end
-
- -- Characters below 1F000 may be considered single width traditionally,
- -- making them double width causes problems.
- if n >= 0x1f000 then
- -- exclude characters that are in the ambiguous/doublewidth table
- for _, ambi in ipairs(ambiwidth) do
- if n >= ambi[1] and n <= ambi[2] then
- n = ambi[2] + 1
- end
- if n_last >= ambi[1] and n_last <= ambi[2] then
- n_last = ambi[1] - 1
- end
- end
- for _, double in ipairs(doublewidth) do
- if n >= double[1] and n <= double[2] then
- n = double[2] + 1
- end
- if n_last >= double[1] and n_last <= double[2] then
- n_last = double[1] - 1
- end
- end
-
- if n <= n_last then
- if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then
- emojiwidth[#emojiwidth][2] = n_last
- else
- table.insert(emojiwidth, { n, n_last })
- end
- end
- end
- end
- end
-
- ut_fp:write('static const struct interval emoji_all[] = {\n')
- for _, p in ipairs(emoji) do
- ut_fp:write(make_range(p[1], p[2]))
- end
- ut_fp:write('};\n')
-
- ut_fp:write('static const struct interval emoji_wide[] = {\n')
- for _, p in ipairs(emojiwidth) do
- ut_fp:write(make_range(p[1], p[2]))
- end
- ut_fp:write('};\n')
-end
-
-local ud_fp = io.open(unicodedata_fname, 'r')
-local dataprops = parse_data_to_props(ud_fp)
-ud_fp:close()
-
-local ut_fp = io.open(utf_tables_fname, 'w')
-
-build_combining_table(ut_fp, dataprops)
-
-local eaw_fp = io.open(eastasianwidth_fname, 'r')
-local widthprops = parse_width_props(eaw_fp)
-eaw_fp:close()
-
-local doublewidth =
- build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth')
-local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous')
-
-local emoji_fp = io.open(emoji_fname, 'r')
-local emojiprops = parse_emoji_props(emoji_fp)
-emoji_fp:close()
-
-build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth)
-
-ut_fp:close()