aboutsummaryrefslogtreecommitdiff
path: root/test/unit/mbyte_spec.lua
diff options
context:
space:
mode:
author: bfredl <bjorn.linse@gmail.com> 2024-08-08 10:42:08 +0200
committer: bfredl <bjorn.linse@gmail.com> 2024-08-30 11:49:09 +0200
commit: cfdf68a7acde16597fbd896674af68c42361102c (patch)
tree: 6113193fda7a7c0f94577a464e39964e74311583 /test/unit/mbyte_spec.lua
parent: 4353996d0fa8e5872a334d68196d8088391960cf (diff)
download: rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip
feat(mbyte): support extended grapheme clusters including more emoji
Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent Unicode versions. Handle the variation selector VS16, which turns some codepoints into double-width emoji. This means we need to use ptr2cells rather than char2cells when possible.
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r-- test/unit/mbyte_spec.lua 119
1 files changed, 93 insertions, 26 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index 8fcc67d20b..787a8862ae 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -3,8 +3,15 @@ local itp = t.gen_itp(it)
local ffi = t.ffi
local eq = t.eq
+local to_cstr = t.to_cstr
+local ok = t.ok
-local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
+local lib = t.cimport(
+ './src/nvim/mbyte.h',
+ './src/nvim/charset.h',
+ './src/nvim/grid.h',
+ './src/nvim/option_vars.h'
+)
describe('mbyte', function()
-- Convert from bytes to string
@@ -45,12 +52,21 @@ describe('mbyte', function()
end)
end
- describe('utfc_ptr2schar_len', function()
+ describe('utfc_ptr2schar', function()
local function test_seq(seq)
local firstc = ffi.new('int[1]')
local buf = ffi.new('char[32]')
- lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
- return { ffi.string(buf), firstc[0] }
+ lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
+ local str = ffi.string(buf)
+ if 1 > 2 then -- for debugging
+ local tabel = {}
+ for i = 1, #str do
+ table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
+ end
+ print('{ ' .. table.concat(tabel, ', ') .. ' }')
+ io.stdout:flush()
+ end
+ return { str, firstc[0] }
end
local function byte(val)
@@ -88,7 +104,9 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
+ -- invalid start byte for combining
+ eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +120,21 @@ describe('mbyte', function()
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
+ eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
-- No second UTF-8 character
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
+ -- U+0080 : not a valid start char
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +147,36 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
- eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
+ eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
test_seq {
- 0x7f,
+ 0x29,
0xcc,
0x80,
0xcc,
@@ -175,18 +196,18 @@ describe('mbyte', function()
-- Only three following combining characters U+0300, U+0301, U+0302
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
)
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
- eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
+ eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +226,6 @@ describe('mbyte', function()
end)
describe('utf_cp_bounds_len', function()
- local to_cstr = t.to_cstr
-
local tests = {
{
name = 'for valid string',
@@ -273,4 +292,52 @@ describe('mbyte', function()
eq(expected_offsets, { b = b_offsets, e = e_offsets })
end)
end)
+
+ itp('utf_head_off', function()
+ local function check(str, expected_glyphs)
+ local len = #str
+ local cstr = to_cstr(str)
+ local breaks = { 0 } -- SOT
+ local pos = 0
+ local mb_glyphs = {}
+ while pos < len do
+ local clen = lib.utfc_ptr2len(cstr + pos)
+ ok(clen > 0) -- otherwise we get stuck
+ if clen > 1 then
+ table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
+ end
+ pos = pos + clen
+ table.insert(breaks, pos)
+ end
+ eq(breaks[#breaks], len) -- include EOT as break
+ -- we could also send in breaks, but this is more human readable
+ eq(mb_glyphs, expected_glyphs)
+
+ for i = 1, #breaks - 1 do
+ local start, next = breaks[i], breaks[i + 1]
+
+ for p = start, next - 1 do
+ eq(p - start, lib.utf_head_off(cstr, cstr + p))
+ end
+ end
+ eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
+ end
+ -- stylua doesn't like ZWJ chars..
+ -- stylua: ignore start
+ check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
+ -- emoji only (various kinds of combinations, use g8 to see them)
+ check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
+ check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️¤', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '¤' })
+
+ check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
+ check('🏴󠁧󠁒󠁳󠁣󠁴󠁿🏴󠁧󠁒󠁷󠁬󠁳󠁿', {'🏴󠁧󠁒󠁳󠁣󠁴󠁿', '🏴󠁧󠁒󠁷󠁬󠁳󠁿'})
+
+ lib.p_arshape = true -- default
+ check('سلام', { 'س', 'لا', 'م' })
+ lib.p_arshape = false
+ check('سلام', { 'س', 'ل', 'ا', 'م' })
+
+ check('LΜ“Μ‰Μ‘Μ’ΜŒΜšoΜŒΜ’Μ—Μ„Μ›Μ€rΜΜˆΜ•ΜˆΜŽΜè̇̅̄̄̐mΜ…Μ–ΜŸΜ„ΜŸΜš', {'LΜ“Μ‰Μ‘Μ’ΜŒΜš', 'oΜŒΜ’Μ—Μ„Μ›Μ€', 'rΜΜˆΜ•ΜˆΜŽΜ', 'è̇̅̄̄̐', 'mΜ…Μ–ΜŸΜ„ΜŸΜš'})
+ -- stylua: ignore end
+ end)
end)