diff options
author | bfredl <bjorn.linse@gmail.com> | 2024-08-08 10:42:08 +0200 |
---|---|---|
committer | bfredl <bjorn.linse@gmail.com> | 2024-08-30 11:49:09 +0200 |
commit | cfdf68a7acde16597fbd896674af68c42361102c (patch) | |
tree | 6113193fda7a7c0f94577a464e39964e74311583 /test/unit/mbyte_spec.lua | |
parent | 4353996d0fa8e5872a334d68196d8088391960cf (diff) | |
download | rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2 rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip |
feat(mbyte): support extended grapheme clusters including more emoji
Use the grapheme break algorithm from utf8proc to support grapheme
clusters from recent unicode versions.
Handle variant selector VS16 turning some codepoints into double-width
emoji. This means we need to use ptr2cells rather than char2cells when
possible.
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r-- | test/unit/mbyte_spec.lua | 119 |
1 files changed, 93 insertions, 26 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index 8fcc67d20b..787a8862ae 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -3,8 +3,15 @@ local itp = t.gen_itp(it) local ffi = t.ffi local eq = t.eq +local to_cstr = t.to_cstr +local ok = t.ok -local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') +local lib = t.cimport( + './src/nvim/mbyte.h', + './src/nvim/charset.h', + './src/nvim/grid.h', + './src/nvim/option_vars.h' +) describe('mbyte', function() -- Convert from bytes to string @@ -45,12 +52,21 @@ describe('mbyte', function() end) end - describe('utfc_ptr2schar_len', function() + describe('utfc_ptr2schar', function() local function test_seq(seq) local firstc = ffi.new('int[1]') local buf = ffi.new('char[32]') - lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) - return { ffi.string(buf), firstc[0] } + lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc)) + local str = ffi.string(buf) + if 1 > 2 then -- for debugging + local tabel = {} + for i = 1, #str do + table.insert(tabel, string.format('0x%02x', string.byte(str, i))) + end + print('{ ' .. table.concat(tabel, ', ') .. ' }') + io.stdout:flush() + end + return { str, firstc[0] } end local function byte(val) @@ -88,7 +104,9 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 }) -- Combining character is U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 }) + -- invalid start byte for combining + eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc }) @@ -102,18 +120,21 @@ describe('mbyte', function() itp('4-byte sequences', function() -- No following combining character eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 }) + eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 }) -- No second UTF-8 character eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 }) -- No following UTF-8 character eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 }) + -- U+0080 : not a valid start char + eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 }) @@ -126,36 +147,36 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 }) -- Combining characters U+0300 and U+0301 - eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 }) + eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 }) -- Combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f }, + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 }, test_seq { - 0x7f, + 0x29, 0xcc, 0x80, 0xcc, @@ -175,18 +196,18 @@ describe('mbyte', function() -- Only three following combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } ) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 }) -- No following UTF-8 character - eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 }) + eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f }) @@ -205,8 +226,6 @@ describe('mbyte', function() end) describe('utf_cp_bounds_len', function() - local to_cstr = t.to_cstr - local tests = { { name = 'for valid string', @@ -273,4 +292,52 @@ describe('mbyte', function() eq(expected_offsets, { b = b_offsets, e = e_offsets }) end) end) + + itp('utf_head_off', function() + local function check(str, expected_glyphs) + local len = #str + local cstr = to_cstr(str) + local breaks = { 0 } -- SOT + local pos = 0 + local mb_glyphs = {} + while pos < len do + local clen = lib.utfc_ptr2len(cstr + pos) + ok(clen > 0) -- otherwise we get stuck + if clen > 1 then + table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen)) + end + pos = pos + clen + table.insert(breaks, pos) + end + eq(breaks[#breaks], len) -- include EOT as break + -- we could also send in breaks, but this is more human readable + eq(mb_glyphs, expected_glyphs) + + for i = 1, #breaks - 1 do + local start, next = breaks[i], breaks[i + 1] + + for p = start, next - 1 do + eq(p - start, lib.utf_head_off(cstr, cstr + p)) + end + end + eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe + end + -- stylua doesn't like ZWJ chars.. + -- stylua: ignore start + check('hej och hΓ₯ π§βπΎ!', { 'Γ₯', 'π§βπΎ' }) + -- emoji only (various kinds of combinations, use g8 to see them) + check("π³οΈββ§οΈπ§βπΎβ€οΈππ΄ββ οΈ", {"π³οΈββ§οΈ", "π§βπΎ", "β€οΈ", "π", "π΄ββ οΈ"}) + check('π³οΈββ§οΈxyπ§βπΎ\rβ€οΈπΓ₯π΄ββ οΈΒ', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'Γ₯', 'π΄ββ οΈ', 'Β' }) + + check('π¦π
±οΈ π¦π½ π¦π¨π¦ π²π½πΉπ±',{'π¦', 'π
±οΈ', 'π¦π½', 'π¦π¨', 'π¦', 'π²π½', 'πΉπ±'}) + check('π΄σ §σ ’σ ³σ £σ ΄σ Ώπ΄σ §σ ’σ ·σ ¬σ ³σ Ώ', {'π΄σ §σ ’σ ³σ £σ ΄σ Ώ', 'π΄σ §σ ’σ ·σ ¬σ ³σ Ώ'}) + + lib.p_arshape = true -- default + check('Ψ³ΩΨ§Ω
', { 'Ψ³', 'ΩΨ§', 'Ω
' }) + lib.p_arshape = false + check('Ψ³ΩΨ§Ω
', { 'Ψ³', 'Ω', 'Ψ§', 'Ω
' }) + + check('LΜΜΜΜΜΜoΜΜΜΜΜΜrΜΜΜΜΜΜeΜΜΜ
ΜΜΜmΜ
ΜΜΜΜΜ', {'LΜΜΜΜΜΜ', 'oΜΜΜΜΜΜ', 'rΜΜΜΜΜΜ', 'eΜΜΜ
ΜΜΜ', 'mΜ
ΜΜΜΜΜ'}) + -- stylua: ignore end + end) end) |