diff options
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r-- | test/unit/mbyte_spec.lua | 138 |
1 files changed, 112 insertions, 26 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index 8fcc67d20b..bdc111de2c 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -3,8 +3,14 @@ local itp = t.gen_itp(it) local ffi = t.ffi local eq = t.eq +local to_cstr = t.to_cstr -local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') +local lib = t.cimport( + './src/nvim/mbyte.h', + './src/nvim/charset.h', + './src/nvim/grid.h', + './src/nvim/option_vars.h' +) describe('mbyte', function() -- Convert from bytes to string @@ -45,12 +51,21 @@ describe('mbyte', function() end) end - describe('utfc_ptr2schar_len', function() + describe('utfc_ptr2schar', function() local function test_seq(seq) local firstc = ffi.new('int[1]') local buf = ffi.new('char[32]') - lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) - return { ffi.string(buf), firstc[0] } + lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc)) + local str = ffi.string(buf) + if 1 > 2 then -- for debugging + local tabel = {} + for i = 1, #str do + table.insert(tabel, string.format('0x%02x', string.byte(str, i))) + end + print('{ ' .. table.concat(tabel, ', ') .. ' }') + io.stdout:flush() + end + return { str, firstc[0] } end local function byte(val) @@ -88,7 +103,9 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 }) -- Combining character is U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 }) + -- invalid start byte for combining + eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc }) @@ -102,18 +119,21 @@ describe('mbyte', function() itp('4-byte sequences', function() -- No following combining character eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 }) + eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 }) -- No second UTF-8 character eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 }) -- No following UTF-8 character eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 }) + -- U+0080 : not a valid start char + eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 }) @@ -126,36 +146,36 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 }) -- Combining characters U+0300 and U+0301 - eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 }) + eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 }) -- Combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f }, + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 }, test_seq { - 0x7f, + 0x29, 0xcc, 0x80, 0xcc, @@ -175,18 +195,18 @@ describe('mbyte', function() -- Only three following combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } ) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 }) -- No following UTF-8 character - eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 }) + eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f }) @@ -205,8 +225,6 @@ describe('mbyte', function() end) describe('utf_cp_bounds_len', function() - local to_cstr = t.to_cstr - local tests = { { name = 'for valid string', @@ -273,4 +291,72 @@ describe('mbyte', function() eq(expected_offsets, { b = b_offsets, e = e_offsets }) end) end) + + itp('utf_head_off', function() + local function check(str, expected_glyphs) + local len = #str + local cstr = to_cstr(str) + local breaks = { 0 } -- SOT + local pos = 0 + local mb_glyphs = {} + while pos < len do + local clen = lib.utfc_ptr2len(cstr + pos) + if clen == 0 then + eq(0, string.byte(str, pos + 1)) -- only NUL bytes can has length zery + clen = 1 -- but skip it, otherwise we get stuck + end + if clen > 1 then + table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen)) + end + pos = pos + clen + table.insert(breaks, pos) + end + eq(breaks[#breaks], len) -- include EOT as break + -- we could also send in breaks, but this is more human readable + eq(mb_glyphs, expected_glyphs) + + for i = 1, #breaks - 1 do + local start, next = breaks[i], breaks[i + 1] + + for p = start, next - 1 do + eq(p - start, lib.utf_head_off(cstr, cstr + p)) + end + end + eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe + end + -- stylua doesn't like ZWJ chars.. + -- stylua: ignore start + check('hej och hΓ₯ π§βπΎ!', { 'Γ₯', 'π§βπΎ' }) + + -- emoji (various kinds of combinations, use g8 to see them) + check("π³οΈββ§οΈπ§βπΎβ€οΈππ΄ββ οΈ", {"π³οΈββ§οΈ", "π§βπΎ", "β€οΈ", "π", "π΄ββ οΈ"}) + check('π³οΈββ§οΈxyπ§βπΎ\rβ€οΈπΓ₯π΄ββ οΈΒ', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'Γ₯', 'π΄ββ οΈ', 'Β' }) + check('π³οΈββ§οΈ\000π§βπΎ\000β€οΈ\000π\000Γ₯\000π΄ββ οΈ\000Β', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'Γ₯', 'π΄ββ οΈ', 'Β' }) + check('\195π³οΈββ§οΈ\198π§βπΎ\165β€οΈ\168\195π\255π΄ββ οΈ\129Β\165', { 'π³οΈββ§οΈ', 'π§βπΎ', 'β€οΈ', 'π', 'π΄ββ οΈ', 'Β' }) + + check('π¦π
±οΈ π¦π½ π¦π¨π¦ π²π½πΉπ±',{'π¦', 'π
±οΈ', 'π¦π½', 'π¦π¨', 'π¦', 'π²π½', 'πΉπ±'}) + check('π΄σ §σ ’σ ³σ £σ ΄σ Ώπ΄σ §σ ’σ ·σ ¬σ ³σ Ώ', {'π΄σ §σ ’σ ³σ £σ ΄σ Ώ', 'π΄σ §σ ’σ ·σ ¬σ ³σ Ώ'}) + + check('Γ₯\165ΓΌ\195aΓ«q\168Ξ²\000\169ζ¬\255', {'Γ₯', 'ΓΌ', 'Γ«', 'Ξ²', 'ζ¬'}) + + lib.p_arshape = true -- default + check('Ψ³ΩΨ§Ω
', { 'Ψ³', 'ΩΨ§', 'Ω
' }) + lib.p_arshape = false + check('Ψ³ΩΨ§Ω
', { 'Ψ³', 'Ω', 'Ψ§', 'Ω
' }) + + check('LΜΜΜΜΜΜoΜΜΜΜΜΜrΜΜΜΜΜΜeΜΜΜ
ΜΜΜmΜ
ΜΜΜΜΜ', {'LΜΜΜΜΜΜ', 'oΜΜΜΜΜΜ', 'rΜΜΜΜΜΜ', 'eΜΜΜ
ΜΜΜ', 'mΜ
ΜΜΜΜΜ'}) + -- stylua: ignore end + end) + + describe('utf_fold', function() + itp('does not crash with surrogates #30527', function() + eq(0xddfb, lib.utf_fold(0xddfb)) -- low surrogate, invalid as a character + eq(0xd800, lib.utf_fold(0xd800)) -- high surrogate, invalid as a character + end) + + itp("doesn't crash on invalid codepoints", function() + eq(9000000, lib.utf_fold(9000000)) + eq(0, lib.utf_fold(0)) + end) + end) end) |