aboutsummaryrefslogtreecommitdiff
path: root/test/unit/mbyte_spec.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r--test/unit/mbyte_spec.lua138
1 files changed, 112 insertions, 26 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index 8fcc67d20b..bdc111de2c 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -3,8 +3,14 @@ local itp = t.gen_itp(it)
local ffi = t.ffi
local eq = t.eq
+local to_cstr = t.to_cstr
-local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
+local lib = t.cimport(
+ './src/nvim/mbyte.h',
+ './src/nvim/charset.h',
+ './src/nvim/grid.h',
+ './src/nvim/option_vars.h'
+)
describe('mbyte', function()
-- Convert from bytes to string
@@ -45,12 +51,21 @@ describe('mbyte', function()
end)
end
- describe('utfc_ptr2schar_len', function()
+ describe('utfc_ptr2schar', function()
local function test_seq(seq) -- decodes the byte list `seq`; returns { schar_get output as a Lua string, firstc[0] }
local firstc = ffi.new('int[1]') -- one-element int array used as an out-parameter for the utfc_ptr2schar* call
local buf = ffi.new('char[32]') -- destination buffer filled by lib.schar_get
- lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
- return { ffi.string(buf), firstc[0] }
+ lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
+ local str = ffi.string(buf)
+ if 1 > 2 then -- never true: debugging aid that stays disabled unless this condition is edited by hand
+ local tabel = {}
+ for i = 1, #str do
+ table.insert(tabel, string.format('0x%02x', string.byte(str, i))) -- one '0xNN' entry per byte of str
+ end
+ print('{ ' .. table.concat(tabel, ', ') .. ' }') -- prints the bytes formatted like a Lua table literal
+ io.stdout:flush()
+ end
+ return { str, firstc[0] }
end
local function byte(val)
@@ -88,7 +103,9 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
+ -- invalid start byte for combining
+ eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@@ -102,18 +119,21 @@ describe('mbyte', function()
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
+ eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
-- No second UTF-8 character
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
+ -- U+0080 : not a valid start char
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@@ -126,36 +146,36 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
- eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
+ eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
- eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
+ eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+ { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
test_seq {
- 0x7f,
+ 0x29,
0xcc,
0x80,
0xcc,
@@ -175,18 +195,18 @@ describe('mbyte', function()
-- Only three following combining characters U+0300, U+0301, U+0302
eq(
- { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
- test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+ { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+ test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
)
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
- eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
+ eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
- eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
+ eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@@ -205,8 +225,6 @@ describe('mbyte', function()
end)
describe('utf_cp_bounds_len', function()
- local to_cstr = t.to_cstr
-
local tests = {
{
name = 'for valid string',
@@ -273,4 +291,72 @@ describe('mbyte', function()
eq(expected_offsets, { b = b_offsets, e = e_offsets })
end)
end)
+
+ itp('utf_head_off', function()
+ local function check(str, expected_glyphs) -- segments `str` with utfc_ptr2len, then checks utf_head_off at every byte
+ local len = #str
+ local cstr = to_cstr(str)
+ local breaks = { 0 } -- byte offsets where a character starts; offset 0 is start of text (SOT)
+ local pos = 0
+ local mb_glyphs = {}
+ while pos < len do
+ local clen = lib.utfc_ptr2len(cstr + pos)
+ if clen == 0 then
+ eq(0, string.byte(str, pos + 1)) -- only NUL bytes can have length zero
+ clen = 1 -- but skip it, otherwise we get stuck
+ end
+ if clen > 1 then -- only multi-byte characters are collected for comparison with expected_glyphs
+ table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
+ end
+ pos = pos + clen
+ table.insert(breaks, pos)
+ end
+ eq(breaks[#breaks], len) -- the last break must land exactly on end of text (EOT)
+ -- we could also pass in the expected breaks, but comparing glyphs is more human readable
+ eq(mb_glyphs, expected_glyphs)
+
+ for i = 1, #breaks - 1 do
+ local start, next = breaks[i], breaks[i + 1]
+
+ for p = start, next - 1 do
+ eq(p - start, lib.utf_head_off(cstr, cstr + p)) -- expected value is p's distance from the start of its character
+ end
+ end
+ eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
+ end
+ -- stylua doesn't like ZWJ chars..
+ -- stylua: ignore start
+ check('hej och hΓ₯ πŸ§‘β€πŸŒΎ!', { 'Γ₯', 'πŸ§‘β€πŸŒΎ' })
+
+ -- emoji (various kinds of combinations, use g8 to see them)
+ check("πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ", {"πŸ³οΈβ€βš§οΈ", "πŸ§‘β€πŸŒΎ", "❀️", "πŸ˜‚", "πŸ΄β€β˜ οΈ"})
+ check('πŸ³οΈβ€βš§οΈxyπŸ§‘β€πŸŒΎ\rβ€οΈπŸ˜‚Γ₯πŸ΄β€β˜ οΈΒ€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' })
+ check('πŸ³οΈβ€βš§οΈ\000πŸ§‘β€πŸŒΎ\000❀️\000πŸ˜‚\000Γ₯\000πŸ΄β€β˜ οΈ\000Β€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' })
+ check('\195πŸ³οΈβ€βš§οΈ\198πŸ§‘β€πŸŒΎ\165❀️\168\195πŸ˜‚\255πŸ΄β€β˜ οΈ\129Β€\165', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'πŸ΄β€β˜ οΈ', 'Β€' })
+
+ check('πŸ‡¦πŸ…±οΈ πŸ‡¦πŸ‡½ πŸ‡¦πŸ‡¨πŸ‡¦ πŸ‡²πŸ‡½πŸ‡ΉπŸ‡±',{'πŸ‡¦', 'πŸ…±οΈ', 'πŸ‡¦πŸ‡½', 'πŸ‡¦πŸ‡¨', 'πŸ‡¦', 'πŸ‡²πŸ‡½', 'πŸ‡ΉπŸ‡±'})
+ check('🏴󠁧󠁒󠁳󠁣󠁴󠁿🏴󠁧󠁒󠁷󠁬󠁳󠁿', {'🏴󠁧󠁒󠁳󠁣󠁴󠁿', '🏴󠁧󠁒󠁷󠁬󠁳󠁿'})
+
+ check('Γ₯\165ΓΌ\195aΓ«q\168Ξ²\000\169本\255', {'Γ₯', 'ΓΌ', 'Γ«', 'Ξ²', '本'})
+
+ lib.p_arshape = true -- default
+ check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„Ψ§', 'Ω…' })
+ lib.p_arshape = false
+ check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„', 'Ψ§', 'Ω…' })
+
+ check('LΜ“Μ‰Μ‘Μ’ΜŒΜšoΜŒΜ’Μ—Μ„Μ›Μ€rΜΜˆΜ•ΜˆΜŽΜè̇̅̄̄̐mΜ…Μ–ΜŸΜ„ΜŸΜš', {'LΜ“Μ‰Μ‘Μ’ΜŒΜš', 'oΜŒΜ’Μ—Μ„Μ›Μ€', 'rΜΜˆΜ•ΜˆΜŽΜ', 'è̇̅̄̄̐', 'mΜ…Μ–ΜŸΜ„ΜŸΜš'})
+ -- stylua: ignore end
+ end)
+
+ describe('utf_fold', function() -- every case below expects utf_fold to return its argument unchanged
+ itp('does not crash with surrogates #30527', function()
+ eq(0xddfb, lib.utf_fold(0xddfb)) -- low surrogate, invalid as a character
+ eq(0xd800, lib.utf_fold(0xd800)) -- high surrogate, invalid as a character
+ end)
+
+ itp("doesn't crash on invalid codepoints", function()
+ eq(9000000, lib.utf_fold(9000000)) -- 9000000 (0x895440) is above U+10FFFF, the largest Unicode code point
+ eq(0, lib.utf_fold(0)) -- NUL is expected to map to itself
+ end)
+ end)
end)