aboutsummaryrefslogtreecommitdiff
path: root/test/unit/mbyte_spec.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r--test/unit/mbyte_spec.lua205
1 files changed, 154 insertions, 51 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index cd94624570..00a8c06ceb 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -1,8 +1,8 @@
-local helpers = require("test.unit.helpers")(after_each)
+local helpers = require('test.unit.helpers')(after_each)
local itp = helpers.gen_itp(it)
-local ffi = helpers.ffi
-local eq = helpers.eq
+local ffi = helpers.ffi
+local eq = helpers.eq
local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
@@ -16,13 +16,12 @@ describe('mbyte', function()
return table.concat(s)
end
- before_each(function()
- end)
+ before_each(function() end)
itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned.
for c = 0, 255 do
- eq(c, lib.utf_ptr2char(to_string({c, 0})))
+ eq(c, lib.utf_ptr2char(to_string({ c, 0 })))
end
-- Some ill formed byte sequences that should not be recognized as UTF-8
@@ -48,126 +47,230 @@ describe('mbyte', function()
describe('utfc_ptr2schar_len', function()
local function test_seq(seq)
- local firstc = ffi.new("int[1]")
- local buf = ffi.new("char[32]")
+ local firstc = ffi.new('int[1]')
+ local buf = ffi.new('char[32]')
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
- return {ffi.string(buf), firstc[0]}
+ return { ffi.string(buf), firstc[0] }
end
local function byte(val)
- return {string.char(val), val}
+ return { string.char(val), val }
end
itp('1-byte sequences', function()
- eq({'', 0}, test_seq{0})
+ eq({ '', 0 }, test_seq { 0 })
for c = 1, 127 do
- eq(byte(c), test_seq{c})
+ eq(byte(c), test_seq { c })
end
for c = 128, 255 do
- eq({'', c}, test_seq{c})
+ eq({ '', c }, test_seq { c })
end
end)
itp('2-byte sequences', function()
-- No combining characters
- eq(byte(0x7f), test_seq{0x7f, 0x7f})
+ eq(byte(0x7f), test_seq { 0x7f, 0x7f })
-- No combining characters
- eq(byte(0x7f), test_seq{0x7f, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0x80 })
-- No UTF-8 sequence
- eq({'', 0xc2}, test_seq{0xc2, 0x7f})
+ eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f })
-- One UTF-8 character
- eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80 })
-- No UTF-8 sequence
- eq({'', 0xc2}, test_seq{0xc2, 0xc0})
+ eq({ '', 0xc2 }, test_seq { 0xc2, 0xc0 })
end)
itp('3-byte sequences', function()
-- No second UTF-8 character
- eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0x80, 0x80 })
-- No combining character
- eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
- eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
+ eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
- eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
+ eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
-- Incomplete combining character
- eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc })
-- One UTF-8 character (composing only)
- eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
+ eq({ ' \xe2\x83\x90', 0x20d0 }, test_seq { 0xe2, 0x83, 0x90 })
end)
itp('4-byte sequences', function()
-
-- No following combining character
- eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
-- No second UTF-8 character
- eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
- eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
+ eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
- eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
+ eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
- eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
- eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
+ eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
- eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
+ eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
end)
itp('5+-byte sequences', function()
-- No following combining character
- eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80, 0x80 })
-- No second UTF-8 character
- eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
+ eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
- eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
+ eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
- eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
+ eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
+ test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+ )
-- Combining characters U+0300, U+0301, U+0302, U+0303
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
+ test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+ )
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
+ test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+ )
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
+ test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+ )
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+ test_seq {
+ 0x7f,
+ 0xcc,
+ 0x80,
+ 0xcc,
+ 0x81,
+ 0xcc,
+ 0x82,
+ 0xcc,
+ 0x83,
+ 0xcc,
+ 0x84,
+ 0xcc,
+ 0x85,
+ 0xcc,
+ 0x86,
+ }
+ )
-- Only three following combining characters U+0300, U+0301, U+0302
- eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
+ eq(
+ { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
+ test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+ )
-- No UTF-8 sequence
- eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
+ eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
- eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
+ eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
- eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
+ eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
- eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
+ eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
- eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
+ eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
-- One UTF-8 character
- eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
+ eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x80 })
-- One UTF-8 character
- eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
+ eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0xcc })
-- Combining characters U+1AB0 and U+0301
- eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
+ eq(
+ { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
+ test_seq { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 }
+ )
end)
-
end)
+ describe('utf_cp_bounds_len', function()
+ local to_cstr = helpers.to_cstr
+
+ local tests = {
+ {
+ name = 'for valid string',
+ str = 'iÀiiⱠiⱠⱠ𐀀i',
+ offsets = {
+ b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
+ e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
+ },
+ },
+ {
+ name = 'for string with incomplete sequence',
+ str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80',
+ offsets = {
+ b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
+ e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
+ },
+ },
+ {
+ name = 'for string with trailing bytes after multibyte',
+ str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i',
+ offsets = {
+ b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
+ e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
+ },
+ },
+ }
+
+ for _, test in ipairs(tests) do
+ itp(test.name, function()
+ local cstr = to_cstr(test.str)
+ local b_offsets, e_offsets = {}, {}
+ for i = 1, #test.str do
+ local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1))
+ table.insert(b_offsets, result.begin_off)
+ table.insert(e_offsets, result.end_off)
+ end
+ eq(test.offsets, { b = b_offsets, e = e_offsets })
+ end)
+ end
+
+ itp('does not read before start', function()
+ local str = '𐀀'
+ local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+ local cstr = to_cstr(str) + 1
+ local b_offsets, e_offsets = {}, {}
+ for i = 1, 3 do
+ local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+ table.insert(b_offsets, result.begin_off)
+ table.insert(e_offsets, result.end_off)
+ end
+ eq(expected_offsets, { b = b_offsets, e = e_offsets })
+ end)
+
+ itp('does not read past the end', function()
+ local str = '𐀀'
+ local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+ local cstr = to_cstr(str)
+ local b_offsets, e_offsets = {}, {}
+ for i = 1, 3 do
+ local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+ table.insert(b_offsets, result.begin_off)
+ table.insert(e_offsets, result.end_off)
+ end
+ eq(expected_offsets, { b = b_offsets, e = e_offsets })
+ end)
+ end)
end)