diff options
author | bfredl <bjorn.linse@gmail.com> | 2023-11-06 14:52:27 +0100 |
---|---|---|
committer | bfredl <bjorn.linse@gmail.com> | 2023-11-17 12:58:57 +0100 |
commit | b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254 (patch) | |
tree | 434ec27e069ba57406ce9f6d194627e95c3d315c /test/unit/mbyte_spec.lua | |
parent | 20ec4c776a07492c2e3b995e10b40b1cdb52bc7a (diff) | |
download | rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.gz rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.bz2 rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.zip |
refactor(grid): make screen rendering more multibyte than ever before
Problem: buffer text with composing chars is converted from UTF-8
to an array of up to seven UTF-32 values and then converted back
to UTF-8 strings.
Solution: Convert buffer text directly to UTF-8 based schar_T values.
The limit of the text size is now in schar_T bytes, which is currently
31+1 but could easily be raised, as it no longer multiplies the size
of the entire screen grid when not used; the full size is only required
for temporary scratch buffers.
Also does some general cleanup to win_line text handling, which was
unnecessarily complicated due to multibyte rendering being an "opt-in"
feature long ago. Nowadays, a char is just a char, regardless of whether it consists
of one ASCII byte or multiple bytes.
Diffstat (limited to 'test/unit/mbyte_spec.lua')
-rw-r--r-- | test/unit/mbyte_spec.lua | 243 |
1 files changed, 64 insertions, 179 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index fdb1bceab0..cd94624570 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it) local ffi = helpers.ffi local eq = helpers.eq -local mbyte = helpers.cimport("./src/nvim/mbyte.h") -local charset = helpers.cimport('./src/nvim/charset.h') +local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') describe('mbyte', function() - -- Array for composing characters - local intp = ffi.typeof('int[?]') - local function to_intp() - -- how to get MAX_MCO from globals.h? - return intp(7, 1) - end - -- Convert from bytes to string local function to_string(bytes) local s = {} @@ -30,14 +22,14 @@ describe('mbyte', function() itp('utf_ptr2char', function() -- For strings with length 1 the first byte is returned. for c = 0, 255 do - eq(c, mbyte.utf_ptr2char(to_string({c, 0}))) + eq(c, lib.utf_ptr2char(to_string({c, 0}))) end -- Some ill formed byte sequences that should not be recognized as UTF-8 -- First byte: 0xc0 or 0xc1 -- Second byte: 0x80 .. 
0xbf - --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80}))) - --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf}))) + --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80}))) + --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf}))) -- -- Sequences with more than four bytes end) @@ -47,240 +39,133 @@ describe('mbyte', function() local char_p = ffi.typeof('char[?]') for c = n * 0x1000, n * 0x1000 + 0xFFF do local p = char_p(4, 0) - mbyte.utf_char2bytes(c, p) - eq(c, mbyte.utf_ptr2char(p)) - eq(charset.vim_iswordc(c), charset.vim_iswordp(p)) + lib.utf_char2bytes(c, p) + eq(c, lib.utf_ptr2char(p)) + eq(lib.vim_iswordc(c), lib.vim_iswordp(p)) end end) end - describe('utfc_ptr2char_len', function() + describe('utfc_ptr2schar_len', function() + local function test_seq(seq) + local firstc = ffi.new("int[1]") + local buf = ffi.new("char[32]") + lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) + return {ffi.string(buf), firstc[0]} + end + + local function byte(val) + return {string.char(val), val} + end itp('1-byte sequences', function() - local pcc = to_intp() - for c = 0, 255 do - eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1)) - eq(0, pcc[0]) + eq({'', 0}, test_seq{0}) + for c = 1, 127 do + eq(byte(c), test_seq{c}) + end + for c = 128, 255 do + eq({'', c}, test_seq{c}) end end) itp('2-byte sequences', function() - local pcc = to_intp() -- No combining characters - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f}) -- No combining characters - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2)) - eq(0, 
pcc[0]) + eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0xc0}) end) itp('3-byte sequences', function() - local pcc = to_intp() - -- No second UTF-8 character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80}) -- No combining character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80}) -- Combining character is U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc}) -- Incomplete combining character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc}) - -- One UTF-8 character - pcc = to_intp() - eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3)) - eq(0, pcc[0]) + -- One UTF-8 character (composing only) + eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90}) end) itp('4-byte sequences', function() - local pcc = to_intp() -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, 
mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80}) end) itp('5+-byte sequences', function() - local pcc = to_intp() - -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00}) -- Combining characters U+0300 and U+0301 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, 
test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81}) -- Combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}) -- Combining characters U+0300, U+0301, U+0302, U+0303 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0000, pcc[4]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0000, pcc[5]) - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(1, pcc[6]) - - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305, U+0306, but only save six (= MAX_MCO). 
- pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(0x0001, pcc[6]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}) - -- Only three following combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}) + -- Only three following combining characters U+0300, U+0301, U+0302 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 
0x7f}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc}) -- Combining characters U+1AB0 and U+0301 - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string( - {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9)) - eq(0x1ab0, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}) end) end) |