-- Unit tests for the UTF-8 decoding helpers declared in src/nvim/mbyte.h.
--
-- Reconstructed from the patch series for test/unit/mbyte_spec.lua
-- (author: oni-link), with all three patches applied:
--   cfe4352897a3e9075185f9f3cf8ae9f9f4bbc944 (2016-04-15)
--     mbyte.c: Unittest for utfc_ptr2char_len()
--   a8fec15899c070195a2a29d8a44c20249199f258 (2016-04-17)
--     mbyte_spec.lua: Fix wording
--   cd00aa6ae475ecb41de9272fe7c35d8c5a0e8512 (2016-04-17)
--     mbyte_spec.lua: Fix indentation
local helpers = require("test.unit.helpers")

local ffi = helpers.ffi
local eq = helpers.eq

-- NOTE(review): `globals` is never referenced below. The cimport is kept in
-- case mbyte.h relies on declarations it pulls in -- confirm before removing.
local globals = helpers.cimport("./src/nvim/globals.h")
local mbyte = helpers.cimport("./src/nvim/mbyte.h")

describe('mbyte', function()

  -- Number of composing characters the tests expect to be stored. The tests
  -- below treat six as the limit ("only save six (= MAX_MCO)").
  -- how to get MAX_MCO from globals.h?
  local MAX_MCO = 6

  -- Array for composing characters
  local intp = ffi.typeof('int[?]')

  -- Returns a fresh int array with MAX_MCO + 1 slots, each initialised to 1.
  -- The 1 acts as a sentinel: a slot that still holds 1 after a call was not
  -- written by utfc_ptr2char_len(). The extra slot pcc[MAX_MCO] is asserted
  -- to keep its sentinel when six or more composing characters are decoded.
  local function to_intp()
    return intp(MAX_MCO + 1, 1)
  end

  -- Convert from bytes to string
  -- `bytes` is a list of integers in 0..255; the result is the Lua string
  -- made of exactly those bytes.
  local function to_string(bytes)
    -- Must be local: a bare `s = {}` would create a global shared by all tests.
    local s = {}
    for i = 1, #bytes do
      s[i] = string.char(bytes[i])
    end
    return table.concat(s)
  end

  it('utf_ptr2char', function()
    -- For strings with length 1 the first byte is returned.
    for c = 0, 255 do
      eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
    end

    -- Some ill formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
    --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)


  -- Each case calls utfc_ptr2char_len(bytes, pcc, len), checks the returned
  -- base character and then checks the composing characters stored in pcc.
  -- A fresh pcc is used per case so results from earlier cases cannot leak.
  describe('utfc_ptr2char_len', function()

    it('1-byte sequences', function()
      local pcc = to_intp()
      for c = 0, 255 do
        eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
        eq(0, pcc[0])
      end
    end)

    it('2-byte sequences', function()
      local pcc = to_intp()
      -- No combining characters
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
      eq(0, pcc[0])
      -- No combining characters
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
      eq(0, pcc[0])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
      eq(0, pcc[0])
      -- One UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
      eq(0, pcc[0])
      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
      eq(0, pcc[0])
    end)

    it('3-byte sequences', function()
      local pcc = to_intp()

      -- No second UTF-8 character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
      eq(0, pcc[0])
      -- No combining character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
      eq(0, pcc[0])

      -- Combining character is U+0300
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
      eq(0, pcc[0])
      -- Incomplete combining character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
      eq(0, pcc[0])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
      eq(0, pcc[0])
    end)

    it('4-byte sequences', function()
      local pcc = to_intp()

      -- No following combining character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- No second UTF-8 character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])

      -- Combining character U+0300
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
      eq(0, pcc[0])
      -- No following UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
      eq(0, pcc[0])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
      eq(0, pcc[0])
    end)

    it('5+-byte sequences', function()
      local pcc = to_intp()

      -- No following combining character
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- No second UTF-8 character
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])

      -- Combining character U+0300
      -- NOTE(review): only four bytes are supplied for a length of 5;
      -- presumably the fifth byte read is the string's terminating NUL --
      -- confirm this is intended.
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0000, pcc[1])

      -- Combining characters U+0300 and U+0301
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
      -- Combining characters U+0300, U+0301, U+0302
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0000, pcc[4])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0000, pcc[5])
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
      -- U+0305
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84,
         0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      -- Slot six still holds the sentinel from to_intp(): nothing was
      -- written past the six composing characters.
      eq(1, pcc[6])

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
      -- U+0305, U+0306, but only save six (= MAX_MCO).
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84,
         0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0303, pcc[3])
      eq(0x0304, pcc[4])
      eq(0x0305, pcc[5])
      eq(0x0001, pcc[6])

      -- Only three following combining characters U+0300, U+0301, U+0302
      pcc = to_intp()
      eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
        {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84,
         0xcc, 0x85}), pcc, 13))
      eq(0x0300, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0302, pcc[2])
      eq(0x0000, pcc[3])


      -- No UTF-8 sequence
      pcc = to_intp()
      eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- No following UTF-8 character
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])
      -- Combining character U+0301
      pcc = to_intp()
      eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
      eq(0x0301, pcc[0])
      eq(0x0000, pcc[1])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
      eq(0, pcc[0])

      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
      eq(0, pcc[0])
      -- One UTF-8 character
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
      eq(0, pcc[0])

      -- Combining characters U+1AB0 and U+0301
      pcc = to_intp()
      eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
        {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
      eq(0x1ab0, pcc[0])
      eq(0x0301, pcc[1])
      eq(0x0000, pcc[2])
    end)

  end)

end)