1 files changed, 154 insertions, 51 deletions
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index cd94624570..00a8c06ceb 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -1,8 +1,8 @@
-local helpers = require("test.unit.helpers")(after_each)
+local helpers = require('test.unit.helpers')(after_each)
 local itp = helpers.gen_itp(it)
 
-local ffi     = helpers.ffi
-local eq      = helpers.eq
+local ffi = helpers.ffi
+local eq = helpers.eq
 
 local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
 
@@ -16,13 +16,12 @@ describe('mbyte', function()
     return table.concat(s)
   end
 
-  before_each(function()
-  end)
+  before_each(function() end)
 
   itp('utf_ptr2char', function()
     -- For strings with length 1 the first byte is returned.
     for c = 0, 255 do
-      eq(c, lib.utf_ptr2char(to_string({c, 0})))
+      eq(c, lib.utf_ptr2char(to_string({ c, 0 })))
     end
 
     -- Some ill formed byte sequences that should not be recognized as UTF-8
@@ -48,126 +47,230 @@ describe('mbyte', function()
 
   describe('utfc_ptr2schar_len', function()
     local function test_seq(seq)
-      local firstc = ffi.new("int[1]")
-      local buf = ffi.new("char[32]")
+      local firstc = ffi.new('int[1]')
+      local buf = ffi.new('char[32]')
       lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
-      return {ffi.string(buf), firstc[0]}
+      return { ffi.string(buf), firstc[0] }
     end
 
     local function byte(val)
-      return {string.char(val), val}
+      return { string.char(val), val }
     end
 
     itp('1-byte sequences', function()
-      eq({'', 0}, test_seq{0})
+      eq({ '', 0 }, test_seq { 0 })
       for c = 1, 127 do
-        eq(byte(c), test_seq{c})
+        eq(byte(c), test_seq { c })
       end
       for c = 128, 255 do
-        eq({'', c}, test_seq{c})
+        eq({ '', c }, test_seq { c })
       end
     end)
 
     itp('2-byte sequences', function()
       -- No combining characters
-      eq(byte(0x7f), test_seq{0x7f, 0x7f})
+      eq(byte(0x7f), test_seq { 0x7f, 0x7f })
       -- No combining characters
-      eq(byte(0x7f), test_seq{0x7f, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0x80 })
 
       -- No UTF-8 sequence
-      eq({'', 0xc2}, test_seq{0xc2, 0x7f})
+      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f })
       -- One UTF-8 character
-      eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80 })
       -- No UTF-8 sequence
-      eq({'', 0xc2}, test_seq{0xc2, 0xc0})
+      eq({ '', 0xc2 }, test_seq { 0xc2, 0xc0 })
     end)
 
     itp('3-byte sequences', function()
       -- No second UTF-8 character
-      eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0x80, 0x80 })
       -- No combining character
-      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
 
       -- Combining character is U+0300
-      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
+      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
 
       -- No UTF-8 sequence
-      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
+      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
       -- Incomplete combining character
-      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc })
 
       -- One UTF-8 character (composing only)
-      eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
+      eq({ ' \xe2\x83\x90', 0x20d0 }, test_seq { 0xe2, 0x83, 0x90 })
     end)
 
     itp('4-byte sequences', function()
-
       -- No following combining character
-      eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
       -- No second UTF-8 character
-      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
 
       -- Combining character U+0300
-      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
+      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
 
       -- No UTF-8 sequence
-      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
+      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
       -- No following UTF-8 character
-      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
       -- Combining character U+0301
-      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
+      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
 
       -- One UTF-8 character
-      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
+      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
     end)
 
     itp('5+-byte sequences', function()
       -- No following combining character
-      eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80, 0x80 })
       -- No second UTF-8 character
-      eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
+      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
 
       -- Combining character U+0300
-      eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
+      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
 
       -- Combining characters U+0300 and U+0301
-      eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
+      eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
       -- Combining characters U+0300, U+0301, U+0302
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
+        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+      )
       -- Combining characters U+0300, U+0301, U+0302, U+0303
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
+        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+      )
       -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
+        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+      )
       -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
+        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+      )
 
       -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+        test_seq {
+          0x7f,
+          0xcc,
+          0x80,
+          0xcc,
+          0x81,
+          0xcc,
+          0x82,
+          0xcc,
+          0x83,
+          0xcc,
+          0x84,
+          0xcc,
+          0x85,
+          0xcc,
+          0x86,
+        }
+      )
 
       -- Only three following combining characters U+0300, U+0301, U+0302
-      eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
+      eq(
+        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
+        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+      )
 
       -- No UTF-8 sequence
-      eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
+      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
       -- No following UTF-8 character
-      eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
       -- Combining character U+0301
-      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
+      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
       -- Combining character U+0301
-      eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
+      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
 
       -- One UTF-8 character
-      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
+      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
 
       -- One UTF-8 character
-      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
+      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x80 })
       -- One UTF-8 character
-      eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
+      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0xcc })
 
       -- Combining characters U+1AB0 and U+0301
-      eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
+      eq(
+        { '\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81', 0x100000 },
+        test_seq { 0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81 }
+      )
     end)
-
   end)
 
+  describe('utf_cp_bounds_len', function()
+    local to_cstr = helpers.to_cstr
+
+    local tests = {
+      {
+        name = 'for valid string',
+        str = 'iÀiiⱠiⱠⱠ𐀀i',
+        offsets = {
+          b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
+          e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
+        },
+      },
+      {
+        name = 'for string with incomplete sequence',
+        str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80',
+        offsets = {
+          b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
+          e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
+        },
+      },
+      {
+        name = 'for string with trailing bytes after multibyte',
+        str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i',
+        offsets = {
+          b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
+          e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
+        },
+      },
+    }
+
+    for _, test in ipairs(tests) do
+      itp(test.name, function()
+        local cstr = to_cstr(test.str)
+        local b_offsets, e_offsets = {}, {}
+        for i = 1, #test.str do
+          local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1))
+          table.insert(b_offsets, result.begin_off)
+          table.insert(e_offsets, result.end_off)
+        end
+        eq(test.offsets, { b = b_offsets, e = e_offsets })
+      end)
+    end
+
+    itp('does not read before start', function()
+      local str = '𐀀'
+      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+      local cstr = to_cstr(str) + 1
+      local b_offsets, e_offsets = {}, {}
+      for i = 1, 3 do
+        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+        table.insert(b_offsets, result.begin_off)
+        table.insert(e_offsets, result.end_off)
+      end
+      eq(expected_offsets, { b = b_offsets, e = e_offsets })
+    end)
+
+    itp('does not read past the end', function()
+      local str = '𐀀'
+      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+      local cstr = to_cstr(str)
+      local b_offsets, e_offsets = {}, {}
+      for i = 1, 3 do
+        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+        table.insert(b_offsets, result.begin_off)
+        table.insert(e_offsets, result.end_off)
+      end
+      eq(expected_offsets, { b = b_offsets, e = e_offsets })
+    end)
+  end)
 end)