-- test/unit/mbyte_spec.lua

local helpers = require("test.unit.helpers")(after_each)
local itp = helpers.gen_itp(it)

local ffi     = helpers.ffi
local eq      = helpers.eq

local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')

describe('mbyte', function()
  -- Build a Lua string out of an array of byte values.
  -- `bytes` is a sequence of integers; each one is turned into a one-byte
  -- string with string.char and the pieces are joined in order.
  local function to_string(bytes)
    local pieces = {}
    local count = #bytes
    for idx = 1, count do
      pieces[#pieces + 1] = string.char(bytes[idx])
    end
    return table.concat(pieces)
  end

  -- Per-test setup hook; its body is empty, so it currently does nothing.
  before_each(function() end)

  itp('utf_ptr2char', function()
    -- A single byte followed by a 0 byte is expected to decode to the value
    -- of that first byte, for every possible byte value.
    for first_byte = 0, 255 do
      local input = to_string({first_byte, 0})
      eq(first_byte, lib.utf_ptr2char(input))
    end

    -- Pending cases, kept disabled as in the original:
    --
    -- Some ill formed byte sequences that should not be recognized as UTF-8
    -- First byte: 0xc0 or 0xc1
    -- Second byte: 0x80 .. 0xbf
    --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
    --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
    --
    -- Sequences with more than four bytes
  end)

  -- Generate one test per 0x1000-wide slice of 0x0000 .. 0xFFFF, so each
  -- generated test walks 4096 consecutive character values.
  for n = 0, 0xF do
    local first = n * 0x1000
    local last = first + 0xFFF
    local title = ('utf_char2bytes for chars 0x%x - 0x%x'):format(first, last)

    itp(title, function()
      local new_buf = ffi.typeof('char[?]')
      for c = first, last do
        -- Fresh 4-element char array, initialised from 0, for every character.
        local encoded = new_buf(4, 0)
        lib.utf_char2bytes(c, encoded)
        -- Decoding what was just encoded must give back the same value.
        eq(c, lib.utf_ptr2char(encoded))
        -- The value-based and pointer-based word-character checks must agree.
        eq(lib.vim_iswordc(c), lib.vim_iswordp(encoded))
      end
    end)
  end

  describe('utfc_ptr2schar_len', function()
    -- Feed a byte sequence to lib.utfc_ptr2schar_len and collect the outcome.
    -- `seq` is an array of byte values; its length is passed as the length
    -- argument. Returns a two-element table: the string read back from the
    -- buffer filled by lib.schar_get, and the int written through the
    -- `firstc` out-parameter.
    local function test_seq(seq)
      local first_char = ffi.new("int[1]")
      local out = ffi.new("char[32]")
      local input = to_string(seq)
      local schar = lib.utfc_ptr2schar_len(input, #seq, first_char)
      lib.schar_get(out, schar)
      return {ffi.string(out), first_char[0]}
    end

    -- Expected-result helper: pairs the one-byte string for `val` with `val`
    -- itself, in the same {string, number} shape that test_seq returns.
    local function byte(val)
      local ch = string.char(val)
      return {ch, val}
    end

    itp('1-byte sequences', function()
      for c = 0, 255 do
        local expected
        if c == 0 then
          -- A lone 0 byte: empty string, first char 0.
          expected = {'', 0}
        elseif c <= 127 then
          -- 1 .. 127: the byte comes back unchanged.
          expected = byte(c)
        else
          -- 128 .. 255 on their own: empty string, first char is the byte value.
          expected = {'', c}
        end
        eq(expected, test_seq({c}))
      end
    end)

    itp('2-byte sequences', function()
      -- Each case pairs the expected test_seq result with the input bytes.
      local cases = {
        -- No combining characters
        { want = byte(0x7f), bytes = {0x7f, 0x7f} },
        -- No combining characters
        { want = byte(0x7f), bytes = {0x7f, 0x80} },

        -- No UTF-8 sequence
        { want = {'', 0xc2}, bytes = {0xc2, 0x7f} },
        -- One UTF-8 character
        { want = {'\xc2\x80', 0x80}, bytes = {0xc2, 0x80} },
        -- No UTF-8 sequence
        { want = {'', 0xc2}, bytes = {0xc2, 0xc0} },
      }

      for i = 1, #cases do
        eq(cases[i].want, test_seq(cases[i].bytes))
      end
    end)

    itp('3-byte sequences', function()
      -- Each case pairs the expected test_seq result with the input bytes.
      local cases = {
        -- No second UTF-8 character
        { want = byte(0x7f), bytes = {0x7f, 0x80, 0x80} },
        -- No combining character
        { want = byte(0x7f), bytes = {0x7f, 0xc2, 0x80} },

        -- Combining character is U+0300
        { want = {"\x7f\xcc\x80", 0x7f}, bytes = {0x7f, 0xcc, 0x80} },

        -- No UTF-8 sequence
        { want = {'', 0xc2}, bytes = {0xc2, 0x7f, 0xcc} },
        -- Incomplete combining character
        { want = {"\xc2\x80", 0x80}, bytes = {0xc2, 0x80, 0xcc} },

        -- One UTF-8 character (composing only)
        { want = {" \xe2\x83\x90", 0x20d0}, bytes = {0xe2, 0x83, 0x90} },
      }

      for i = 1, #cases do
        eq(cases[i].want, test_seq(cases[i].bytes))
      end
    end)

    itp('4-byte sequences', function()
      -- Each case pairs the expected test_seq result with the input bytes.
      local cases = {
        -- No following combining character
        { want = byte(0x7f), bytes = {0x7f, 0x7f, 0xcc, 0x80} },
        -- No second UTF-8 character
        { want = byte(0x7f), bytes = {0x7f, 0xc2, 0xcc, 0x80} },

        -- Combining character U+0300
        { want = {"\x7f\xcc\x80", 0x7f}, bytes = {0x7f, 0xcc, 0x80, 0xcc} },

        -- No UTF-8 sequence
        { want = {'', 0xc2}, bytes = {0xc2, 0x7f, 0xcc, 0x80} },
        -- No following UTF-8 character
        { want = {"\xc2\x80", 0x80}, bytes = {0xc2, 0x80, 0xcc, 0xcc} },
        -- Combining character U+0301
        { want = {"\xc2\x80\xcc\x81", 0x80}, bytes = {0xc2, 0x80, 0xcc, 0x81} },

        -- One UTF-8 character
        { want = {"\xf4\x80\x80\x80", 0x100000}, bytes = {0xf4, 0x80, 0x80, 0x80} },
      }

      for i = 1, #cases do
        eq(cases[i].want, test_seq(cases[i].bytes))
      end
    end)

    itp('5+-byte sequences', function()
      -- Each case pairs the expected test_seq result with the input bytes.
      local cases = {
        -- No following combining character
        { want = byte(0x7f), bytes = {0x7f, 0x7f, 0xcc, 0x80, 0x80} },
        -- No second UTF-8 character
        { want = byte(0x7f), bytes = {0x7f, 0xc2, 0xcc, 0x80, 0x80} },

        -- Combining character U+0300
        { want = {"\x7f\xcc\x80", 0x7f}, bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x00} },

        -- Combining characters U+0300 and U+0301
        {
          want = {"\x7f\xcc\x80\xcc\x81", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81},
        },
        -- Combining characters U+0300, U+0301, U+0302
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82},
        },
        -- Combining characters U+0300, U+0301, U+0302, U+0303
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83},
        },
        -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84},
        },
        -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85},
        },

        -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86},
        },

        -- Only three following combining characters U+0300, U+0301, U+0302
        {
          want = {"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f},
          bytes = {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85},
        },

        -- No UTF-8 sequence
        { want = {'', 0xc2}, bytes = {0xc2, 0x7f, 0xcc, 0x80, 0x80} },
        -- No following UTF-8 character
        { want = {"\xc2\x80", 0x80}, bytes = {0xc2, 0x80, 0xcc, 0xcc, 0x80} },
        -- Combining character U+0301
        { want = {"\xc2\x80\xcc\x81", 0x80}, bytes = {0xc2, 0x80, 0xcc, 0x81, 0x7f} },
        -- Combining character U+0301
        { want = {"\xc2\x80\xcc\x81", 0x80}, bytes = {0xc2, 0x80, 0xcc, 0x81, 0xcc} },

        -- One UTF-8 character
        { want = {"\xf4\x80\x80\x80", 0x100000}, bytes = {0xf4, 0x80, 0x80, 0x80, 0x7f} },

        -- One UTF-8 character
        { want = {"\xf4\x80\x80\x80", 0x100000}, bytes = {0xf4, 0x80, 0x80, 0x80, 0x80} },
        -- One UTF-8 character
        { want = {"\xf4\x80\x80\x80", 0x100000}, bytes = {0xf4, 0x80, 0x80, 0x80, 0xcc} },

        -- Combining characters U+1AB0 and U+0301
        {
          want = {"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000},
          bytes = {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81},
        },
      }

      for i = 1, #cases do
        eq(cases[i].want, test_seq(cases[i].bytes))
      end
    end)

  end)

end)