diff options
author | Tristan Knight <admin@snappeh.com> | 2024-10-23 14:33:57 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-10-23 06:33:57 -0700 |
commit | 230b0c7f021a57647a658edce27fe115343f083f (patch) | |
tree | 49bcf13151da2bc140408ce2fb173b782614a0ca | |
parent | 3a86b60032bd659c2b12e984abb40cee93568558 (diff) | |
download | rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.gz rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.bz2 rneovim-230b0c7f021a57647a658edce27fe115343f083f.zip |
feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735
PROBLEM:
There are several limitations to vim.str_byteindex, vim.str_utfindex:
1. They throw when given out-of-range indexes. An invalid (often user/LSP-provided)
index doesn't feel exceptional and should be handled by the caller.
`:help dev-error-patterns` suggests that `retval, errmsg` is the preferred
way to handle this kind of failure.
2. They cannot accept an encoding. So LSP needs wrapper functions. #25272
3. The current signatures are not extensible.
* Calling: The function currently uses a fairly opaque boolean value to
identify the encoding.
* Returns: The fact it can throw requires wrapping in pcall.
4. The current name doesn't follow suggestions in `:h dev-naming` and I think
`get` would be suitable.
SOLUTION:
- Because these are performance-sensitive, don't introduce `opts`.
- Introduce an "overload" that accepts `encoding:string` and
`strict_indexing:bool` params.
```lua
local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]])
```
Support the old versions by dispatching on the type of argument 2, and
deprecate that form.
```lua
vim.str_utfindex(line) -- (utf-32 length, utf-16 length), deprecated
vim.str_utfindex(line, index) -- (utf-32 index, utf-16 index), deprecated
vim.str_utfindex(line, 'utf-16') -- utf-16 length
vim.str_utfindex(line, 'utf-16', index) -- utf-16 index
vim.str_utfindex(line, 'utf-16', math.huge) -- error: index out of range
vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length
```
-rw-r--r-- | runtime/doc/lua.txt | 71 | ||||
-rw-r--r-- | runtime/lua/vim/_editor.lua | 128 | ||||
-rw-r--r-- | runtime/lua/vim/_meta/builtin.lua | 25 | ||||
-rw-r--r-- | src/nvim/lua/stdlib.c | 14 | ||||
-rw-r--r-- | test/functional/lua/vim_spec.lua | 113 |
5 files changed, 283 insertions, 68 deletions
diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt index 9d422026e6..c873523d9d 100644 --- a/runtime/doc/lua.txt +++ b/runtime/doc/lua.txt @@ -1000,22 +1000,6 @@ vim.schedule({fn}) *vim.schedule()* Parameters: ~ • {fn} (`fun()`) -vim.str_byteindex({str}, {index}, {use_utf16}) *vim.str_byteindex()* - Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not - supplied, it defaults to false (use UTF-32). Returns the byte index. - - Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index} - in the middle of a UTF-16 sequence is rounded upwards to the end of that - sequence. - - Parameters: ~ - • {str} (`string`) - • {index} (`integer`) - • {use_utf16} (`boolean?`) - - Return: ~ - (`integer`) - vim.str_utf_end({str}, {index}) *vim.str_utf_end()* Gets the distance (in bytes) from the last byte of the codepoint (character) that {index} points to. @@ -1073,23 +1057,6 @@ vim.str_utf_start({str}, {index}) *vim.str_utf_start()* Return: ~ (`integer`) -vim.str_utfindex({str}, {index}) *vim.str_utfindex()* - Convert byte index to UTF-32 and UTF-16 indices. If {index} is not - supplied, the length of the string is used. All indices are zero-based. - - Embedded NUL bytes are treated as terminating the string. Invalid UTF-8 - bytes, and embedded surrogates are counted as one code point each. An - {index} in the middle of a UTF-8 sequence is rounded upwards to the end of - that sequence. - - Parameters: ~ - • {str} (`string`) - • {index} (`integer?`) - - Return (multiple): ~ - (`integer`) UTF-32 index - (`integer`) UTF-16 index - vim.stricmp({a}, {b}) *vim.stricmp()* Compares strings case-insensitively. @@ -1776,6 +1743,44 @@ vim.schedule_wrap({fn}) *vim.schedule_wrap()* • |vim.schedule()| • |vim.in_fast_event()| + *vim.str_byteindex()* +vim.str_byteindex({s}, {encoding}, {index}, {strict_indexing}) + Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. 
If + {strict_indexing} is false then then an out of range index will return + byte length instead of throwing an error. + + Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index} + in the middle of a UTF-16 sequence is rounded upwards to the end of that + sequence. + + Parameters: ~ + • {s} (`string`) + • {encoding} (`"utf-8"|"utf-16"|"utf-32"`) + • {index} (`integer`) + • {strict_indexing} (`boolean?`) default: true + + Return: ~ + (`integer`) + + *vim.str_utfindex()* +vim.str_utfindex({s}, {encoding}, {index}, {strict_indexing}) + Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not + supplied, the length of the string is used. All indices are zero-based. + + If {strict_indexing} is false then an out of range index will return + string length instead of throwing an error. Invalid UTF-8 bytes, and + embedded surrogates are counted as one code point each. An {index} in the + middle of a UTF-8 sequence is rounded upwards to the end of that sequence. + + Parameters: ~ + • {s} (`string`) + • {encoding} (`"utf-8"|"utf-16"|"utf-32"`) + • {index} (`integer?`) + • {strict_indexing} (`boolean?`) default: true + + Return: ~ + (`integer`) + vim.system({cmd}, {opts}, {on_exit}) *vim.system()* Runs a system command or throws an error if {cmd} cannot be run. diff --git a/runtime/lua/vim/_editor.lua b/runtime/lua/vim/_editor.lua index 58283ac64b..496bbf747c 100644 --- a/runtime/lua/vim/_editor.lua +++ b/runtime/lua/vim/_editor.lua @@ -68,6 +68,12 @@ vim.log = { }, } +local utfs = { + ['utf-8'] = true, + ['utf-16'] = true, + ['utf-32'] = true, +} + -- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit}) --- Runs a system command or throws an error if {cmd} cannot be run. --- @@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf) end end ---- Generates a list of possible completions for the string. +--- Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. 
+--- If {strict_indexing} is false +--- then then an out of range index will return byte length +--- instead of throwing an error. +--- +--- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. +--- An {index} in the middle of a UTF-16 sequence is rounded upwards to +--- the end of that sequence. +---@param s string +---@param encoding "utf-8"|"utf-16"|"utf-32" +---@param index integer +---@param strict_indexing? boolean # default: true +---@return integer +function vim.str_byteindex(s, encoding, index, strict_indexing) + if type(encoding) == 'number' then + -- Legacy support for old API + -- Parameters: ~ + -- • {str} (`string`) + -- • {index} (`integer`) + -- • {use_utf16} (`boolean?`) + local old_index = encoding + local use_utf16 = index or false + return vim.__str_byteindex(s, old_index, use_utf16) or error('index out of range') + end + + vim.validate('s', s, 'string') + vim.validate('index', index, 'number') + + local len = #s + + if index == 0 or len == 0 then + return 0 + end + + vim.validate('encoding', encoding, function(v) + return utfs[v], 'invalid encoding' + end) + + vim.validate('strict_indexing', strict_indexing, 'boolean', true) + if strict_indexing == nil then + strict_indexing = true + end + + if encoding == 'utf-8' then + if index > len then + return strict_indexing and error('index out of range') or len + end + return index + end + return vim.__str_byteindex(s, index, encoding == 'utf-16') + or strict_indexing and error('index out of range') + or len +end + +--- Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not +--- supplied, the length of the string is used. All indices are zero-based. +--- +--- If {strict_indexing} is false then an out of range index will return string +--- length instead of throwing an error. +--- Invalid UTF-8 bytes, and embedded surrogates are counted as one code point +--- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the end of +--- that sequence. 
+---@param s string +---@param encoding "utf-8"|"utf-16"|"utf-32" +---@param index? integer +---@param strict_indexing? boolean # default: true +---@return integer +function vim.str_utfindex(s, encoding, index, strict_indexing) + if encoding == nil or type(encoding) == 'number' then + -- Legacy support for old API + -- Parameters: ~ + -- • {str} (`string`) + -- • {index} (`integer?`) + local old_index = encoding + local col32, col16 = vim.__str_utfindex(s, old_index) --[[@as integer,integer]] + if not col32 or not col16 then + error('index out of range') + end + -- Return (multiple): ~ + -- (`integer`) UTF-32 index + -- (`integer`) UTF-16 index + return col32, col16 + end + + vim.validate('s', s, 'string') + vim.validate('index', index, 'number', true) + if not index then + index = math.huge + strict_indexing = false + end + + if index == 0 then + return 0 + end + + vim.validate('encoding', encoding, function(v) + return utfs[v], 'invalid encoding' + end) + + vim.validate('strict_indexing', strict_indexing, 'boolean', true) + if strict_indexing == nil then + strict_indexing = true + end + + if encoding == 'utf-8' then + local len = #s + return index <= len and index or (strict_indexing and error('index out of range') or len) + end + local col32, col16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]] + local col = encoding == 'utf-16' and col16 or col32 + if col then + return col + end + if strict_indexing then + error('index out of range') + end + local max32, max16 = vim.__str_utfindex(s)--[[@as integer integer]] + return encoding == 'utf-16' and max16 or max32 +end + +--- Generates a list of possible completions for the str --- String has the pattern. --- --- 1. 
Can we get it to just return things in the global namespace with that name prefix diff --git a/runtime/lua/vim/_meta/builtin.lua b/runtime/lua/vim/_meta/builtin.lua index 13bd1c1294..234c75d38f 100644 --- a/runtime/lua/vim/_meta/builtin.lua +++ b/runtime/lua/vim/_meta/builtin.lua @@ -112,18 +112,6 @@ function vim.rpcrequest(channel, method, ...) end --- equal, {a} is greater than {b} or {a} is lesser than {b}, respectively. function vim.stricmp(a, b) end ---- Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not ---- supplied, it defaults to false (use UTF-32). Returns the byte index. ---- ---- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. ---- An {index} in the middle of a UTF-16 sequence is rounded upwards to ---- the end of that sequence. ---- @param str string ---- @param index integer ---- @param use_utf16? boolean ---- @return integer -function vim.str_byteindex(str, index, use_utf16) end - --- Gets a list of the starting byte positions of each UTF-8 codepoint in the given string. --- --- Embedded NUL bytes are treated as terminating the string. @@ -173,19 +161,6 @@ function vim.str_utf_start(str, index) end --- @return integer function vim.str_utf_end(str, index) end ---- Convert byte index to UTF-32 and UTF-16 indices. If {index} is not ---- supplied, the length of the string is used. All indices are zero-based. ---- ---- Embedded NUL bytes are treated as terminating the string. Invalid UTF-8 ---- bytes, and embedded surrogates are counted as one code point each. An ---- {index} in the middle of a UTF-8 sequence is rounded upwards to the end of ---- that sequence. ---- @param str string ---- @param index? integer ---- @return integer # UTF-32 index ---- @return integer # UTF-16 index -function vim.str_utfindex(str, index) end - --- The result is a String, which is the text {str} converted from --- encoding {from} to encoding {to}. When the conversion fails `nil` is --- returned. 
When some characters could not be converted they diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c index ee0eabbebb..bf8b085458 100644 --- a/src/nvim/lua/stdlib.c +++ b/src/nvim/lua/stdlib.c @@ -181,7 +181,9 @@ int nlua_str_utfindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL } else { idx = luaL_checkinteger(lstate, 2); if (idx < 0 || idx > (intptr_t)s1_len) { - return luaL_error(lstate, "index out of range"); + lua_pushnil(lstate); + lua_pushnil(lstate); + return 2; } } @@ -272,7 +274,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL const char *s1 = luaL_checklstring(lstate, 1, &s1_len); intptr_t idx = luaL_checkinteger(lstate, 2); if (idx < 0) { - return luaL_error(lstate, "index out of range"); + lua_pushnil(lstate); + return 1; } bool use_utf16 = false; if (lua_gettop(lstate) >= 3) { @@ -281,7 +284,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL ssize_t byteidx = mb_utf_index_to_bytes(s1, s1_len, (size_t)idx, use_utf16); if (byteidx == -1) { - return luaL_error(lstate, "index out of range"); + lua_pushnil(lstate); + return 1; } lua_pushinteger(lstate, (lua_Integer)byteidx); @@ -695,10 +699,10 @@ void nlua_state_add_stdlib(lua_State *const lstate, bool is_thread) lua_setfield(lstate, -2, "stricmp"); // str_utfindex lua_pushcfunction(lstate, &nlua_str_utfindex); - lua_setfield(lstate, -2, "str_utfindex"); + lua_setfield(lstate, -2, "__str_utfindex"); // str_byteindex lua_pushcfunction(lstate, &nlua_str_byteindex); - lua_setfield(lstate, -2, "str_byteindex"); + lua_setfield(lstate, -2, "__str_byteindex"); // str_utf_pos lua_pushcfunction(lstate, &nlua_str_utf_pos); lua_setfield(lstate, -2, "str_utf_pos"); diff --git a/test/functional/lua/vim_spec.lua b/test/functional/lua/vim_spec.lua index b32712860a..13e146a9da 100644 --- a/test/functional/lua/vim_spec.lua +++ b/test/functional/lua/vim_spec.lua @@ -312,21 +312,106 @@ describe('lua stdlib', function() 49, 51, } + local indices8 = { + [0] = 0, + 1, + 
2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + } for i, k in pairs(indices32) do eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ...)', i), i) + eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., false)', i), i) + eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", ...)', i), i) end for i, k in pairs(indices16) do eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., true)', i), i) + eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", ...)', i), i) end - eq( + for i, k in pairs(indices8) do + eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", ...)', i), i) + end + matches( 'index out of range', pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ...)', #indices32 + 1) ) - eq( + matches( 'index out of range', pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ..., true)', #indices16 + 1) ) - local i32, i16 = 0, 0 + matches( + 'index out of range', + pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-16", ...)', #indices16 + 1) + ) + matches( + 'index out of range', + pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-32", ...)', #indices32 + 1) + ) + matches( + 'invalid encoding', + pcall_err(exec_lua, 'return vim.str_byteindex("hello", "madeupencoding", 1)') + ) + eq( + indices32[#indices32], + exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", 99999, false)') + ) + eq( + indices16[#indices16], + exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", 99999, false)') + ) + eq( + indices8[#indices8], + exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", 99999, false)') + ) + eq(2, exec_lua('return vim.str_byteindex("é", "utf-16", 2, false)')) + local i32, i16, i8 = 0, 0, 0 local len = 51 for k = 0, len do 
if indices32[i32] < k then @@ -338,9 +423,29 @@ describe('lua stdlib', function() i16 = i16 + 1 end end + if indices8[i8] < k then + i8 = i8 + 1 + end eq({ i32, i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, ...)}', k), k) + eq({ i32 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-32", ...)}', k), k) + eq({ i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-16", ...)}', k), k) + eq({ i8 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-8", ...)}', k), k) end - eq( + + eq({ #indices32, #indices16 }, exec_lua('return {vim.str_utfindex(_G.test_text)}')) + + eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32", math.huge, false)')) + eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16", math.huge, false)')) + eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8", math.huge, false)')) + + eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32")')) + eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16")')) + eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8")')) + matches( + 'invalid encoding', + pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, "madeupencoding", ...)', 1) + ) + matches( 'index out of range', pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, ...)', len + 1) ) |