aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTristan Knight <admin@snappeh.com>2024-10-23 14:33:57 +0100
committerGitHub <noreply@github.com>2024-10-23 06:33:57 -0700
commit230b0c7f021a57647a658edce27fe115343f083f (patch)
tree49bcf13151da2bc140408ce2fb173b782614a0ca
parent3a86b60032bd659c2b12e984abb40cee93568558 (diff)
downloadrneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.gz
rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.bz2
rneovim-230b0c7f021a57647a658edce27fe115343f083f.zip
feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735
PROBLEM: There are several limitations to vim.str_byteindex, vim.str_utfindex: 1. They throw when given out-of-range indexes. An invalid (often user/lsp-provided) index doesn't feel exceptional and should be handled by the caller. `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred way to handle this kind of failure. 2. They cannot accept an encoding. So LSP needs wrapper functions. #25272 3. The current signatures are not extensible. * Calling: The function currently uses a fairly opaque boolean value to identify the encoding. * Returns: The fact that it can throw requires wrapping in pcall. 4. The current name doesn't follow suggestions in `:h dev-naming` and I think `get` would be suitable. SOLUTION: - Because these are performance-sensitive, don't introduce `opts`. - Introduce an "overload" that accepts `encoding:string` and `strict_indexing:bool` params. ```lua local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]]) ``` Support the old versions by dispatching on the type of argument 2, and deprecate that form. ```lua vim.str_utfindex(line) -- (utf-32 length, utf-16 length), deprecated vim.str_utfindex(line, index) -- (utf-32 index, utf-16 index), deprecated vim.str_utfindex(line, 'utf-16') -- utf-16 length vim.str_utfindex(line, 'utf-16', index) -- utf-16 index vim.str_utfindex(line, 'utf-16', math.huge) -- error: index out of range vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length ```
-rw-r--r--runtime/doc/lua.txt71
-rw-r--r--runtime/lua/vim/_editor.lua128
-rw-r--r--runtime/lua/vim/_meta/builtin.lua25
-rw-r--r--src/nvim/lua/stdlib.c14
-rw-r--r--test/functional/lua/vim_spec.lua113
5 files changed, 283 insertions, 68 deletions
diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt
index 9d422026e6..c873523d9d 100644
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -1000,22 +1000,6 @@ vim.schedule({fn}) *vim.schedule()*
Parameters: ~
• {fn} (`fun()`)
-vim.str_byteindex({str}, {index}, {use_utf16}) *vim.str_byteindex()*
- Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
- supplied, it defaults to false (use UTF-32). Returns the byte index.
-
- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
- in the middle of a UTF-16 sequence is rounded upwards to the end of that
- sequence.
-
- Parameters: ~
- • {str} (`string`)
- • {index} (`integer`)
- • {use_utf16} (`boolean?`)
-
- Return: ~
- (`integer`)
-
vim.str_utf_end({str}, {index}) *vim.str_utf_end()*
Gets the distance (in bytes) from the last byte of the codepoint
(character) that {index} points to.
@@ -1073,23 +1057,6 @@ vim.str_utf_start({str}, {index}) *vim.str_utf_start()*
Return: ~
(`integer`)
-vim.str_utfindex({str}, {index}) *vim.str_utfindex()*
- Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
- supplied, the length of the string is used. All indices are zero-based.
-
- Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
- bytes, and embedded surrogates are counted as one code point each. An
- {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
- that sequence.
-
- Parameters: ~
- • {str} (`string`)
- • {index} (`integer?`)
-
- Return (multiple): ~
- (`integer`) UTF-32 index
- (`integer`) UTF-16 index
-
vim.stricmp({a}, {b}) *vim.stricmp()*
Compares strings case-insensitively.
@@ -1776,6 +1743,44 @@ vim.schedule_wrap({fn}) *vim.schedule_wrap()*
• |vim.schedule()|
• |vim.in_fast_event()|
+ *vim.str_byteindex()*
+vim.str_byteindex({s}, {encoding}, {index}, {strict_indexing})
+ Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. If
+ {strict_indexing} is false then an out of range index will return
+ byte length instead of throwing an error.
+
+ Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
+ in the middle of a UTF-16 sequence is rounded upwards to the end of that
+ sequence.
+
+ Parameters: ~
+ • {s} (`string`)
+ • {encoding} (`"utf-8"|"utf-16"|"utf-32"`)
+ • {index} (`integer`)
+ • {strict_indexing} (`boolean?`) default: true
+
+ Return: ~
+ (`integer`)
+
+ *vim.str_utfindex()*
+vim.str_utfindex({s}, {encoding}, {index}, {strict_indexing})
+ Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
+ supplied, the length of the string is used. All indices are zero-based.
+
+ If {strict_indexing} is false then an out of range index will return
+ string length instead of throwing an error. Invalid UTF-8 bytes, and
+ embedded surrogates are counted as one code point each. An {index} in the
+ middle of a UTF-8 sequence is rounded upwards to the end of that sequence.
+
+ Parameters: ~
+ • {s} (`string`)
+ • {encoding} (`"utf-8"|"utf-16"|"utf-32"`)
+ • {index} (`integer?`)
+ • {strict_indexing} (`boolean?`) default: true
+
+ Return: ~
+ (`integer`)
+
vim.system({cmd}, {opts}, {on_exit}) *vim.system()*
Runs a system command or throws an error if {cmd} cannot be run.
diff --git a/runtime/lua/vim/_editor.lua b/runtime/lua/vim/_editor.lua
index 58283ac64b..496bbf747c 100644
--- a/runtime/lua/vim/_editor.lua
+++ b/runtime/lua/vim/_editor.lua
@@ -68,6 +68,12 @@ vim.log = {
},
}
+local utfs = {
+ ['utf-8'] = true,
+ ['utf-16'] = true,
+ ['utf-32'] = true,
+}
+
-- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit})
--- Runs a system command or throws an error if {cmd} cannot be run.
---
@@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf)
end
end
---- Generates a list of possible completions for the string.
+--- Convert UTF-32, UTF-16 or UTF-8 {index} to byte index.
+--- If {strict_indexing} is false
+--- then an out of range index will return byte length
+--- instead of throwing an error.
+---
+--- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
+--- An {index} in the middle of a UTF-16 sequence is rounded upwards to
+--- the end of that sequence.
+---@param s string
+---@param encoding "utf-8"|"utf-16"|"utf-32"
+---@param index integer
+---@param strict_indexing? boolean # default: true
+---@return integer
+function vim.str_byteindex(s, encoding, index, strict_indexing)
+ if type(encoding) == 'number' then
+ -- Legacy support for old API
+ -- Parameters: ~
+ -- • {str} (`string`)
+ -- • {index} (`integer`)
+ -- • {use_utf16} (`boolean?`)
+ local old_index = encoding
+ local use_utf16 = index or false
+ return vim.__str_byteindex(s, old_index, use_utf16) or error('index out of range')
+ end
+
+ vim.validate('s', s, 'string')
+ vim.validate('index', index, 'number')
+
+ local len = #s
+
+ if index == 0 or len == 0 then
+ return 0
+ end
+
+ vim.validate('encoding', encoding, function(v)
+ return utfs[v], 'invalid encoding'
+ end)
+
+ vim.validate('strict_indexing', strict_indexing, 'boolean', true)
+ if strict_indexing == nil then
+ strict_indexing = true
+ end
+
+ if encoding == 'utf-8' then
+ if index > len then
+ return strict_indexing and error('index out of range') or len
+ end
+ return index
+ end
+ return vim.__str_byteindex(s, index, encoding == 'utf-16')
+ or strict_indexing and error('index out of range')
+ or len
+end
+
+--- Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
+--- supplied, the length of the string is used. All indices are zero-based.
+---
+--- If {strict_indexing} is false then an out of range index will return string
+--- length instead of throwing an error.
+--- Invalid UTF-8 bytes, and embedded surrogates are counted as one code point
+--- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
+--- that sequence.
+---@param s string
+---@param encoding "utf-8"|"utf-16"|"utf-32"
+---@param index? integer
+---@param strict_indexing? boolean # default: true
+---@return integer
+function vim.str_utfindex(s, encoding, index, strict_indexing)
+ if encoding == nil or type(encoding) == 'number' then
+ -- Legacy support for old API
+ -- Parameters: ~
+ -- • {str} (`string`)
+ -- • {index} (`integer?`)
+ local old_index = encoding
+ local col32, col16 = vim.__str_utfindex(s, old_index) --[[@as integer,integer]]
+ if not col32 or not col16 then
+ error('index out of range')
+ end
+ -- Return (multiple): ~
+ -- (`integer`) UTF-32 index
+ -- (`integer`) UTF-16 index
+ return col32, col16
+ end
+
+ vim.validate('s', s, 'string')
+ vim.validate('index', index, 'number', true)
+ if not index then
+ index = math.huge
+ strict_indexing = false
+ end
+
+ if index == 0 then
+ return 0
+ end
+
+ vim.validate('encoding', encoding, function(v)
+ return utfs[v], 'invalid encoding'
+ end)
+
+ vim.validate('strict_indexing', strict_indexing, 'boolean', true)
+ if strict_indexing == nil then
+ strict_indexing = true
+ end
+
+ if encoding == 'utf-8' then
+ local len = #s
+ return index <= len and index or (strict_indexing and error('index out of range') or len)
+ end
+ local col32, col16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]]
+ local col = encoding == 'utf-16' and col16 or col32
+ if col then
+ return col
+ end
+ if strict_indexing then
+ error('index out of range')
+ end
+ local max32, max16 = vim.__str_utfindex(s)--[[@as integer integer]]
+ return encoding == 'utf-16' and max16 or max32
+end
+
+--- Generates a list of possible completions for the string.
--- String has the pattern.
---
--- 1. Can we get it to just return things in the global namespace with that name prefix
diff --git a/runtime/lua/vim/_meta/builtin.lua b/runtime/lua/vim/_meta/builtin.lua
index 13bd1c1294..234c75d38f 100644
--- a/runtime/lua/vim/_meta/builtin.lua
+++ b/runtime/lua/vim/_meta/builtin.lua
@@ -112,18 +112,6 @@ function vim.rpcrequest(channel, method, ...) end
--- equal, {a} is greater than {b} or {a} is lesser than {b}, respectively.
function vim.stricmp(a, b) end
---- Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
---- supplied, it defaults to false (use UTF-32). Returns the byte index.
----
---- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
---- An {index} in the middle of a UTF-16 sequence is rounded upwards to
---- the end of that sequence.
---- @param str string
---- @param index integer
---- @param use_utf16? boolean
---- @return integer
-function vim.str_byteindex(str, index, use_utf16) end
-
--- Gets a list of the starting byte positions of each UTF-8 codepoint in the given string.
---
--- Embedded NUL bytes are treated as terminating the string.
@@ -173,19 +161,6 @@ function vim.str_utf_start(str, index) end
--- @return integer
function vim.str_utf_end(str, index) end
---- Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
---- supplied, the length of the string is used. All indices are zero-based.
----
---- Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
---- bytes, and embedded surrogates are counted as one code point each. An
---- {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
---- that sequence.
---- @param str string
---- @param index? integer
---- @return integer # UTF-32 index
---- @return integer # UTF-16 index
-function vim.str_utfindex(str, index) end
-
--- The result is a String, which is the text {str} converted from
--- encoding {from} to encoding {to}. When the conversion fails `nil` is
--- returned. When some characters could not be converted they
diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c
index ee0eabbebb..bf8b085458 100644
--- a/src/nvim/lua/stdlib.c
+++ b/src/nvim/lua/stdlib.c
@@ -181,7 +181,9 @@ int nlua_str_utfindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
} else {
idx = luaL_checkinteger(lstate, 2);
if (idx < 0 || idx > (intptr_t)s1_len) {
- return luaL_error(lstate, "index out of range");
+ lua_pushnil(lstate);
+ lua_pushnil(lstate);
+ return 2;
}
}
@@ -272,7 +274,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
intptr_t idx = luaL_checkinteger(lstate, 2);
if (idx < 0) {
- return luaL_error(lstate, "index out of range");
+ lua_pushnil(lstate);
+ return 1;
}
bool use_utf16 = false;
if (lua_gettop(lstate) >= 3) {
@@ -281,7 +284,8 @@ int nlua_str_byteindex(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
ssize_t byteidx = mb_utf_index_to_bytes(s1, s1_len, (size_t)idx, use_utf16);
if (byteidx == -1) {
- return luaL_error(lstate, "index out of range");
+ lua_pushnil(lstate);
+ return 1;
}
lua_pushinteger(lstate, (lua_Integer)byteidx);
@@ -695,10 +699,10 @@ void nlua_state_add_stdlib(lua_State *const lstate, bool is_thread)
lua_setfield(lstate, -2, "stricmp");
// str_utfindex
lua_pushcfunction(lstate, &nlua_str_utfindex);
- lua_setfield(lstate, -2, "str_utfindex");
+ lua_setfield(lstate, -2, "__str_utfindex");
// str_byteindex
lua_pushcfunction(lstate, &nlua_str_byteindex);
- lua_setfield(lstate, -2, "str_byteindex");
+ lua_setfield(lstate, -2, "__str_byteindex");
// str_utf_pos
lua_pushcfunction(lstate, &nlua_str_utf_pos);
lua_setfield(lstate, -2, "str_utf_pos");
diff --git a/test/functional/lua/vim_spec.lua b/test/functional/lua/vim_spec.lua
index b32712860a..13e146a9da 100644
--- a/test/functional/lua/vim_spec.lua
+++ b/test/functional/lua/vim_spec.lua
@@ -312,21 +312,106 @@ describe('lua stdlib', function()
49,
51,
}
+ local indices8 = {
+ [0] = 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14,
+ 15,
+ 16,
+ 17,
+ 18,
+ 19,
+ 20,
+ 21,
+ 22,
+ 23,
+ 24,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 30,
+ 31,
+ 32,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 38,
+ 39,
+ 40,
+ 41,
+ 42,
+ 43,
+ 44,
+ 45,
+ 46,
+ 47,
+ 48,
+ 49,
+ 50,
+ 51,
+ }
for i, k in pairs(indices32) do
eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ...)', i), i)
+ eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., false)', i), i)
+ eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", ...)', i), i)
end
for i, k in pairs(indices16) do
eq(k, exec_lua('return vim.str_byteindex(_G.test_text, ..., true)', i), i)
+ eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", ...)', i), i)
end
- eq(
+ for i, k in pairs(indices8) do
+ eq(k, exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", ...)', i), i)
+ end
+ matches(
'index out of range',
pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ...)', #indices32 + 1)
)
- eq(
+ matches(
'index out of range',
pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, ..., true)', #indices16 + 1)
)
- local i32, i16 = 0, 0
+ matches(
+ 'index out of range',
+ pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-16", ...)', #indices16 + 1)
+ )
+ matches(
+ 'index out of range',
+ pcall_err(exec_lua, 'return vim.str_byteindex(_G.test_text, "utf-32", ...)', #indices32 + 1)
+ )
+ matches(
+ 'invalid encoding',
+ pcall_err(exec_lua, 'return vim.str_byteindex("hello", "madeupencoding", 1)')
+ )
+ eq(
+ indices32[#indices32],
+ exec_lua('return vim.str_byteindex(_G.test_text, "utf-32", 99999, false)')
+ )
+ eq(
+ indices16[#indices16],
+ exec_lua('return vim.str_byteindex(_G.test_text, "utf-16", 99999, false)')
+ )
+ eq(
+ indices8[#indices8],
+ exec_lua('return vim.str_byteindex(_G.test_text, "utf-8", 99999, false)')
+ )
+ eq(2, exec_lua('return vim.str_byteindex("é", "utf-16", 2, false)'))
+ local i32, i16, i8 = 0, 0, 0
local len = 51
for k = 0, len do
if indices32[i32] < k then
@@ -338,9 +423,29 @@ describe('lua stdlib', function()
i16 = i16 + 1
end
end
+ if indices8[i8] < k then
+ i8 = i8 + 1
+ end
eq({ i32, i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, ...)}', k), k)
+ eq({ i32 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-32", ...)}', k), k)
+ eq({ i16 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-16", ...)}', k), k)
+ eq({ i8 }, exec_lua('return {vim.str_utfindex(_G.test_text, "utf-8", ...)}', k), k)
end
- eq(
+
+ eq({ #indices32, #indices16 }, exec_lua('return {vim.str_utfindex(_G.test_text)}'))
+
+ eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32", math.huge, false)'))
+ eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16", math.huge, false)'))
+ eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8", math.huge, false)'))
+
+ eq(#indices32, exec_lua('return vim.str_utfindex(_G.test_text, "utf-32")'))
+ eq(#indices16, exec_lua('return vim.str_utfindex(_G.test_text, "utf-16")'))
+ eq(#indices8, exec_lua('return vim.str_utfindex(_G.test_text, "utf-8")'))
+ matches(
+ 'invalid encoding',
+ pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, "madeupencoding", ...)', 1)
+ )
+ matches(
'index out of range',
pcall_err(exec_lua, 'return vim.str_utfindex(_G.test_text, ...)', len + 1)
)