aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBjörn Linse <bjorn.linse@gmail.com>2019-12-22 13:47:45 +0100
committerGitHub <noreply@github.com>2019-12-22 13:47:45 +0100
commit9e9dcd4bd79cc5ea0fd240bcb242ceea07deabe2 (patch)
treeb4c06d6b6485f364f502b97d8775546a9fced24f
parente1d63c180cc38cec5a8bf3e543bfe18472352da4 (diff)
parent440695c29696f261337227e5c419aa1cf313c2dd (diff)
downloadrneovim-9e9dcd4bd79cc5ea0fd240bcb242ceea07deabe2.tar.gz
rneovim-9e9dcd4bd79cc5ea0fd240bcb242ceea07deabe2.tar.bz2
rneovim-9e9dcd4bd79cc5ea0fd240bcb242ceea07deabe2.zip
Merge pull request #11113 from bfredl/tree-sitter-query
tree-sitter step 2: query API and highlighting prototype
-rw-r--r--runtime/doc/lua.txt96
-rw-r--r--runtime/lua/vim/treesitter.lua120
-rw-r--r--runtime/lua/vim/tshighlighter.lua142
-rw-r--r--src/nvim/api/buffer.c91
-rw-r--r--src/nvim/api/private/helpers.h6
-rw-r--r--src/nvim/api/vim.c33
-rw-r--r--src/nvim/buffer_defs.h6
-rw-r--r--src/nvim/buffer_updates.c11
-rw-r--r--src/nvim/func_attr.h2
-rw-r--r--src/nvim/generators/c_grammar.lua1
-rw-r--r--src/nvim/generators/gen_api_dispatch.lua15
-rw-r--r--src/nvim/generators/gen_eval.lua2
-rw-r--r--src/nvim/globals.h7
-rw-r--r--src/nvim/lua/executor.c25
-rw-r--r--src/nvim/lua/treesitter.c321
-rw-r--r--src/nvim/screen.c105
-rw-r--r--src/tree_sitter/api.h241
-rw-r--r--src/tree_sitter/bits.h29
-rw-r--r--src/tree_sitter/language.c102
-rw-r--r--src/tree_sitter/language.h3
-rw-r--r--src/tree_sitter/lexer.c334
-rw-r--r--src/tree_sitter/lexer.h2
-rw-r--r--src/tree_sitter/lib.c5
-rw-r--r--src/tree_sitter/node.c10
-rw-r--r--src/tree_sitter/parser.c41
-rw-r--r--src/tree_sitter/parser.h5
-rw-r--r--src/tree_sitter/point.h1
-rw-r--r--src/tree_sitter/query.c1450
-rw-r--r--src/tree_sitter/subtree.c13
-rw-r--r--src/tree_sitter/tree.c7
-rw-r--r--src/tree_sitter/tree_cursor.c79
-rw-r--r--src/tree_sitter/tree_cursor.h1
-rw-r--r--src/tree_sitter/unicode.h50
-rw-r--r--src/tree_sitter/unicode/ICU_SHA1
-rw-r--r--src/tree_sitter/unicode/LICENSE414
-rw-r--r--src/tree_sitter/unicode/README.md29
-rw-r--r--src/tree_sitter/unicode/ptypes.h1
-rw-r--r--src/tree_sitter/unicode/umachine.h448
-rw-r--r--src/tree_sitter/unicode/urename.h1
-rw-r--r--src/tree_sitter/unicode/utf.h1
-rw-r--r--src/tree_sitter/unicode/utf16.h733
-rw-r--r--src/tree_sitter/unicode/utf8.h881
-rw-r--r--test/functional/api/highlight_spec.lua19
-rw-r--r--test/functional/lua/treesitter_spec.lua476
44 files changed, 5963 insertions, 397 deletions
diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt
index c0da06ffe3..1c3a7f70c9 100644
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -594,6 +594,102 @@ tsnode:named_descendant_for_range(start_row, start_col, end_row, end_col)
Get the smallest named node within this node that spans the given
range of (row, column) positions
+Query methods *lua-treesitter-query*
+
+Tree-sitter queries are supported, with some limitations. Currently, the only
+supported match predicate is `eq?` (both comparing a capture against a string
+and two captures against each other).
+
+vim.treesitter.parse_query(lang, query)
+ *vim.treesitter.parse_query(()*
+ Parse the query as a string. (If the query is in a file, the caller
+ should read the contents into a string before calling).
+
+query:iter_captures(node, bufnr, start_row, end_row)
+ *query:iter_captures()*
+ Iterate over all captures from all matches inside a `node`.
+ `bufnr` is needed if the query contains predicates, then the caller
+ must ensure to use a freshly parsed tree consistent with the current
+ text of the buffer. `start_row` and `end_row` can be used to limit
+ matches inside a row range (this is typically used with root node
+ as the node, i e to get syntax highlight matches in the current
+ viewport)
+
+ The iterator returns two values, a numeric id identifying the capture
+ and the captured node. The following example shows how to get captures
+ by name:
+>
+ for id, node in query:iter_captures(tree:root(), bufnr, first, last) do
+ local name = query.captures[id] -- name of the capture in the query
+ -- typically useful info about the node:
+ local type = node:type() -- type of the captured node
+ local row1, col1, row2, col2 = node:range() -- range of the capture
+ ... use the info here ...
+ end
+<
+query:iter_matches(node, bufnr, start_row, end_row)
+ *query:iter_matches()*
+ Iterate over all matches within a node. The arguments are the same as
+ for |query:iter_captures()| but the iterated values are different:
+ an (1-based) index of the pattern in the query, and a table mapping
+ capture indices to nodes. If the query has more than one pattern
+ the capture table might be sparse, and e.g. `pairs` should be used and not
+ `ipairs`. Here an example iterating over all captures in
+ every match:
+>
+ for pattern, match in cquery:iter_matches(tree:root(), bufnr, first, last) do
+ for id,node in pairs(match) do
+ local name = query.captures[id]
+ -- `node` was captured by the `name` capture in the match
+ ... use the info here ...
+ end
+ end
+>
+Treesitter syntax highlighting (WIP) *lua-treesitter-highlight*
+
+NOTE: This is a partially implemented feature, and not usable as a default
+solution yet. What is documented here is a temporary interface indented
+for those who want to experiment with this feature and contribute to
+its development.
+
+Highlights are defined in the same query format as in the tree-sitter highlight
+crate, which some limitations and additions. Set a highlight query for a
+buffer with this code: >
+
+ local query = [[
+ "for" @keyword
+ "if" @keyword
+ "return" @keyword
+
+ (string_literal) @string
+ (number_literal) @number
+ (comment) @comment
+
+ (preproc_function_def name: (identifier) @function)
+
+ ; ... more definitions
+ ]]
+
+ highlighter = vim.treesitter.TSHighlighter.new(query, bufnr, lang)
+ -- alternatively, to use the current buffer and its filetype:
+ -- highlighter = vim.treesitter.TSHighlighter.new(query)
+
+ -- Don't recreate the highlighter for the same buffer, instead
+ -- modify the query like this:
+ local query2 = [[ ... ]]
+ highlighter:set_query(query2)
+
+As mentioned above the supported predicate is currently only `eq?`. `match?`
+predicates behave like matching always fails. As an addition a capture which
+begin with an upper-case letter like `@WarningMsg` will map directly to this
+highlight group, if defined. Also if the predicate begins with upper-case and
+contains a dot only the part before the first will be interpreted as the
+highlight group. As an example, this warns of a binary expression with two
+identical identifiers, highlighting both as |hl-WarningMsg|: >
+
+ ((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right)
+ (eq? @WarningMsg.left @WarningMsg.right))
+
------------------------------------------------------------------------------
VIM *lua-builtin*
diff --git a/runtime/lua/vim/treesitter.lua b/runtime/lua/vim/treesitter.lua
index e0202927bb..aa8b8fcdd1 100644
--- a/runtime/lua/vim/treesitter.lua
+++ b/runtime/lua/vim/treesitter.lua
@@ -12,9 +12,13 @@ function Parser:parse()
if self.valid then
return self.tree
end
- self.tree = self._parser:parse_buf(self.bufnr)
+ local changes
+ self.tree, changes = self._parser:parse_buf(self.bufnr)
self.valid = true
- return self.tree
+ for _, cb in ipairs(self.change_cbs) do
+ cb(changes)
+ end
+ return self.tree, changes
end
function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size)
@@ -26,17 +30,28 @@ function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_
self.valid = false
end
-local module = {
+local M = {
add_language=vim._ts_add_language,
inspect_language=vim._ts_inspect_language,
+ parse_query = vim._ts_parse_query,
}
-function module.create_parser(bufnr, ft, id)
+setmetatable(M, {
+ __index = function (t, k)
+ if k == "TSHighlighter" then
+ t[k] = require'vim.tshighlighter'
+ return t[k]
+ end
+ end
+ })
+
+function M.create_parser(bufnr, ft, id)
if bufnr == 0 then
bufnr = a.nvim_get_current_buf()
end
- local self = setmetatable({bufnr=bufnr, valid=false}, Parser)
+ local self = setmetatable({bufnr=bufnr, lang=ft, valid=false}, Parser)
self._parser = vim._create_ts_parser(ft)
+ self.change_cbs = {}
self:parse()
-- TODO(bfredl): use weakref to self, so that the parser is free'd is no plugin is
-- using it.
@@ -55,7 +70,7 @@ function module.create_parser(bufnr, ft, id)
return self
end
-function module.get_parser(bufnr, ft)
+function M.get_parser(bufnr, ft, cb)
if bufnr == nil or bufnr == 0 then
bufnr = a.nvim_get_current_buf()
end
@@ -65,9 +80,98 @@ function module.get_parser(bufnr, ft)
local id = tostring(bufnr)..'_'..ft
if parsers[id] == nil then
- parsers[id] = module.create_parser(bufnr, ft, id)
+ parsers[id] = M.create_parser(bufnr, ft, id)
+ end
+ if cb ~= nil then
+ table.insert(parsers[id].change_cbs, cb)
end
return parsers[id]
end
-return module
+-- query: pattern matching on trees
+-- predicate matching is implemented in lua
+local Query = {}
+Query.__index = Query
+
+function M.parse_query(lang, query)
+ local self = setmetatable({}, Query)
+ self.query = vim._ts_parse_query(lang, query)
+ self.info = self.query:inspect()
+ self.captures = self.info.captures
+ return self
+end
+
+local function get_node_text(node, bufnr)
+ local start_row, start_col, end_row, end_col = node:range()
+ if start_row ~= end_row then
+ return nil
+ end
+ local line = a.nvim_buf_get_lines(bufnr, start_row, start_row+1, true)[1]
+ return string.sub(line, start_col+1, end_col)
+end
+
+local function match_preds(match, preds, bufnr)
+ for _, pred in pairs(preds) do
+ if pred[1] == "eq?" then
+ local node = match[pred[2]]
+ local node_text = get_node_text(node, bufnr)
+
+ local str
+ if type(pred[3]) == "string" then
+ -- (eq? @aa "foo")
+ str = pred[3]
+ else
+ -- (eq? @aa @bb)
+ str = get_node_text(match[pred[3]], bufnr)
+ end
+
+ if node_text ~= str or str == nil then
+ return false
+ end
+ else
+ return false
+ end
+ end
+ return true
+end
+
+function Query:iter_captures(node, bufnr, start, stop)
+ if bufnr == 0 then
+ bufnr = vim.api.nvim_get_current_buf()
+ end
+ local raw_iter = node:_rawquery(self.query,true,start,stop)
+ local function iter()
+ local capture, captured_node, match = raw_iter()
+ if match ~= nil then
+ local preds = self.info.patterns[match.pattern]
+ local active = match_preds(match, preds, bufnr)
+ match.active = active
+ if not active then
+ return iter() -- tail call: try next match
+ end
+ end
+ return capture, captured_node
+ end
+ return iter
+end
+
+function Query:iter_matches(node, bufnr, start, stop)
+ if bufnr == 0 then
+ bufnr = vim.api.nvim_get_current_buf()
+ end
+ local raw_iter = node:_rawquery(self.query,false,start,stop)
+ local function iter()
+ local pattern, match = raw_iter()
+ if match ~= nil then
+ local preds = self.info.patterns[pattern]
+ local active = (not preds) or match_preds(match, preds, bufnr)
+ if not active then
+ return iter() -- tail call: try next match
+ end
+ end
+ return pattern, match
+ end
+ return iter
+end
+
+return M
diff --git a/runtime/lua/vim/tshighlighter.lua b/runtime/lua/vim/tshighlighter.lua
new file mode 100644
index 0000000000..1544ecbf49
--- /dev/null
+++ b/runtime/lua/vim/tshighlighter.lua
@@ -0,0 +1,142 @@
+local a = vim.api
+
+-- support reload for quick experimentation
+local TSHighlighter = rawget(vim.treesitter, 'TSHighlighter') or {}
+TSHighlighter.__index = TSHighlighter
+
+-- These are conventions defined by tree-sitter, though it
+-- needs to be user extensible also.
+-- TODO(bfredl): this is very much incomplete, we will need to
+-- go through a few tree-sitter provided queries and decide
+-- on translations that makes the most sense.
+TSHighlighter.hl_map = {
+ keyword="Keyword",
+ string="String",
+ type="Type",
+ comment="Comment",
+ constant="Constant",
+ operator="Operator",
+ number="Number",
+ label="Label",
+ ["function"]="Function",
+ ["function.special"]="Function",
+}
+
+function TSHighlighter.new(query, bufnr, ft)
+ local self = setmetatable({}, TSHighlighter)
+ self.parser = vim.treesitter.get_parser(bufnr, ft, function(...) self:on_change(...) end)
+ self.buf = self.parser.bufnr
+ -- TODO(bfredl): perhaps on_start should be called uncondionally, instead for only on mod?
+ local tree = self.parser:parse()
+ self.root = tree:root()
+ self:set_query(query)
+ self.edit_count = 0
+ self.redraw_count = 0
+ self.line_count = {}
+ a.nvim_buf_set_option(self.buf, "syntax", "")
+ a.nvim__buf_set_luahl(self.buf, {
+ on_start=function(...) return self:on_start(...) end,
+ on_window=function(...) return self:on_window(...) end,
+ on_line=function(...) return self:on_line(...) end,
+ })
+
+ -- Tricky: if syntax hasn't been enabled, we need to reload color scheme
+ -- but use synload.vim rather than syntax.vim to not enable
+ -- syntax FileType autocmds. Later on we should integrate with the
+ -- `:syntax` and `set syntax=...` machinery properly.
+ if vim.g.syntax_on ~= 1 then
+ vim.api.nvim_command("runtime! syntax/synload.vim")
+ end
+ return self
+end
+
+function TSHighlighter:set_query(query)
+ if type(query) == "string" then
+ query = vim.treesitter.parse_query(self.parser.lang, query)
+ end
+ self.query = query
+
+ self.id_map = {}
+ for i, capture in ipairs(self.query.captures) do
+ local hl = 0
+ local firstc = string.sub(capture, 1, 1)
+ local hl_group = self.hl_map[capture]
+ if firstc ~= string.lower(firstc) then
+ hl_group = vim.split(capture, '.', true)[1]
+ end
+ if hl_group then
+ hl = a.nvim_get_hl_id_by_name(hl_group)
+ end
+ self.id_map[i] = hl
+ end
+end
+
+function TSHighlighter:on_change(changes)
+ for _, ch in ipairs(changes or {}) do
+ a.nvim__buf_redraw_range(self.buf, ch[1], ch[3]+1)
+ end
+ self.edit_count = self.edit_count + 1
+end
+
+function TSHighlighter:on_start(_, _buf, _tick)
+ local tree = self.parser:parse()
+ self.root = tree:root()
+end
+
+function TSHighlighter:on_window(_, _win, _buf, _topline, botline)
+ self.iter = nil
+ self.active_nodes = {}
+ self.nextrow = 0
+ self.botline = botline
+ self.redraw_count = self.redraw_count + 1
+end
+
+function TSHighlighter:on_line(_, _win, buf, line)
+ if self.iter == nil then
+ self.iter = self.query:iter_captures(self.root,buf,line,self.botline)
+ end
+ while line >= self.nextrow do
+ local capture, node, match = self.iter()
+ local active = true
+ if capture == nil then
+ break
+ end
+ if match ~= nil then
+ active = self:run_pred(match)
+ match.active = active
+ end
+ local start_row, start_col, end_row, end_col = node:range()
+ local hl = self.id_map[capture]
+ if hl > 0 and active then
+ if start_row == line and end_row == line then
+ a.nvim__put_attr(hl, start_col, end_col)
+ elseif end_row >= line then
+ -- TODO(bfredl): this is quite messy. Togheter with multiline bufhl we should support
+ -- luahl generating multiline highlights (and other kinds of annotations)
+ self.active_nodes[{hl=hl, start_row=start_row, start_col=start_col, end_row=end_row, end_col=end_col}] = true
+ end
+ end
+ if start_row > line then
+ self.nextrow = start_row
+ end
+ end
+ for node,_ in pairs(self.active_nodes) do
+ if node.start_row <= line and node.end_row >= line then
+ local start_col, end_col = node.start_col, node.end_col
+ if node.start_row < line then
+ start_col = 0
+ end
+ if node.end_row > line then
+ end_col = 9000
+ end
+ a.nvim__put_attr(node.hl, start_col, end_col)
+ end
+ if node.end_row <= line then
+ self.active_nodes[node] = nil
+ end
+ end
+ self.line_count[line] = (self.line_count[line] or 0) + 1
+ --return tostring(self.line_count[line])
+end
+
+return TSHighlighter
diff --git a/src/nvim/api/buffer.c b/src/nvim/api/buffer.c
index 8f5718d97e..e6f8f73b9d 100644
--- a/src/nvim/api/buffer.c
+++ b/src/nvim/api/buffer.c
@@ -169,21 +169,21 @@ Boolean nvim_buf_attach(uint64_t channel_id,
goto error;
}
cb.on_lines = v->data.luaref;
- v->data.integer = LUA_NOREF;
+ v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("on_changedtick", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
cb.on_changedtick = v->data.luaref;
- v->data.integer = LUA_NOREF;
+ v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("on_detach", k.data)) {
if (v->type != kObjectTypeLuaRef) {
api_set_error(err, kErrorTypeValidation, "callback is not a function");
goto error;
}
cb.on_detach = v->data.luaref;
- v->data.integer = LUA_NOREF;
+ v->data.luaref = LUA_NOREF;
} else if (is_lua && strequal("utf_sizes", k.data)) {
if (v->type != kObjectTypeBoolean) {
api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
@@ -231,6 +231,90 @@ Boolean nvim_buf_detach(uint64_t channel_id,
return true;
}
+static void buf_clear_luahl(buf_T *buf, bool force)
+{
+ if (buf->b_luahl || force) {
+ executor_free_luaref(buf->b_luahl_start);
+ executor_free_luaref(buf->b_luahl_window);
+ executor_free_luaref(buf->b_luahl_line);
+ executor_free_luaref(buf->b_luahl_end);
+ }
+ buf->b_luahl_start = LUA_NOREF;
+ buf->b_luahl_window = LUA_NOREF;
+ buf->b_luahl_line = LUA_NOREF;
+ buf->b_luahl_end = LUA_NOREF;
+}
+
+/// Unstabilized interface for defining syntax hl in lua.
+///
+/// This is not yet safe for general use, lua callbacks will need to
+/// be restricted, like textlock and probably other stuff.
+///
+/// The API on_line/nvim__put_attr is quite raw and not intended to be the
+/// final shape. Ideally this should operate on chunks larger than a single
+/// line to reduce interpreter overhead, and generate annotation objects
+/// (bufhl/virttext) on the fly but using the same representation.
+void nvim__buf_set_luahl(uint64_t channel_id, Buffer buffer,
+ DictionaryOf(LuaRef) opts, Error *err)
+ FUNC_API_LUA_ONLY
+{
+ buf_T *buf = find_buffer_by_handle(buffer, err);
+
+ if (!buf) {
+ return;
+ }
+
+ redraw_buf_later(buf, NOT_VALID);
+ buf_clear_luahl(buf, false);
+
+ for (size_t i = 0; i < opts.size; i++) {
+ String k = opts.items[i].key;
+ Object *v = &opts.items[i].value;
+ if (strequal("on_start", k.data)) {
+ if (v->type != kObjectTypeLuaRef) {
+ api_set_error(err, kErrorTypeValidation, "callback is not a function");
+ goto error;
+ }
+ buf->b_luahl_start = v->data.luaref;
+ v->data.luaref = LUA_NOREF;
+ } else if (strequal("on_window", k.data)) {
+ if (v->type != kObjectTypeLuaRef) {
+ api_set_error(err, kErrorTypeValidation, "callback is not a function");
+ goto error;
+ }
+ buf->b_luahl_window = v->data.luaref;
+ v->data.luaref = LUA_NOREF;
+ } else if (strequal("on_line", k.data)) {
+ if (v->type != kObjectTypeLuaRef) {
+ api_set_error(err, kErrorTypeValidation, "callback is not a function");
+ goto error;
+ }
+ buf->b_luahl_line = v->data.luaref;
+ v->data.luaref = LUA_NOREF;
+ } else {
+ api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
+ goto error;
+ }
+ }
+ buf->b_luahl = true;
+ return;
+error:
+ buf_clear_luahl(buf, true);
+ buf->b_luahl = false;
+}
+
+void nvim__buf_redraw_range(Buffer buffer, Integer first, Integer last,
+ Error *err)
+ FUNC_API_LUA_ONLY
+{
+ buf_T *buf = find_buffer_by_handle(buffer, err);
+ if (!buf) {
+ return;
+ }
+
+ redraw_buf_range_later(buf, (linenr_T)first+1, (linenr_T)last);
+}
+
/// Sets a buffer line
///
/// @deprecated use nvim_buf_set_lines instead.
@@ -1112,7 +1196,6 @@ Array nvim_buf_get_extmarks(Buffer buffer, Integer ns_id, Object start,
return rv;
}
limit = v->data.integer;
- v->data.integer = LUA_NOREF;
} else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
return rv;
diff --git a/src/nvim/api/private/helpers.h b/src/nvim/api/private/helpers.h
index 8930f252f6..048b937136 100644
--- a/src/nvim/api/private/helpers.h
+++ b/src/nvim/api/private/helpers.h
@@ -60,6 +60,12 @@
#define ADD(array, item) \
kv_push(array, item)
+#define FIXED_TEMP_ARRAY(name, fixsize) \
+ Array name = ARRAY_DICT_INIT; \
+ Object name##__items[fixsize]; \
+ args.size = fixsize; \
+ args.items = name##__items; \
+
#define STATIC_CSTR_AS_STRING(s) ((String) {.data = s, .size = sizeof(s) - 1})
/// Create a new String instance, putting data in allocated memory
diff --git a/src/nvim/api/vim.c b/src/nvim/api/vim.c
index 19601b6539..2f59edee33 100644
--- a/src/nvim/api/vim.c
+++ b/src/nvim/api/vim.c
@@ -189,6 +189,15 @@ Dictionary nvim_get_hl_by_id(Integer hl_id, Boolean rgb, Error *err)
return hl_get_attr_by_id(attrcode, rgb, err);
}
+/// Gets a highlight group by name
+///
+/// similar to |hlID()|, but allocates a new ID if not present.
+Integer nvim_get_hl_id_by_name(String name)
+ FUNC_API_SINCE(7)
+{
+ return syn_check_group((const char_u *)name.data, (int)name.size);
+}
+
/// Sends input-keys to Nvim, subject to various quirks controlled by `mode`
/// flags. This is a blocking call, unlike |nvim_input()|.
///
@@ -2546,3 +2555,27 @@ Array nvim__inspect_cell(Integer grid, Integer row, Integer col, Error *err)
}
return ret;
}
+
+/// Set attrs in nvim__buf_set_lua_hl callbacks
+///
+/// TODO(bfredl): This is rather pedestrian. The final
+/// interface should probably be derived from a reformed
+/// bufhl/virttext interface with full support for multi-line
+/// ranges etc
+void nvim__put_attr(Integer id, Integer c0, Integer c1)
+ FUNC_API_LUA_ONLY
+{
+ if (!lua_attr_active) {
+ return;
+ }
+ if (id == 0 || syn_get_final_id((int)id) == 0) {
+ return;
+ }
+ int attr = syn_id2attr((int)id);
+ c0 = MAX(c0, 0);
+ c1 = MIN(c1, (Integer)lua_attr_bufsize);
+ for (Integer c = c0; c < c1; c++) {
+ lua_attr_buf[c] = (sattr_T)hl_combine_attr(lua_attr_buf[c], (int)attr);
+ }
+ return;
+}
diff --git a/src/nvim/buffer_defs.h b/src/nvim/buffer_defs.h
index 700d8b82e6..bc4fb2997d 100644
--- a/src/nvim/buffer_defs.h
+++ b/src/nvim/buffer_defs.h
@@ -832,6 +832,12 @@ struct file_buffer {
// The number for times the current line has been flushed in the memline.
int flush_count;
+ bool b_luahl;
+ LuaRef b_luahl_start;
+ LuaRef b_luahl_window;
+ LuaRef b_luahl_line;
+ LuaRef b_luahl_end;
+
int b_diff_failed; // internal diff failed for this buffer
};
diff --git a/src/nvim/buffer_updates.c b/src/nvim/buffer_updates.c
index 3604578b50..d12527d6ac 100644
--- a/src/nvim/buffer_updates.c
+++ b/src/nvim/buffer_updates.c
@@ -157,7 +157,7 @@ void buf_updates_unregister_all(buf_T *buf)
args.items[0] = BUFFER_OBJ(buf->handle);
textlock++;
- executor_exec_lua_cb(cb.on_detach, "detach", args, false);
+ executor_exec_lua_cb(cb.on_detach, "detach", args, false, NULL);
textlock--;
}
free_update_callbacks(cb);
@@ -265,7 +265,7 @@ void buf_updates_send_changes(buf_T *buf,
args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
}
textlock++;
- Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
+ Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true, NULL);
textlock--;
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
@@ -293,10 +293,7 @@ void buf_updates_changedtick(buf_T *buf)
BufUpdateCallbacks cb = kv_A(buf->update_callbacks, i);
bool keep = true;
if (cb.on_changedtick != LUA_NOREF) {
- Array args = ARRAY_DICT_INIT;
- Object items[2];
- args.size = 2;
- args.items = items;
+ FIXED_TEMP_ARRAY(args, 2);
// the first argument is always the buffer handle
args.items[0] = BUFFER_OBJ(buf->handle);
@@ -306,7 +303,7 @@ void buf_updates_changedtick(buf_T *buf)
textlock++;
Object res = executor_exec_lua_cb(cb.on_changedtick, "changedtick",
- args, true);
+ args, true, NULL);
textlock--;
if (res.type == kObjectTypeBoolean && res.data.boolean == true) {
diff --git a/src/nvim/func_attr.h b/src/nvim/func_attr.h
index 14b8158c7f..38b51b5564 100644
--- a/src/nvim/func_attr.h
+++ b/src/nvim/func_attr.h
@@ -211,6 +211,8 @@
# define FUNC_API_NOEXPORT
/// API function not exposed in VimL/eval.
# define FUNC_API_REMOTE_ONLY
+/// API function not exposed in VimL/remote.
+# define FUNC_API_LUA_ONLY
/// API function introduced at the given API level.
# define FUNC_API_SINCE(X)
/// API function deprecated since the given API level.
diff --git a/src/nvim/generators/c_grammar.lua b/src/nvim/generators/c_grammar.lua
index 1aa8223da0..de098b7a7b 100644
--- a/src/nvim/generators/c_grammar.lua
+++ b/src/nvim/generators/c_grammar.lua
@@ -42,6 +42,7 @@ local c_proto = Ct(
(fill * Cg((P('FUNC_API_FAST') * Cc(true)), 'fast') ^ -1) *
(fill * Cg((P('FUNC_API_NOEXPORT') * Cc(true)), 'noexport') ^ -1) *
(fill * Cg((P('FUNC_API_REMOTE_ONLY') * Cc(true)), 'remote_only') ^ -1) *
+ (fill * Cg((P('FUNC_API_LUA_ONLY') * Cc(true)), 'lua_only') ^ -1) *
(fill * Cg((P('FUNC_API_REMOTE_IMPL') * Cc(true)), 'remote_impl') ^ -1) *
(fill * Cg((P('FUNC_API_BRIDGE_IMPL') * Cc(true)), 'bridge_impl') ^ -1) *
(fill * Cg((P('FUNC_API_COMPOSITOR_IMPL') * Cc(true)), 'compositor_impl') ^ -1) *
diff --git a/src/nvim/generators/gen_api_dispatch.lua b/src/nvim/generators/gen_api_dispatch.lua
index 76dcf849d1..e861cfda35 100644
--- a/src/nvim/generators/gen_api_dispatch.lua
+++ b/src/nvim/generators/gen_api_dispatch.lua
@@ -192,7 +192,7 @@ end
-- the real API.
for i = 1, #functions do
local fn = functions[i]
- if fn.impl_name == nil then
+ if fn.impl_name == nil and not fn.lua_only then
local args = {}
output:write('Object handle_'..fn.name..'(uint64_t channel_id, Array args, Error *error)')
@@ -310,12 +310,13 @@ void msgpack_rpc_init_method_table(void)
for i = 1, #functions do
local fn = functions[i]
- output:write(' msgpack_rpc_add_method_handler('..
- '(String) {.data = "'..fn.name..'", '..
- '.size = sizeof("'..fn.name..'") - 1}, '..
- '(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
- ', .fast = '..tostring(fn.fast)..'});\n')
-
+ if not fn.lua_only then
+ output:write(' msgpack_rpc_add_method_handler('..
+ '(String) {.data = "'..fn.name..'", '..
+ '.size = sizeof("'..fn.name..'") - 1}, '..
+ '(MsgpackRpcRequestHandler) {.fn = handle_'.. (fn.impl_name or fn.name)..
+ ', .fast = '..tostring(fn.fast)..'});\n')
+ end
end
output:write('\n}\n\n')
diff --git a/src/nvim/generators/gen_eval.lua b/src/nvim/generators/gen_eval.lua
index 2c6f8f2603..d16453530f 100644
--- a/src/nvim/generators/gen_eval.lua
+++ b/src/nvim/generators/gen_eval.lua
@@ -25,7 +25,7 @@ local gperfpipe = io.open(funcsfname .. '.gperf', 'wb')
local funcs = require('eval').funcs
local metadata = mpack.unpack(io.open(metadata_file, 'rb'):read("*all"))
for _,fun in ipairs(metadata) do
- if not fun.remote_only then
+ if not (fun.remote_only or fun.lua_only) then
funcs[fun.name] = {
args=#fun.parameters,
func='api_wrapper',
diff --git a/src/nvim/globals.h b/src/nvim/globals.h
index 172c190df2..0a7a2d551e 100644
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@@ -126,6 +126,13 @@ typedef off_t off_T;
*/
EXTERN int mod_mask INIT(= 0x0); /* current key modifiers */
+
+// TODO(bfredl): for the final interface this should find a more suitable
+// location.
+EXTERN sattr_T *lua_attr_buf INIT(= NULL);
+EXTERN size_t lua_attr_bufsize INIT(= 0);
+EXTERN bool lua_attr_active INIT(= false);
+
/*
* Cmdline_row is the row where the command line starts, just below the
* last window.
diff --git a/src/nvim/lua/executor.c b/src/nvim/lua/executor.c
index 25f4be1c4d..1d3d9929d3 100644
--- a/src/nvim/lua/executor.c
+++ b/src/nvim/lua/executor.c
@@ -835,7 +835,7 @@ Object executor_exec_lua_api(const String str, const Array args, Error *err)
}
Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
- bool retval)
+ bool retval, Error *err)
{
lua_State *const lstate = nlua_enter();
nlua_pushref(lstate, ref);
@@ -845,16 +845,24 @@ Object executor_exec_lua_cb(LuaRef ref, const char *name, Array args,
}
if (lua_pcall(lstate, (int)args.size+1, retval ? 1 : 0, 0)) {
- // TODO(bfredl): callbacks:s might not always be msg-safe, for instance
- // lua callbacks for redraw events. Later on let the caller deal with the
- // error instead.
- nlua_error(lstate, _("Error executing lua callback: %.*s"));
+ // if err is passed, the caller will deal with the error.
+ if (err) {
+ size_t len;
+ const char *errstr = lua_tolstring(lstate, -1, &len);
+ api_set_error(err, kErrorTypeException,
+ "Error executing lua: %.*s", (int)len, errstr);
+ } else {
+ nlua_error(lstate, _("Error executing lua callback: %.*s"));
+ }
return NIL;
}
- Error err = ERROR_INIT;
if (retval) {
- return nlua_pop_Object(lstate, false, &err);
+ Error dummy = ERROR_INIT;
+ if (err == NULL) {
+ err = &dummy;
+ }
+ return nlua_pop_Object(lstate, false, err);
} else {
return NIL;
}
@@ -1007,4 +1015,7 @@ static void nlua_add_treesitter(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
lua_pushcfunction(lstate, tslua_inspect_lang);
lua_setfield(lstate, -2, "_ts_inspect_language");
+
+ lua_pushcfunction(lstate, ts_lua_parse_query);
+ lua_setfield(lstate, -2, "_ts_parse_query");
}
diff --git a/src/nvim/lua/treesitter.c b/src/nvim/lua/treesitter.c
index d2072402bb..874fabd89f 100644
--- a/src/nvim/lua/treesitter.c
+++ b/src/nvim/lua/treesitter.c
@@ -26,6 +26,11 @@ typedef struct {
TSTree *tree; // internal tree, used for editing/reparsing
} TSLua_parser;
+typedef struct {
+ TSQueryCursor *cursor;
+ int predicated_match;
+} TSLua_cursor;
+
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "lua/treesitter.c.generated.h"
#endif
@@ -66,6 +71,20 @@ static struct luaL_Reg node_meta[] = {
{ "descendant_for_range", node_descendant_for_range },
{ "named_descendant_for_range", node_named_descendant_for_range },
{ "parent", node_parent },
+ { "_rawquery", node_rawquery },
+ { NULL, NULL }
+};
+
+static struct luaL_Reg query_meta[] = {
+ { "__gc", query_gc },
+ { "__tostring", query_tostring },
+ { "inspect", query_inspect },
+ { NULL, NULL }
+};
+
+// cursor is not exposed, but still needs garbage collection
+static struct luaL_Reg querycursor_meta[] = {
+ { "__gc", querycursor_gc },
{ NULL, NULL }
};
@@ -96,6 +115,8 @@ void tslua_init(lua_State *L)
build_meta(L, "treesitter_parser", parser_meta);
build_meta(L, "treesitter_tree", tree_meta);
build_meta(L, "treesitter_node", node_meta);
+ build_meta(L, "treesitter_query", query_meta);
+ build_meta(L, "treesitter_querycursor", querycursor_meta);
}
int tslua_register_lang(lua_State *L)
@@ -276,13 +297,33 @@ static int parser_parse_buf(lua_State *L)
}
TSInput input = { payload, input_cb, TSInputEncodingUTF8 };
TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input);
+
+ uint32_t n_ranges = 0;
+ TSRange *changed = p->tree ? ts_tree_get_changed_ranges(p->tree, new_tree,
+ &n_ranges) : NULL;
if (p->tree) {
ts_tree_delete(p->tree);
}
p->tree = new_tree;
tslua_push_tree(L, p->tree);
- return 1;
+
+ lua_createtable(L, n_ranges, 0);
+ for (size_t i = 0; i < n_ranges; i++) {
+ lua_createtable(L, 4, 0);
+ lua_pushinteger(L, changed[i].start_point.row);
+ lua_rawseti(L, -2, 1);
+ lua_pushinteger(L, changed[i].start_point.column);
+ lua_rawseti(L, -2, 2);
+ lua_pushinteger(L, changed[i].end_point.row);
+ lua_rawseti(L, -2, 3);
+ lua_pushinteger(L, changed[i].end_point.column);
+ lua_rawseti(L, -2, 4);
+
+ lua_rawseti(L, -2, i+1);
+ }
+ xfree(changed);
+ return 2;
}
static int parser_tree(lua_State *L)
@@ -383,7 +424,7 @@ static int tree_root(lua_State *L)
return 0;
}
TSNode root = ts_tree_root_node(tree);
- push_node(L, root);
+ push_node(L, root, 1);
return 1;
}
@@ -394,18 +435,19 @@ static int tree_root(lua_State *L)
/// top of stack must either be the tree this node belongs to or another node
/// of the same tree! This value is not popped. Can only be called inside a
/// cfunction with the tslua environment.
-static void push_node(lua_State *L, TSNode node)
+static void push_node(lua_State *L, TSNode node, int uindex)
{
+ assert(uindex > 0 || uindex < -LUA_MINSTACK);
if (ts_node_is_null(node)) {
- lua_pushnil(L); // [src, nil]
+ lua_pushnil(L); // [nil]
return;
}
- TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [src, udata]
+ TSNode *ud = lua_newuserdata(L, sizeof(TSNode)); // [udata]
*ud = node;
- lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [src, udata, meta]
- lua_setmetatable(L, -2); // [src, udata]
- lua_getfenv(L, -2); // [src, udata, reftable]
- lua_setfenv(L, -2); // [src, udata]
+ lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node"); // [udata, meta]
+ lua_setmetatable(L, -2); // [udata]
+ lua_getfenv(L, uindex); // [udata, reftable]
+ lua_setfenv(L, -2); // [udata]
}
static bool node_check(lua_State *L, TSNode *res)
@@ -586,8 +628,7 @@ static int node_child(lua_State *L)
long num = lua_tointeger(L, 2);
TSNode child = ts_node_child(node, (uint32_t)num);
- lua_pushvalue(L, 1);
- push_node(L, child);
+ push_node(L, child, 1);
return 1;
}
@@ -600,8 +641,7 @@ static int node_named_child(lua_State *L)
long num = lua_tointeger(L, 2);
TSNode child = ts_node_named_child(node, (uint32_t)num);
- lua_pushvalue(L, 1);
- push_node(L, child);
+ push_node(L, child, 1);
return 1;
}
@@ -617,8 +657,7 @@ static int node_descendant_for_range(lua_State *L)
(uint32_t)lua_tointeger(L, 5) };
TSNode child = ts_node_descendant_for_point_range(node, start, end);
- lua_pushvalue(L, 1);
- push_node(L, child);
+ push_node(L, child, 1);
return 1;
}
@@ -634,8 +673,7 @@ static int node_named_descendant_for_range(lua_State *L)
(uint32_t)lua_tointeger(L, 5) };
TSNode child = ts_node_named_descendant_for_point_range(node, start, end);
- lua_pushvalue(L, 1);
- push_node(L, child);
+ push_node(L, child, 1);
return 1;
}
@@ -646,7 +684,254 @@ static int node_parent(lua_State *L)
return 0;
}
TSNode parent = ts_node_parent(node);
- push_node(L, parent);
+ push_node(L, parent, 1);
+ return 1;
+}
+
+/// assumes the match table being on top of the stack
+static void set_match(lua_State *L, TSQueryMatch *match, int nodeidx)
+{
+ for (int i = 0; i < match->capture_count; i++) {
+ push_node(L, match->captures[i].node, nodeidx);
+ lua_rawseti(L, -2, match->captures[i].index+1);
+ }
+}
+
+static int query_next_match(lua_State *L)
+{
+ TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
+ TSQueryCursor *cursor = ud->cursor;
+
+ TSQuery *query = query_check(L, lua_upvalueindex(3));
+ TSQueryMatch match;
+ if (ts_query_cursor_next_match(cursor, &match)) {
+ lua_pushinteger(L, match.pattern_index+1); // [index]
+ lua_createtable(L, ts_query_capture_count(query), 2); // [index, match]
+ set_match(L, &match, lua_upvalueindex(2));
+ return 2;
+ }
+ return 0;
+}
+
+
+static int query_next_capture(lua_State *L)
+{
+ TSLua_cursor *ud = lua_touserdata(L, lua_upvalueindex(1));
+ TSQueryCursor *cursor = ud->cursor;
+
+ TSQuery *query = query_check(L, lua_upvalueindex(3));
+
+ if (ud->predicated_match > -1) {
+ lua_getfield(L, lua_upvalueindex(4), "active");
+ bool active = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+ if (!active) {
+ ts_query_cursor_remove_match(cursor, ud->predicated_match);
+ }
+ ud->predicated_match = -1;
+ }
+
+ TSQueryMatch match;
+ uint32_t capture_index;
+ if (ts_query_cursor_next_capture(cursor, &match, &capture_index)) {
+ TSQueryCapture capture = match.captures[capture_index];
+
+ lua_pushinteger(L, capture.index+1); // [index]
+ push_node(L, capture.node, lua_upvalueindex(2)); // [index, node]
+
+ uint32_t n_pred;
+ ts_query_predicates_for_pattern(query, match.pattern_index, &n_pred);
+ if (n_pred > 0 && capture_index == 0) {
+ lua_pushvalue(L, lua_upvalueindex(4)); // [index, node, match]
+ set_match(L, &match, lua_upvalueindex(2));
+ lua_pushinteger(L, match.pattern_index+1);
+ lua_setfield(L, -2, "pattern");
+
+ if (match.capture_count > 1) {
+ ud->predicated_match = match.id;
+ lua_pushboolean(L, false);
+ lua_setfield(L, -2, "active");
+ }
+ return 3;
+ }
+ return 2;
+ }
+ return 0;
+}
+
+static int node_rawquery(lua_State *L)
+{
+ TSNode node;
+ if (!node_check(L, &node)) {
+ return 0;
+ }
+ TSQuery *query = query_check(L, 2);
+ // TODO(bfredl): these are expensive allegedly,
+ // use a reuse list later on?
+ TSQueryCursor *cursor = ts_query_cursor_new();
+ ts_query_cursor_exec(cursor, query, node);
+
+ bool captures = lua_toboolean(L, 3);
+
+ if (lua_gettop(L) >= 4) {
+ int start = luaL_checkinteger(L, 4);
+ int end = lua_gettop(L) >= 5 ? luaL_checkinteger(L, 5) : MAXLNUM;
+ ts_query_cursor_set_point_range(cursor,
+ (TSPoint){ start, 0 }, (TSPoint){ end, 0 });
+ }
+
+ TSLua_cursor *ud = lua_newuserdata(L, sizeof(*ud)); // [udata]
+ ud->cursor = cursor;
+ ud->predicated_match = -1;
+
+ lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_querycursor");
+ lua_setmetatable(L, -2); // [udata]
+ lua_pushvalue(L, 1); // [udata, node]
+
+ // include query separately, as to keep a ref to it for gc
+ lua_pushvalue(L, 2); // [udata, node, query]
+
+ if (captures) {
+ // placeholder for match state
+ lua_createtable(L, ts_query_capture_count(query), 2); // [u, n, q, match]
+ lua_pushcclosure(L, query_next_capture, 4); // [closure]
+ } else {
+ lua_pushcclosure(L, query_next_match, 3); // [closure]
+ }
+
return 1;
}
+static int querycursor_gc(lua_State *L)
+{
+ TSLua_cursor *ud = luaL_checkudata(L, 1, "treesitter_querycursor");
+ ts_query_cursor_delete(ud->cursor);
+ return 0;
+}
+
+// Query methods
+
+int ts_lua_parse_query(lua_State *L)
+{
+ if (lua_gettop(L) < 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2)) {
+ return luaL_error(L, "string expected");
+ }
+
+ const char *lang_name = lua_tostring(L, 1);
+ TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name);
+ if (!lang) {
+ return luaL_error(L, "no such language: %s", lang_name);
+ }
+
+ size_t len;
+ const char *src = lua_tolstring(L, 2, &len);
+
+ uint32_t error_offset;
+ TSQueryError error_type;
+ TSQuery *query = ts_query_new(lang, src, len, &error_offset, &error_type);
+
+ if (!query) {
+ return luaL_error(L, "query: %s at position %d",
+ query_err_string(error_type), (int)error_offset);
+ }
+
+ TSQuery **ud = lua_newuserdata(L, sizeof(TSQuery *)); // [udata]
+ *ud = query;
+ lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_query"); // [udata, meta]
+ lua_setmetatable(L, -2); // [udata]
+ return 1;
+}
+
+
+static const char *query_err_string(TSQueryError err) {
+ switch (err) {
+ case TSQueryErrorSyntax: return "invalid syntax";
+ case TSQueryErrorNodeType: return "invalid node type";
+ case TSQueryErrorField: return "invalid field";
+ case TSQueryErrorCapture: return "invalid capture";
+ default: return "error";
+ }
+}
+
+static TSQuery *query_check(lua_State *L, int index)
+{
+ TSQuery **ud = luaL_checkudata(L, index, "treesitter_query");
+ return *ud;
+}
+
+static int query_gc(lua_State *L)
+{
+ TSQuery *query = query_check(L, 1);
+ if (!query) {
+ return 0;
+ }
+
+ ts_query_delete(query);
+ return 0;
+}
+
+static int query_tostring(lua_State *L)
+{
+ lua_pushstring(L, "<query>");
+ return 1;
+}
+
+static int query_inspect(lua_State *L)
+{
+ TSQuery *query = query_check(L, 1);
+ if (!query) {
+ return 0;
+ }
+
+ uint32_t n_pat = ts_query_pattern_count(query);
+ lua_createtable(L, 0, 2); // [retval]
+ lua_createtable(L, n_pat, 1); // [retval, patterns]
+ for (size_t i = 0; i < n_pat; i++) {
+ uint32_t len;
+ const TSQueryPredicateStep *step = ts_query_predicates_for_pattern(query,
+ i, &len);
+ if (len == 0) {
+ continue;
+ }
+ lua_createtable(L, len/4, 1); // [retval, patterns, pat]
+ lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
+ int nextpred = 1;
+ int nextitem = 1;
+ for (size_t k = 0; k < len; k++) {
+ if (step[k].type == TSQueryPredicateStepTypeDone) {
+ lua_rawseti(L, -2, nextpred++); // [retval, patterns, pat]
+ lua_createtable(L, 3, 0); // [retval, patterns, pat, pred]
+ nextitem = 1;
+ continue;
+ }
+
+ if (step[k].type == TSQueryPredicateStepTypeString) {
+ uint32_t strlen;
+ const char *str = ts_query_string_value_for_id(query, step[k].value_id,
+ &strlen);
+ lua_pushlstring(L, str, strlen); // [retval, patterns, pat, pred, item]
+ } else if (step[k].type == TSQueryPredicateStepTypeCapture) {
+ lua_pushnumber(L, step[k].value_id+1); // [..., pat, pred, item]
+ } else {
+ abort();
+ }
+ lua_rawseti(L, -2, nextitem++); // [retval, patterns, pat, pred]
+ }
+ // last predicate should have ended with TypeDone
+ lua_pop(L, 1); // [retval, patters, pat]
+ lua_rawseti(L, -2, i+1); // [retval, patterns]
+ }
+ lua_setfield(L, -2, "patterns"); // [retval]
+
+ uint32_t n_captures = ts_query_capture_count(query);
+ lua_createtable(L, n_captures, 0); // [retval, captures]
+ for (size_t i = 0; i < n_captures; i++) {
+ uint32_t strlen;
+ const char *str = ts_query_capture_name_for_id(query, i, &strlen);
+ lua_pushlstring(L, str, strlen); // [retval, captures, capture]
+ lua_rawseti(L, -2, i+1);
+ }
+ lua_setfield(L, -2, "captures"); // [retval]
+
+ return 1;
+}
diff --git a/src/nvim/screen.c b/src/nvim/screen.c
index 1d29ae064e..4082208dd4 100644
--- a/src/nvim/screen.c
+++ b/src/nvim/screen.c
@@ -116,6 +116,8 @@
#include "nvim/window.h"
#include "nvim/os/time.h"
#include "nvim/api/private/helpers.h"
+#include "nvim/api/vim.h"
+#include "nvim/lua/executor.h"
#define MB_FILLER_CHAR '<' /* character used when a double-width character
* doesn't fit. */
@@ -232,6 +234,22 @@ void redraw_buf_line_later(buf_T *buf, linenr_T line)
}
}
+void redraw_buf_range_later(buf_T *buf, linenr_T firstline, linenr_T lastline)
+{
+ FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
+ if (wp->w_buffer == buf
+ && lastline >= wp->w_topline && firstline < wp->w_botline) {
+ if (wp->w_redraw_top == 0 || wp->w_redraw_top > firstline) {
+ wp->w_redraw_top = firstline;
+ }
+ if (wp->w_redraw_bot == 0 || wp->w_redraw_bot < lastline) {
+ wp->w_redraw_bot = lastline;
+ }
+ redraw_win_later(wp, VALID);
+ }
+ }
+}
+
/*
* Changed something in the current window, at buffer line "lnum", that
* requires that line and possibly other lines to be redrawn.
@@ -477,6 +495,19 @@ int update_screen(int type)
if (wwp == wp && syntax_present(wp)) {
syn_stack_apply_changes(wp->w_buffer);
}
+
+ buf_T *buf = wp->w_buffer;
+ if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
+ Error err = ERROR_INIT;
+ FIXED_TEMP_ARRAY(args, 2);
+ args.items[0] = BUFFER_OBJ(buf->handle);
+ args.items[1] = INTEGER_OBJ(display_tick);
+ executor_exec_lua_cb(buf->b_luahl_start, "start", args, false, &err);
+ if (ERROR_SET(&err)) {
+ ELOG("error in luahl start: %s", err.msg);
+ api_clear_error(&err);
+ }
+ }
}
}
@@ -1181,7 +1212,27 @@ static void win_update(win_T *wp)
idx = 0; /* first entry in w_lines[].wl_size */
row = 0;
srow = 0;
- lnum = wp->w_topline; /* first line shown in window */
+ lnum = wp->w_topline; // first line shown in window
+
+ if (buf->b_luahl && buf->b_luahl_window != LUA_NOREF) {
+ Error err = ERROR_INIT;
+ FIXED_TEMP_ARRAY(args, 4);
+ linenr_T knownmax = ((wp->w_valid & VALID_BOTLINE)
+ ? wp->w_botline
+ : (wp->w_topline + wp->w_height_inner));
+ args.items[0] = WINDOW_OBJ(wp->handle);
+ args.items[1] = BUFFER_OBJ(buf->handle);
+ args.items[2] = INTEGER_OBJ(wp->w_topline-1);
+ args.items[3] = INTEGER_OBJ(knownmax);
+ // TODO(bfredl): we could allow this callback to change mod_top, mod_bot.
+ // For now the "start" callback is expected to use nvim__buf_redraw_range.
+ executor_exec_lua_cb(buf->b_luahl_window, "window", args, false, &err);
+ if (ERROR_SET(&err)) {
+ ELOG("error in luahl window: %s", err.msg);
+ api_clear_error(&err);
+ }
+ }
+
for (;; ) {
/* stop updating when reached the end of the window (check for _past_
* the end of the window is at the end of the loop) */
@@ -2229,6 +2280,8 @@ win_line (
row = startrow;
+ char *luatext = NULL;
+
if (!number_only) {
// To speed up the loop below, set extra_check when there is linebreak,
// trailing white space and/or syntax processing to be done.
@@ -2454,6 +2507,41 @@ win_line (
line = ml_get_buf(wp->w_buffer, lnum, FALSE);
ptr = line;
+ buf_T *buf = wp->w_buffer;
+ if (buf->b_luahl && buf->b_luahl_line != LUA_NOREF) {
+ size_t size = STRLEN(line);
+ if (lua_attr_bufsize < size) {
+ xfree(lua_attr_buf);
+ lua_attr_buf = xcalloc(size, sizeof(*lua_attr_buf));
+ lua_attr_bufsize = size;
+ } else if (lua_attr_buf) {
+ memset(lua_attr_buf, 0, size * sizeof(*lua_attr_buf));
+ }
+ Error err = ERROR_INIT;
+ // TODO(bfredl): build a macro for the "static array" pattern
+ // in buf_updates_send_changes?
+ FIXED_TEMP_ARRAY(args, 3);
+ args.items[0] = WINDOW_OBJ(wp->handle);
+ args.items[1] = BUFFER_OBJ(buf->handle);
+ args.items[2] = INTEGER_OBJ(lnum-1);
+ lua_attr_active = true;
+ extra_check = true;
+ Object o = executor_exec_lua_cb(buf->b_luahl_line, "line",
+ args, true, &err);
+ lua_attr_active = false;
+ if (o.type == kObjectTypeString) {
+ // TODO(bfredl): this is a bit of a hack. A final API should use an
+ // "unified" interface where luahl can add both bufhl and virttext
+ luatext = o.data.string.data;
+ do_virttext = true;
+ } else if (ERROR_SET(&err)) {
+ ELOG("error in luahl line: %s", err.msg);
+ luatext = err.msg;
+ do_virttext = true;
+ api_clear_error(&err);
+ }
+ }
+
if (has_spell && !number_only) {
// For checking first word with a capital skip white space.
if (cap_col == 0) {
@@ -3429,6 +3517,10 @@ win_line (
}
}
+ if (buf->b_luahl && v > 0 && v < (long)lua_attr_bufsize+1) {
+ char_attr = hl_combine_attr(char_attr, lua_attr_buf[v-1]);
+ }
+
if (wp->w_buffer->terminal) {
char_attr = hl_combine_attr(term_attrs[vcol], char_attr);
}
@@ -3917,8 +4009,14 @@ win_line (
int rightmost_vcol = 0;
int i;
- VirtText virt_text = do_virttext ? bufhl_info.line->virt_text
- : (VirtText)KV_INITIAL_VALUE;
+ VirtText virt_text;
+ if (luatext) {
+ virt_text = (VirtText)KV_INITIAL_VALUE;
+ kv_push(virt_text, ((VirtTextChunk){ .text = luatext, .hl_id = 0 }));
+ } else {
+ virt_text = do_virttext ? bufhl_info.line->virt_text
+ : (VirtText)KV_INITIAL_VALUE;
+ }
size_t virt_pos = 0;
LineState s = LINE_STATE((char_u *)"");
int virt_attr = 0;
@@ -4319,6 +4417,7 @@ win_line (
}
xfree(p_extra_free);
+ xfree(luatext);
return row;
}
diff --git a/src/tree_sitter/api.h b/src/tree_sitter/api.h
index d39d0521ee..40187e3db0 100644
--- a/src/tree_sitter/api.h
+++ b/src/tree_sitter/api.h
@@ -14,7 +14,19 @@ extern "C" {
/* Section - ABI Versioning */
/****************************/
+/**
+ * The latest ABI version that is supported by the current version of the
+ * library. When Languages are generated by the Tree-sitter CLI, they are
+ * assigned an ABI version number that corresponds to the current CLI version.
+ * The Tree-sitter library is generally backwards-compatible with languages
+ * generated using older CLI versions, but is not forwards-compatible.
+ */
#define TREE_SITTER_LANGUAGE_VERSION 11
+
+/**
+ * The earliest ABI version that is supported by the current version of the
+ * library.
+ */
#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9
/*******************/
@@ -26,6 +38,8 @@ typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
typedef struct TSParser TSParser;
typedef struct TSTree TSTree;
+typedef struct TSQuery TSQuery;
+typedef struct TSQueryCursor TSQueryCursor;
typedef enum {
TSInputEncodingUTF8,
@@ -87,6 +101,37 @@ typedef struct {
uint32_t context[2];
} TSTreeCursor;
+typedef struct {
+ TSNode node;
+ uint32_t index;
+} TSQueryCapture;
+
+typedef struct {
+ uint32_t id;
+ uint16_t pattern_index;
+ uint16_t capture_count;
+ const TSQueryCapture *captures;
+} TSQueryMatch;
+
+typedef enum {
+ TSQueryPredicateStepTypeDone,
+ TSQueryPredicateStepTypeCapture,
+ TSQueryPredicateStepTypeString,
+} TSQueryPredicateStepType;
+
+typedef struct {
+ TSQueryPredicateStepType type;
+ uint32_t value_id;
+} TSQueryPredicateStep;
+
+typedef enum {
+ TSQueryErrorNone = 0,
+ TSQueryErrorSyntax,
+ TSQueryErrorNodeType,
+ TSQueryErrorField,
+ TSQueryErrorCapture,
+} TSQueryError;
+
/********************/
/* Section - Parser */
/********************/
@@ -119,7 +164,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language);
const TSLanguage *ts_parser_language(const TSParser *self);
/**
- * Set the spans of text that the parser should include when parsing.
+ * Set the ranges of text that the parser should include when parsing.
*
* By default, the parser will always include entire documents. This function
* allows you to parse only a *portion* of a document but still return a syntax
@@ -226,14 +271,16 @@ TSTree *ts_parser_parse_string_encoding(
* by default, it will resume where it left off on the next call to
* `ts_parser_parse` or other parsing functions. If you don't want to resume,
* and instead intend to use this parser to parse some other document, you must
- * call this `ts_parser_reset` first.
+ * call `ts_parser_reset` first.
*/
void ts_parser_reset(TSParser *self);
/**
* Set the maximum duration in microseconds that parsing should be allowed to
- * take before halting. If parsing takes longer than this, it will halt early,
- * returning NULL. See `ts_parser_parse` for more information.
+ * take before halting.
+ *
+ * If parsing takes longer than this, it will halt early, returning NULL.
+ * See `ts_parser_parse` for more information.
*/
void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
@@ -243,10 +290,11 @@ void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout);
uint64_t ts_parser_timeout_micros(const TSParser *self);
/**
- * Set the parser's current cancellation flag pointer. If a non-null pointer is
- * assigned, then the parser will periodically read from this pointer during
- * parsing. If it reads a non-zero value, it will halt early, returning NULL.
- * See `ts_parser_parse` for more information.
+ * Set the parser's current cancellation flag pointer.
+ *
+ * If a non-null pointer is assigned, then the parser will periodically read
+ * from this pointer during parsing. If it reads a non-zero value, it will
+ * halt early, returning NULL. See `ts_parser_parse` for more information.
*/
void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag);
@@ -322,22 +370,22 @@ const TSLanguage *ts_tree_language(const TSTree *);
void ts_tree_edit(TSTree *self, const TSInputEdit *edit);
/**
- * Compare a new syntax tree to a previous syntax tree representing the same
+ * Compare an old edited syntax tree to a new syntax tree representing the same
* document, returning an array of ranges whose syntactic structure has changed.
*
* For this to work correctly, the old syntax tree must have been edited such
* that its ranges match up to the new tree. Generally, you'll want to call
- * this function right after calling one of the `ts_parser_parse` functions,
- * passing in the new tree that was returned from `ts_parser_parse` and the old
- * tree that was passed as a parameter.
+ * this function right after calling one of the `ts_parser_parse` functions.
+ * You need to pass the old tree that was passed to parse, as well as the new
+ * tree that was returned from that function.
*
* The returned array is allocated using `malloc` and the caller is responsible
* for freeing it using `free`. The length of the array will be written to the
* given `length` pointer.
*/
TSRange *ts_tree_get_changed_ranges(
- const TSTree *self,
const TSTree *old_tree,
+ const TSTree *new_tree,
uint32_t *length
);
@@ -409,8 +457,8 @@ bool ts_node_is_named(TSNode);
bool ts_node_is_missing(TSNode);
/**
- * Check if the node is *missing*. Missing nodes are inserted by the parser in
- * order to recover from certain kinds of syntax errors.
+ * Check if the node is *extra*. Extra nodes represent things like comments,
+ * which are not required the grammar, but can appear anywhere.
*/
bool ts_node_is_extra(TSNode);
@@ -542,7 +590,7 @@ TSTreeCursor ts_tree_cursor_new(TSNode);
void ts_tree_cursor_delete(TSTreeCursor *);
/**
- * Re-initialize a tree cursor to start at a different ndoe.
+ * Re-initialize a tree cursor to start at a different node.
*/
void ts_tree_cursor_reset(TSTreeCursor *, TSNode);
@@ -584,7 +632,7 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *);
bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
/**
- * Move the cursor to the first schild of its current node.
+ * Move the cursor to the first child of its current node.
*
* This returns `true` if the cursor successfully moved, and returns `false`
* if there were no children.
@@ -592,7 +640,7 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *);
bool ts_tree_cursor_goto_first_child(TSTreeCursor *);
/**
- * Move the cursor to the first schild of its current node that extends beyond
+ * Move the cursor to the first child of its current node that extends beyond
* the given byte offset.
*
* This returns the index of the child node if one was found, and returns -1
@@ -602,6 +650,156 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t);
TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *);
+/*******************/
+/* Section - Query */
+/*******************/
+
+/**
+ * Create a new query from a string containing one or more S-expression
+ * patterns. The query is associated with a particular language, and can
+ * only be run on syntax nodes parsed with that language.
+ *
+ * If all of the given patterns are valid, this returns a `TSQuery`.
+ * If a pattern is invalid, this returns `NULL`, and provides two pieces
+ * of information about the problem:
+ * 1. The byte offset of the error is written to the `error_offset` parameter.
+ * 2. The type of error is written to the `error_type` parameter.
+ */
+TSQuery *ts_query_new(
+ const TSLanguage *language,
+ const char *source,
+ uint32_t source_len,
+ uint32_t *error_offset,
+ TSQueryError *error_type
+);
+
+/**
+ * Delete a query, freeing all of the memory that it used.
+ */
+void ts_query_delete(TSQuery *);
+
+/**
+ * Get the number of patterns, captures, or string literals in the query.
+ */
+uint32_t ts_query_pattern_count(const TSQuery *);
+uint32_t ts_query_capture_count(const TSQuery *);
+uint32_t ts_query_string_count(const TSQuery *);
+
+/**
+ * Get the byte offset where the given pattern starts in the query's source.
+ *
+ * This can be useful when combining queries by concatenating their source
+ * code strings.
+ */
+uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t);
+
+/**
+ * Get all of the predicates for the given pattern in the query.
+ *
+ * The predicates are represented as a single array of steps. There are three
+ * types of steps in this array, which correspond to the three legal values for
+ * the `type` field:
+ * - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names
+ * of captures. Their `value_id` can be used with the
+ * `ts_query_capture_name_for_id` function to obtain the name of the capture.
+ * - `TSQueryPredicateStepTypeString` - Steps with this type represent literal
+ * strings. Their `value_id` can be used with the
+ * `ts_query_string_value_for_id` function to obtain their string value.
+ * - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels*
+ * that represent the end of an individual predicate. If a pattern has two
+ * predicates, then there will be two steps with this `type` in the array.
+ */
+const TSQueryPredicateStep *ts_query_predicates_for_pattern(
+ const TSQuery *self,
+ uint32_t pattern_index,
+ uint32_t *length
+);
+
+/**
+ * Get the name and length of one of the query's captures, or one of the
+ * query's string literals. Each capture and string is associated with a
+ * numeric id based on the order that it appeared in the query's source.
+ */
+const char *ts_query_capture_name_for_id(
+ const TSQuery *,
+ uint32_t id,
+ uint32_t *length
+);
+const char *ts_query_string_value_for_id(
+ const TSQuery *,
+ uint32_t id,
+ uint32_t *length
+);
+
+/**
+ * Disable a certain capture within a query. This prevents the capture
+ * from being returned in matches, and also avoids any resource usage
+ * associated with recording the capture.
+ */
+void ts_query_disable_capture(TSQuery *, const char *, uint32_t);
+
+/**
+ * Create a new cursor for executing a given query.
+ *
+ * The cursor stores the state that is needed to iteratively search
+ * for matches. To use the query cursor, first call `ts_query_cursor_exec`
+ * to start running a given query on a given syntax node. Then, there are
+ * two options for consuming the results of the query:
+ * 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the
+ * the *matches* in the order that they were found. Each match contains the
+ * index of the pattern that matched, and an array of captures. Because
+ * multiple patterns can match the same set of nodes, one match may contain
+ * captures that appear *before* some of the captures from a previous match.
+ * 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the
+ * individual *captures* in the order that they appear. This is useful if
+ * don't care about which pattern matched, and just want a single ordered
+ * sequence of captures.
+ *
+ * If you don't care about consuming all of the results, you can stop calling
+ * `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point.
+ * You can then start executing another query on another node by calling
+ * `ts_query_cursor_exec` again.
+ */
+TSQueryCursor *ts_query_cursor_new(void);
+
+/**
+ * Delete a query cursor, freeing all of the memory that it used.
+ */
+void ts_query_cursor_delete(TSQueryCursor *);
+
+/**
+ * Start running a given query on a given node.
+ */
+void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode);
+
+/**
+ * Set the range of bytes or (row, column) positions in which the query
+ * will be executed.
+ */
+void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t);
+void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint);
+
+/**
+ * Advance to the next match of the currently running query.
+ *
+ * If there is a match, write it to `*match` and return `true`.
+ * Otherwise, return `false`.
+ */
+bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match);
+void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id);
+
+/**
+ * Advance to the next capture of the currently running query.
+ *
+ * If there is a capture, write its match to `*match` and its index within
+ * the matche's capture list to `*capture_index`. Otherwise, return `false`.
+ */
+bool ts_query_cursor_next_capture(
+ TSQueryCursor *,
+ TSQueryMatch *match,
+ uint32_t *capture_index
+);
+
/**********************/
/* Section - Language */
/**********************/
@@ -619,7 +817,12 @@ const char *ts_language_symbol_name(const TSLanguage *, TSSymbol);
/**
* Get the numerical id for the given node type string.
*/
-TSSymbol ts_language_symbol_for_name(const TSLanguage *, const char *);
+TSSymbol ts_language_symbol_for_name(
+ const TSLanguage *self,
+ const char *string,
+ uint32_t length,
+ bool is_named
+);
/**
* Get the number of distinct field names in the language.
diff --git a/src/tree_sitter/bits.h b/src/tree_sitter/bits.h
new file mode 100644
index 0000000000..3bec455dd1
--- /dev/null
+++ b/src/tree_sitter/bits.h
@@ -0,0 +1,29 @@
+#ifndef TREE_SITTER_BITS_H_
+#define TREE_SITTER_BITS_H_
+
+#include <stdint.h>
+
+static inline uint32_t bitmask_for_index(uint16_t id) {
+ return (1u << (31 - id));
+}
+
+#ifdef _WIN32
+
+#include <intrin.h>
+
+static inline uint32_t count_leading_zeros(uint32_t x) {
+ if (x == 0) return 32;
+ uint32_t result;
+ _BitScanReverse(&result, x);
+ return 31 - result;
+}
+
+#else
+
+static inline uint32_t count_leading_zeros(uint32_t x) {
+ if (x == 0) return 32;
+ return __builtin_clz(x);
+}
+
+#endif
+#endif // TREE_SITTER_BITS_H_
diff --git a/src/tree_sitter/language.c b/src/tree_sitter/language.c
index 1bfb1a8d03..e240ef2a53 100644
--- a/src/tree_sitter/language.c
+++ b/src/tree_sitter/language.c
@@ -3,8 +3,28 @@
#include "./error_costs.h"
#include <string.h>
-void ts_language_table_entry(const TSLanguage *self, TSStateId state,
- TSSymbol symbol, TableEntry *result) {
+uint32_t ts_language_symbol_count(const TSLanguage *self) {
+ return self->symbol_count + self->alias_count;
+}
+
+uint32_t ts_language_version(const TSLanguage *self) {
+ return self->version;
+}
+
+uint32_t ts_language_field_count(const TSLanguage *self) {
+ if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
+ return self->field_count;
+ } else {
+ return 0;
+ }
+}
+
+void ts_language_table_entry(
+ const TSLanguage *self,
+ TSStateId state,
+ TSSymbol symbol,
+ TableEntry *result
+) {
if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) {
result->action_count = 0;
result->is_reusable = false;
@@ -19,48 +39,72 @@ void ts_language_table_entry(const TSLanguage *self, TSStateId state,
}
}
-uint32_t ts_language_symbol_count(const TSLanguage *language) {
- return language->symbol_count + language->alias_count;
-}
-
-uint32_t ts_language_version(const TSLanguage *language) {
- return language->version;
-}
-
-TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) {
+TSSymbolMetadata ts_language_symbol_metadata(
+ const TSLanguage *self,
+ TSSymbol symbol
+) {
if (symbol == ts_builtin_sym_error) {
return (TSSymbolMetadata){.visible = true, .named = true};
} else if (symbol == ts_builtin_sym_error_repeat) {
return (TSSymbolMetadata){.visible = false, .named = false};
} else {
- return language->symbol_metadata[symbol];
+ return self->symbol_metadata[symbol];
+ }
+}
+
+TSSymbol ts_language_public_symbol(
+ const TSLanguage *self,
+ TSSymbol symbol
+) {
+ if (symbol == ts_builtin_sym_error) return symbol;
+ if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
+ return self->public_symbol_map[symbol];
+ } else {
+ return symbol;
}
}
-const char *ts_language_symbol_name(const TSLanguage *language, TSSymbol symbol) {
+const char *ts_language_symbol_name(
+ const TSLanguage *self,
+ TSSymbol symbol
+) {
if (symbol == ts_builtin_sym_error) {
return "ERROR";
} else if (symbol == ts_builtin_sym_error_repeat) {
return "_ERROR";
} else {
- return language->symbol_names[symbol];
+ return self->symbol_names[symbol];
}
}
-TSSymbol ts_language_symbol_for_name(const TSLanguage *self, const char *name) {
- if (!strcmp(name, "ERROR")) return ts_builtin_sym_error;
-
+TSSymbol ts_language_symbol_for_name(
+ const TSLanguage *self,
+ const char *string,
+ uint32_t length,
+ bool is_named
+) {
+ if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error;
uint32_t count = ts_language_symbol_count(self);
for (TSSymbol i = 0; i < count; i++) {
- if (!strcmp(self->symbol_names[i], name)) {
- return i;
+ TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i);
+ if (!metadata.visible || metadata.named != is_named) continue;
+ const char *symbol_name = self->symbol_names[i];
+ if (!strncmp(symbol_name, string, length) && !symbol_name[length]) {
+ if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
+ return self->public_symbol_map[i];
+ } else {
+ return i;
+ }
}
}
return 0;
}
-TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol) {
- TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol);
+TSSymbolType ts_language_symbol_type(
+ const TSLanguage *self,
+ TSSymbol symbol
+) {
+ TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol);
if (metadata.named) {
return TSSymbolTypeRegular;
} else if (metadata.visible) {
@@ -70,15 +114,10 @@ TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol
}
}
-uint32_t ts_language_field_count(const TSLanguage *self) {
- if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) {
- return self->field_count;
- } else {
- return 0;
- }
-}
-
-const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id) {
+const char *ts_language_field_name_for_id(
+ const TSLanguage *self,
+ TSFieldId id
+) {
uint32_t count = ts_language_field_count(self);
if (count) {
return self->field_names[id];
@@ -96,7 +135,8 @@ TSFieldId ts_language_field_id_for_name(
for (TSSymbol i = 1; i < count + 1; i++) {
switch (strncmp(name, self->field_names[i], name_length)) {
case 0:
- return i;
+ if (self->field_names[i][name_length] == 0) return i;
+ break;
case -1:
return 0;
default:
diff --git a/src/tree_sitter/language.h b/src/tree_sitter/language.h
index 0741486a1b..d7e17c3d70 100644
--- a/src/tree_sitter/language.h
+++ b/src/tree_sitter/language.h
@@ -10,6 +10,7 @@ extern "C" {
#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1)
#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10
+#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11
#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11
typedef struct {
@@ -22,6 +23,8 @@ void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry
TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol);
+TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol);
+
static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) {
return 0 < symbol && symbol < self->external_token_count + 1;
}
diff --git a/src/tree_sitter/lexer.c b/src/tree_sitter/lexer.c
index fdc127466f..e2ca851973 100644
--- a/src/tree_sitter/lexer.c
+++ b/src/tree_sitter/lexer.c
@@ -2,26 +2,58 @@
#include "./lexer.h"
#include "./subtree.h"
#include "./length.h"
-#include "./utf16.h"
-#include "utf8proc.h"
-
-#define LOG(...) \
- if (self->logger.log) { \
- snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \
- self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); \
+#include "./unicode.h"
+
+#define LOG(message, character) \
+ if (self->logger.log) { \
+ snprintf( \
+ self->debug_buffer, \
+ TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
+ 32 <= character && character < 127 ? \
+ message " character:'%c'" : \
+ message " character:%d", \
+ character \
+ ); \
+ self->logger.log( \
+ self->logger.payload, \
+ TSLogTypeLex, \
+ self->debug_buffer \
+ ); \
}
-#define LOG_CHARACTER(message, character) \
- LOG( \
- 32 <= character && character < 127 ? \
- message " character:'%c'" : \
- message " character:%d", character \
- )
+static const int32_t BYTE_ORDER_MARK = 0xFEFF;
-static const char empty_chunk[3] = { 0, 0 };
+static const TSRange DEFAULT_RANGE = {
+ .start_point = {
+ .row = 0,
+ .column = 0,
+ },
+ .end_point = {
+ .row = UINT32_MAX,
+ .column = UINT32_MAX,
+ },
+ .start_byte = 0,
+ .end_byte = UINT32_MAX
+};
-static const int32_t BYTE_ORDER_MARK = 0xFEFF;
+// Check if the lexer has reached EOF. This state is stored
+// by setting the lexer's `current_included_range_index` such that
+// it has consumed all of its available ranges.
+static bool ts_lexer__eof(const TSLexer *_self) {
+ Lexer *self = (Lexer *)_self;
+ return self->current_included_range_index == self->included_range_count;
+}
+
+// Clear the currently stored chunk of source code, because the lexer's
+// position has changed.
+static void ts_lexer__clear_chunk(Lexer *self) {
+ self->chunk = NULL;
+ self->chunk_size = 0;
+ self->chunk_start = 0;
+}
+// Call the lexer's input callback to obtain a new chunk of source code
+// for the current position.
static void ts_lexer__get_chunk(Lexer *self) {
self->chunk_start = self->current_position.bytes;
self->chunk = self->input.read(
@@ -30,15 +62,15 @@ static void ts_lexer__get_chunk(Lexer *self) {
self->current_position.extent,
&self->chunk_size
);
- if (!self->chunk_size) self->chunk = empty_chunk;
+ if (!self->chunk_size) {
+ self->current_included_range_index = self->included_range_count;
+ self->chunk = NULL;
+ }
}
-typedef utf8proc_ssize_t (*DecodeFunction)(
- const utf8proc_uint8_t *,
- utf8proc_ssize_t,
- utf8proc_int32_t *
-);
-
+// Decode the next unicode character in the current chunk of source code.
+// This assumes that the lexer has already retrieved a chunk of source
+// code that spans the current position.
static void ts_lexer__get_lookahead(Lexer *self) {
uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
@@ -50,29 +82,37 @@ static void ts_lexer__get_lookahead(Lexer *self) {
return;
}
- DecodeFunction decode =
- self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate;
+ UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
+ ? ts_decode_utf8
+ : ts_decode_utf16;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
// If this chunk ended in the middle of a multi-byte character,
// try again with a fresh chunk.
- if (self->data.lookahead == -1 && size < 4) {
+ if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
ts_lexer__get_chunk(self);
chunk = (const uint8_t *)self->chunk;
size = self->chunk_size;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
}
- if (self->data.lookahead == -1) {
+ if (self->data.lookahead == TS_DECODE_ERROR) {
self->lookahead_size = 1;
}
}
-static void ts_lexer__advance(TSLexer *payload, bool skip) {
- Lexer *self = (Lexer *)payload;
- if (self->chunk == empty_chunk)
- return;
+// Advance to the next character in the source code, retrieving a new
+// chunk of source code if needed.
+static void ts_lexer__advance(TSLexer *_self, bool skip) {
+ Lexer *self = (Lexer *)_self;
+ if (!self->chunk) return;
+
+ if (skip) {
+ LOG("skip", self->data.lookahead);
+ } else {
+ LOG("consume", self->data.lookahead);
+ }
if (self->lookahead_size) {
self->current_position.bytes += self->lookahead_size;
@@ -84,53 +124,65 @@ static void ts_lexer__advance(TSLexer *payload, bool skip) {
}
}
- TSRange *current_range = &self->included_ranges[self->current_included_range_index];
- if (self->current_position.bytes == current_range->end_byte) {
- self->current_included_range_index++;
- if (self->current_included_range_index == self->included_range_count) {
- self->data.lookahead = '\0';
- self->lookahead_size = 1;
- return;
- } else {
- current_range++;
- self->current_position = (Length) {
- current_range->start_byte,
- current_range->start_point,
- };
+ const TSRange *current_range = NULL;
+ if (self->current_included_range_index < self->included_range_count) {
+ current_range = &self->included_ranges[self->current_included_range_index];
+ if (self->current_position.bytes == current_range->end_byte) {
+ self->current_included_range_index++;
+ if (self->current_included_range_index < self->included_range_count) {
+ current_range++;
+ self->current_position = (Length) {
+ current_range->start_byte,
+ current_range->start_point,
+ };
+ } else {
+ current_range = NULL;
+ }
}
}
- if (skip) {
- LOG_CHARACTER("skip", self->data.lookahead);
- self->token_start_position = self->current_position;
- } else {
- LOG_CHARACTER("consume", self->data.lookahead);
- }
+ if (skip) self->token_start_position = self->current_position;
- if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
- ts_lexer__get_chunk(self);
+ if (current_range) {
+ if (self->current_position.bytes >= self->chunk_start + self->chunk_size) {
+ ts_lexer__get_chunk(self);
+ }
+ ts_lexer__get_lookahead(self);
+ } else {
+ ts_lexer__clear_chunk(self);
+ self->data.lookahead = '\0';
+ self->lookahead_size = 1;
}
-
- ts_lexer__get_lookahead(self);
}
-static void ts_lexer__mark_end(TSLexer *payload) {
- Lexer *self = (Lexer *)payload;
- TSRange *current_included_range = &self->included_ranges[self->current_included_range_index];
- if (self->current_included_range_index > 0 &&
- self->current_position.bytes == current_included_range->start_byte) {
- TSRange *previous_included_range = current_included_range - 1;
- self->token_end_position = (Length) {
- previous_included_range->end_byte,
- previous_included_range->end_point,
- };
- } else {
- self->token_end_position = self->current_position;
+// Mark that a token match has completed. This can be called multiple
+// times if a longer match is found later.
+static void ts_lexer__mark_end(TSLexer *_self) {
+ Lexer *self = (Lexer *)_self;
+ if (!ts_lexer__eof(&self->data)) {
+ // If the lexer is right at the beginning of included range,
+ // then the token should be considered to end at the *end* of the
+ // previous included range, rather than here.
+ TSRange *current_included_range = &self->included_ranges[
+ self->current_included_range_index
+ ];
+ if (
+ self->current_included_range_index > 0 &&
+ self->current_position.bytes == current_included_range->start_byte
+ ) {
+ TSRange *previous_included_range = current_included_range - 1;
+ self->token_end_position = (Length) {
+ previous_included_range->end_byte,
+ previous_included_range->end_point,
+ };
+ return;
+ }
}
+ self->token_end_position = self->current_position;
}
-static uint32_t ts_lexer__get_column(TSLexer *payload) {
- Lexer *self = (Lexer *)payload;
+static uint32_t ts_lexer__get_column(TSLexer *_self) {
+ Lexer *self = (Lexer *)_self;
uint32_t goal_byte = self->current_position.bytes;
self->current_position.bytes -= self->current_position.extent.column;
@@ -142,67 +194,69 @@ static uint32_t ts_lexer__get_column(TSLexer *payload) {
uint32_t result = 0;
while (self->current_position.bytes < goal_byte) {
- ts_lexer__advance(payload, false);
+ ts_lexer__advance(&self->data, false);
result++;
}
return result;
}
-static bool ts_lexer__is_at_included_range_start(TSLexer *payload) {
- const Lexer *self = (const Lexer *)payload;
- TSRange *current_range = &self->included_ranges[self->current_included_range_index];
- return self->current_position.bytes == current_range->start_byte;
+// Is the lexer at a boundary between two disjoint included ranges of
+// source code? This is exposed as an API because some languages' external
+// scanners need to perform custom actions at these bounaries.
+static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
+ const Lexer *self = (const Lexer *)_self;
+ if (self->current_included_range_index < self->included_range_count) {
+ TSRange *current_range = &self->included_ranges[self->current_included_range_index];
+ return self->current_position.bytes == current_range->start_byte;
+ } else {
+ return false;
+ }
}
-// The lexer's methods are stored as a struct field so that generated
-// parsers can call them without needing to be linked against this library.
-
void ts_lexer_init(Lexer *self) {
*self = (Lexer) {
.data = {
+ // The lexer's methods are stored as struct fields so that generated
+ // parsers can call them without needing to be linked against this
+ // library.
.advance = ts_lexer__advance,
.mark_end = ts_lexer__mark_end,
.get_column = ts_lexer__get_column,
.is_at_included_range_start = ts_lexer__is_at_included_range_start,
+ .eof = ts_lexer__eof,
.lookahead = 0,
.result_symbol = 0,
},
.chunk = NULL,
+ .chunk_size = 0,
.chunk_start = 0,
- .current_position = {UINT32_MAX, {0, 0}},
+ .current_position = {0, {0, 0}},
.logger = {
.payload = NULL,
.log = NULL
},
+ .included_ranges = NULL,
+ .included_range_count = 0,
.current_included_range_index = 0,
};
-
- self->included_ranges = NULL;
ts_lexer_set_included_ranges(self, NULL, 0);
- ts_lexer_reset(self, length_zero());
}
void ts_lexer_delete(Lexer *self) {
ts_free(self->included_ranges);
}
-void ts_lexer_set_input(Lexer *self, TSInput input) {
- self->input = input;
- self->data.lookahead = 0;
- self->lookahead_size = 0;
- self->chunk = 0;
- self->chunk_start = 0;
- self->chunk_size = 0;
-}
-
static void ts_lexer_goto(Lexer *self, Length position) {
+ self->current_position = position;
bool found_included_range = false;
+
+ // Move to the first valid position at or after the given position.
for (unsigned i = 0; i < self->included_range_count; i++) {
TSRange *included_range = &self->included_ranges[i];
if (included_range->end_byte > position.bytes) {
if (included_range->start_byte > position.bytes) {
- position = (Length) {
+ self->current_position = (Length) {
.bytes = included_range->start_byte,
.extent = included_range->start_point,
};
@@ -214,46 +268,61 @@ static void ts_lexer_goto(Lexer *self, Length position) {
}
}
- if (!found_included_range) {
+ if (found_included_range) {
+ // If the current position is outside of the current chunk of text,
+ // then clear out the current chunk of text.
+ if (self->chunk && (
+ position.bytes < self->chunk_start ||
+ position.bytes >= self->chunk_start + self->chunk_size
+ )) {
+ ts_lexer__clear_chunk(self);
+ }
+
+ self->lookahead_size = 0;
+ self->data.lookahead = '\0';
+ }
+
+ // If the given position is beyond any of included ranges, move to the EOF
+ // state - past the end of the included ranges.
+ else {
+ self->current_included_range_index = self->included_range_count;
TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
- position = (Length) {
+ self->current_position = (Length) {
.bytes = last_included_range->end_byte,
.extent = last_included_range->end_point,
};
- self->chunk = empty_chunk;
- self->chunk_start = position.bytes;
- self->chunk_size = 2;
- }
-
- self->token_start_position = position;
- self->token_end_position = LENGTH_UNDEFINED;
- self->current_position = position;
-
- if (self->chunk && (position.bytes < self->chunk_start ||
- position.bytes >= self->chunk_start + self->chunk_size)) {
- self->chunk = 0;
- self->chunk_start = 0;
- self->chunk_size = 0;
+ ts_lexer__clear_chunk(self);
+ self->lookahead_size = 1;
+ self->data.lookahead = '\0';
}
+}
- self->lookahead_size = 0;
- self->data.lookahead = 0;
+void ts_lexer_set_input(Lexer *self, TSInput input) {
+ self->input = input;
+ ts_lexer__clear_chunk(self);
+ ts_lexer_goto(self, self->current_position);
}
+// Move the lexer to the given position. This doesn't do any work
+// if the parser is already at the given position.
void ts_lexer_reset(Lexer *self, Length position) {
- if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position);
+ if (position.bytes != self->current_position.bytes) {
+ ts_lexer_goto(self, position);
+ }
}
void ts_lexer_start(Lexer *self) {
self->token_start_position = self->current_position;
self->token_end_position = LENGTH_UNDEFINED;
self->data.result_symbol = 0;
- if (!self->chunk) ts_lexer__get_chunk(self);
- if (!self->lookahead_size) ts_lexer__get_lookahead(self);
- if (
- self->current_position.bytes == 0 &&
- self->data.lookahead == BYTE_ORDER_MARK
- ) ts_lexer__advance((TSLexer *)self, true);
+ if (!ts_lexer__eof(&self->data)) {
+ if (!self->chunk_size) ts_lexer__get_chunk(self);
+ if (!self->lookahead_size) ts_lexer__get_lookahead(self);
+ if (
+ self->current_position.bytes == 0 &&
+ self->data.lookahead == BYTE_ORDER_MARK
+ ) ts_lexer__advance(&self->data, true);
+ }
}
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
@@ -267,7 +336,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
// the character decoding algorithm may have looked at the following byte.
// Therefore, the next byte *after* the current (invalid) character
// affects the interpretation of the current character.
- if (self->data.lookahead == -1) {
+ if (self->data.lookahead == TS_DECODE_ERROR) {
current_lookahead_end_byte++;
}
@@ -277,8 +346,8 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
}
void ts_lexer_advance_to_end(Lexer *self) {
- while (self->data.lookahead != 0) {
- ts_lexer__advance((TSLexer *)self, false);
+ while (self->chunk) {
+ ts_lexer__advance(&self->data, false);
}
}
@@ -286,30 +355,19 @@ void ts_lexer_mark_end(Lexer *self) {
ts_lexer__mark_end(&self->data);
}
-static const TSRange DEFAULT_RANGES[] = {
- {
- .start_point = {
- .row = 0,
- .column = 0,
- },
- .end_point = {
- .row = UINT32_MAX,
- .column = UINT32_MAX,
- },
- .start_byte = 0,
- .end_byte = UINT32_MAX
- }
-};
-
-void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count) {
- if (!ranges) {
- ranges = DEFAULT_RANGES;
+void ts_lexer_set_included_ranges(
+ Lexer *self,
+ const TSRange *ranges,
+ uint32_t count
+) {
+ if (count == 0 || !ranges) {
+ ranges = &DEFAULT_RANGE;
count = 1;
}
- size_t sz = count * sizeof(TSRange);
- self->included_ranges = ts_realloc(self->included_ranges, sz);
- memcpy(self->included_ranges, ranges, sz);
+ size_t size = count * sizeof(TSRange);
+ self->included_ranges = ts_realloc(self->included_ranges, size);
+ memcpy(self->included_ranges, ranges, size);
self->included_range_count = count;
ts_lexer_goto(self, self->current_position);
}
diff --git a/src/tree_sitter/lexer.h b/src/tree_sitter/lexer.h
index f523d88f65..8cd9c26706 100644
--- a/src/tree_sitter/lexer.h
+++ b/src/tree_sitter/lexer.h
@@ -16,7 +16,7 @@ typedef struct {
Length token_start_position;
Length token_end_position;
- TSRange * included_ranges;
+ TSRange *included_ranges;
size_t included_range_count;
size_t current_included_range_index;
diff --git a/src/tree_sitter/lib.c b/src/tree_sitter/lib.c
index fc5fbc9210..289d32f4c5 100644
--- a/src/tree_sitter/lib.c
+++ b/src/tree_sitter/lib.c
@@ -2,19 +2,16 @@
//
// The following directories must be added to the include path:
// - include
-// - utf8proc
#define _POSIX_C_SOURCE 200112L
-#define UTF8PROC_STATIC
#include "./get_changed_ranges.c"
#include "./language.c"
#include "./lexer.c"
#include "./node.c"
#include "./parser.c"
+#include "./query.c"
#include "./stack.c"
#include "./subtree.c"
#include "./tree_cursor.c"
#include "./tree.c"
-#include "./utf16.c"
-#include "utf8proc.c"
diff --git a/src/tree_sitter/node.c b/src/tree_sitter/node.c
index 6b2be36ee5..b03e2fc979 100644
--- a/src/tree_sitter/node.c
+++ b/src/tree_sitter/node.c
@@ -415,13 +415,15 @@ TSPoint ts_node_end_point(TSNode self) {
}
TSSymbol ts_node_symbol(TSNode self) {
- return ts_node__alias(&self)
- ? ts_node__alias(&self)
- : ts_subtree_symbol(ts_node__subtree(self));
+ TSSymbol symbol = ts_node__alias(&self);
+ if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
+ return ts_language_public_symbol(self.tree->language, symbol);
}
const char *ts_node_type(TSNode self) {
- return ts_language_symbol_name(self.tree->language, ts_node_symbol(self));
+ TSSymbol symbol = ts_node__alias(&self);
+ if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self));
+ return ts_language_symbol_name(self.tree->language, symbol);
}
char *ts_node_string(TSNode self) {
diff --git a/src/tree_sitter/parser.c b/src/tree_sitter/parser.c
index 88b20845fd..f381afccab 100644
--- a/src/tree_sitter/parser.c
+++ b/src/tree_sitter/parser.c
@@ -351,6 +351,7 @@ static Subtree ts_parser__lex(
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
+ if (lex_mode.lex_state == (uint16_t)-1) return NULL_SUBTREE;
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
@@ -438,7 +439,7 @@ static Subtree ts_parser__lex(
}
if (self->lexer.current_position.bytes == error_end_position.bytes) {
- if (self->lexer.data.lookahead == 0) {
+ if (self->lexer.data.eof(&self->lexer.data)) {
self->lexer.data.result_symbol = ts_builtin_sym_error;
break;
}
@@ -748,7 +749,8 @@ static StackVersion ts_parser__reduce(
uint32_t count,
int dynamic_precedence,
uint16_t production_id,
- bool fragile
+ bool is_fragile,
+ bool is_extra
) {
uint32_t initial_version_count = ts_stack_version_count(self->stack);
uint32_t removed_version_count = 0;
@@ -813,7 +815,8 @@ static StackVersion ts_parser__reduce(
TSStateId state = ts_stack_state(self->stack, slice_version);
TSStateId next_state = ts_language_next_state(self->language, state, symbol);
- if (fragile || pop.size > 1 || initial_version_count > 1) {
+ if (is_extra) parent.ptr->extra = true;
+ if (is_fragile || pop.size > 1 || initial_version_count > 1) {
parent.ptr->fragile_left = true;
parent.ptr->fragile_right = true;
parent.ptr->parse_state = TS_TREE_STATE_NONE;
@@ -962,7 +965,7 @@ static bool ts_parser__do_all_potential_reductions(
reduction_version = ts_parser__reduce(
self, version, action.symbol, action.count,
action.dynamic_precedence, action.production_id,
- true
+ true, false
);
}
@@ -1366,8 +1369,17 @@ static bool ts_parser__advance(
// Otherwise, re-run the lexer.
if (!lookahead.ptr) {
lookahead = ts_parser__lex(self, version, state);
- ts_parser__set_cached_token(self, position, last_external_token, lookahead);
- ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+ if (lookahead.ptr) {
+ ts_parser__set_cached_token(self, position, last_external_token, lookahead);
+ ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
+ }
+
+ // When parsing a non-terminal extra, a null lookahead indicates the
+ // end of the rule. The reduction is stored in the EOF table entry.
+ // After the reduction, the lexer needs to be run again.
+ else {
+ ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
+ }
}
for (;;) {
@@ -1422,11 +1434,12 @@ static bool ts_parser__advance(
case TSParseActionTypeReduce: {
bool is_fragile = table_entry.action_count > 1;
+ bool is_extra = lookahead.ptr == NULL;
LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count);
StackVersion reduction_version = ts_parser__reduce(
self, version, action.params.symbol, action.params.child_count,
action.params.dynamic_precedence, action.params.production_id,
- is_fragile
+ is_fragile, is_extra
);
if (reduction_version != STACK_VERSION_NONE) {
last_reduction_version = reduction_version;
@@ -1459,6 +1472,15 @@ static bool ts_parser__advance(
ts_stack_renumber_version(self->stack, last_reduction_version, version);
LOG_STACK();
state = ts_stack_state(self->stack, version);
+
+ // At the end of a non-terminal extra rule, the lexer will return a
+ // null subtree, because the parser needs to perform a fixed reduction
+ // regardless of the lookahead node. After performing that reduction,
+ // (and completing the non-terminal extra rule) run the lexer again based
+ // on the current parse state.
+ if (!lookahead.ptr) {
+ lookahead = ts_parser__lex(self, version, state);
+ }
ts_language_table_entry(
self->language,
state,
@@ -1655,6 +1677,7 @@ TSParser *ts_parser_new(void) {
void ts_parser_delete(TSParser *self) {
if (!self) return;
+ ts_parser_set_language(self, NULL);
ts_stack_delete(self->stack);
if (self->reduce_actions.contents) {
array_delete(&self->reduce_actions);
@@ -1670,7 +1693,6 @@ void ts_parser_delete(TSParser *self) {
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
ts_subtree_pool_delete(&self->tree_pool);
reusable_node_delete(&self->reusable_node);
- ts_parser_set_language(self, NULL);
ts_free(self);
}
@@ -1695,6 +1717,7 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language) {
}
self->language = language;
+ ts_parser_reset(self);
return true;
}
@@ -1747,7 +1770,7 @@ const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count)
}
void ts_parser_reset(TSParser *self) {
- if (self->language->external_scanner.deserialize) {
+ if (self->language && self->language->external_scanner.deserialize) {
self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0);
}
diff --git a/src/tree_sitter/parser.h b/src/tree_sitter/parser.h
index 974a7ca52f..9df91f8c3c 100644
--- a/src/tree_sitter/parser.h
+++ b/src/tree_sitter/parser.h
@@ -45,7 +45,8 @@ struct TSLexer {
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
- bool (*is_at_included_range_start)(TSLexer *);
+ bool (*is_at_included_range_start)(const TSLexer *);
+ bool (*eof)(const TSLexer *);
};
typedef enum {
@@ -117,6 +118,7 @@ struct TSLanguage {
uint32_t large_state_count;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
+ const TSSymbol *public_symbol_map;
};
/*
@@ -126,6 +128,7 @@ struct TSLanguage {
#define START_LEXER() \
bool result = false; \
bool skip = false; \
+ bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
diff --git a/src/tree_sitter/point.h b/src/tree_sitter/point.h
index 4d0aed18ef..a50d20214b 100644
--- a/src/tree_sitter/point.h
+++ b/src/tree_sitter/point.h
@@ -3,6 +3,7 @@
#include "tree_sitter/api.h"
+#define POINT_ZERO ((TSPoint) {0, 0})
#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX})
static inline TSPoint point__new(unsigned row, unsigned column) {
diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c
new file mode 100644
index 0000000000..2563325248
--- /dev/null
+++ b/src/tree_sitter/query.c
@@ -0,0 +1,1450 @@
+#include "tree_sitter/api.h"
+#include "./alloc.h"
+#include "./array.h"
+#include "./bits.h"
+#include "./language.h"
+#include "./point.h"
+#include "./tree_cursor.h"
+#include "./unicode.h"
+#include <wctype.h>
+
+/*
+ * Stream - A sequence of unicode characters derived from a UTF8 string.
+ * This struct is used in parsing queries from S-expressions.
+ */
+typedef struct {
+ const char *input;
+ const char *end;
+ int32_t next;
+ uint8_t next_size;
+} Stream;
+
+/*
+ * QueryStep - A step in the process of matching a query. Each node within
+ * a query S-expression maps to one of these steps. An entire pattern is
+ * represented as a sequence of these steps. Fields:
+ *
+ * - `symbol` - The grammar symbol to match. A zero value represents the
+ * wildcard symbol, '*'.
+ * - `field` - The field name to match. A zero value means that a field name
+ * was not specified.
+ * - `capture_id` - An integer representing the name of the capture associated
+ * with this node in the pattern. A `NONE` value means this node is not
+ * captured in this pattern.
+ * - `depth` - The depth where this node occurs in the pattern. The root node
+ * of the pattern has depth zero.
+ */
+typedef struct {
+ TSSymbol symbol;
+ TSFieldId field;
+ uint16_t capture_id;
+ uint16_t depth: 15;
+ bool contains_captures: 1;
+} QueryStep;
+
+/*
+ * Slice - A slice of an external array. Within a query, capture names,
+ * literal string values, and predicate step informations are stored in three
+ * contiguous arrays. Individual captures, string values, and predicates are
+ * represented as slices of these three arrays.
+ */
+typedef struct {
+ uint32_t offset;
+ uint32_t length;
+} Slice;
+
+/*
+ * SymbolTable - a two-way mapping of strings to ids.
+ */
+typedef struct {
+ Array(char) characters;
+ Array(Slice) slices;
+} SymbolTable;
+
+/*
+ * PatternEntry - The set of steps needed to match a particular pattern,
+ * represented as a slice of a shared array. These entries are stored in a
+ * 'pattern map' - a sorted array that makes it possible to efficiently lookup
+ * patterns based on the symbol for their first step.
+ */
+typedef struct {
+ uint16_t step_index;
+ uint16_t pattern_index;
+} PatternEntry;
+
+/*
+ * QueryState - The state of an in-progress match of a particular pattern
+ * in a query. While executing, a `TSQueryCursor` must keep track of a number
+ * of possible in-progress matches. Each of those possible matches is
+ * represented as one of these states.
+ */
+typedef struct {
+ uint16_t start_depth;
+ uint16_t pattern_index;
+ uint16_t step_index;
+ uint16_t capture_count;
+ uint16_t capture_list_id;
+ uint16_t consumed_capture_count;
+ uint32_t id;
+} QueryState;
+
+/*
+ * CaptureListPool - A collection of *lists* of captures. Each QueryState
+ * needs to maintain its own list of captures. They are all represented as
+ * slices of one shared array. The CaptureListPool keeps track of which
+ * parts of the shared array are currently in use by a QueryState.
+ */
+typedef struct {
+ Array(TSQueryCapture) list;
+ uint32_t usage_map;
+} CaptureListPool;
+
+/*
+ * TSQuery - A tree query, compiled from a string of S-expressions. The query
+ * itself is immutable. The mutable state used in the process of executing the
+ * query is stored in a `TSQueryCursor`.
+ */
+struct TSQuery {
+ SymbolTable captures;
+ SymbolTable predicate_values;
+ Array(QueryStep) steps;
+ Array(PatternEntry) pattern_map;
+ Array(TSQueryPredicateStep) predicate_steps;
+ Array(Slice) predicates_by_pattern;
+ Array(uint32_t) start_bytes_by_pattern;
+ const TSLanguage *language;
+ uint16_t max_capture_count;
+ uint16_t wildcard_root_pattern_count;
+ TSSymbol *symbol_map;
+};
+
+/*
+ * TSQueryCursor - A stateful struct used to execute a query on a tree.
+ */
+struct TSQueryCursor {
+ const TSQuery *query;
+ TSTreeCursor cursor;
+ Array(QueryState) states;
+ Array(QueryState) finished_states;
+ CaptureListPool capture_list_pool;
+ uint32_t depth;
+ uint32_t start_byte;
+ uint32_t end_byte;
+ uint32_t next_state_id;
+ TSPoint start_point;
+ TSPoint end_point;
+ bool ascending;
+};
+
+static const TSQueryError PARENT_DONE = -1;
+static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX;
+static const uint16_t NONE = UINT16_MAX;
+static const TSSymbol WILDCARD_SYMBOL = 0;
+static const uint16_t MAX_STATE_COUNT = 32;
+
+// #define LOG(...) fprintf(stderr, __VA_ARGS__)
+#define LOG(...)
+
+/**********
+ * Stream
+ **********/
+
+// Advance to the next unicode code point in the stream.
+static bool stream_advance(Stream *self) {
+ self->input += self->next_size;
+ if (self->input < self->end) {
+ uint32_t size = ts_decode_utf8(
+ (const uint8_t *)self->input,
+ self->end - self->input,
+ &self->next
+ );
+ if (size > 0) {
+ self->next_size = size;
+ return true;
+ }
+ } else {
+ self->next_size = 0;
+ self->next = '\0';
+ }
+ return false;
+}
+
+// Reset the stream to the given input position, represented as a pointer
+// into the input string.
+static void stream_reset(Stream *self, const char *input) {
+ self->input = input;
+ self->next_size = 0;
+ stream_advance(self);
+}
+
+static Stream stream_new(const char *string, uint32_t length) {
+ Stream self = {
+ .next = 0,
+ .input = string,
+ .end = string + length,
+ };
+ stream_advance(&self);
+ return self;
+}
+
+static void stream_skip_whitespace(Stream *stream) {
+ for (;;) {
+ if (iswspace(stream->next)) {
+ stream_advance(stream);
+ } else if (stream->next == ';') {
+ // skip over comments
+ stream_advance(stream);
+ while (stream->next && stream->next != '\n') {
+ if (!stream_advance(stream)) break;
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+static bool stream_is_ident_start(Stream *stream) {
+ return iswalnum(stream->next) || stream->next == '_' || stream->next == '-';
+}
+
+static void stream_scan_identifier(Stream *stream) {
+ do {
+ stream_advance(stream);
+ } while (
+ iswalnum(stream->next) ||
+ stream->next == '_' ||
+ stream->next == '-' ||
+ stream->next == '.' ||
+ stream->next == '?' ||
+ stream->next == '!'
+ );
+}
+
+/******************
+ * CaptureListPool
+ ******************/
+
+static CaptureListPool capture_list_pool_new(void) {
+ return (CaptureListPool) {
+ .list = array_new(),
+ .usage_map = UINT32_MAX,
+ };
+}
+
+static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) {
+ self->usage_map = UINT32_MAX;
+ uint32_t total_size = MAX_STATE_COUNT * list_size;
+ array_reserve(&self->list, total_size);
+ self->list.size = total_size;
+}
+
+static void capture_list_pool_delete(CaptureListPool *self) {
+ array_delete(&self->list);
+}
+
+static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) {
+ return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)];
+}
+
+static bool capture_list_pool_is_empty(const CaptureListPool *self) {
+ return self->usage_map == 0;
+}
+
+static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
+ // In the usage_map bitmask, ones represent free lists, and zeros represent
+ // lists that are in use. A free list id can quickly be found by counting
+ // the leading zeros in the usage map. An id of zero corresponds to the
+ // highest-order bit in the bitmask.
+ uint16_t id = count_leading_zeros(self->usage_map);
+ if (id == 32) return NONE;
+ self->usage_map &= ~bitmask_for_index(id);
+ return id;
+}
+
+static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
+ self->usage_map |= bitmask_for_index(id);
+}
+
+/**************
+ * SymbolTable
+ **************/
+
+static SymbolTable symbol_table_new(void) {
+ return (SymbolTable) {
+ .characters = array_new(),
+ .slices = array_new(),
+ };
+}
+
+static void symbol_table_delete(SymbolTable *self) {
+ array_delete(&self->characters);
+ array_delete(&self->slices);
+}
+
+static int symbol_table_id_for_name(
+ const SymbolTable *self,
+ const char *name,
+ uint32_t length
+) {
+ for (unsigned i = 0; i < self->slices.size; i++) {
+ Slice slice = self->slices.contents[i];
+ if (
+ slice.length == length &&
+ !strncmp(&self->characters.contents[slice.offset], name, length)
+ ) return i;
+ }
+ return -1;
+}
+
+static const char *symbol_table_name_for_id(
+ const SymbolTable *self,
+ uint16_t id,
+ uint32_t *length
+) {
+ Slice slice = self->slices.contents[id];
+ *length = slice.length;
+ return &self->characters.contents[slice.offset];
+}
+
+static uint16_t symbol_table_insert_name(
+ SymbolTable *self,
+ const char *name,
+ uint32_t length
+) {
+ int id = symbol_table_id_for_name(self, name, length);
+ if (id >= 0) return (uint16_t)id;
+ Slice slice = {
+ .offset = self->characters.size,
+ .length = length,
+ };
+ array_grow_by(&self->characters, length + 1);
+ memcpy(&self->characters.contents[slice.offset], name, length);
+ self->characters.contents[self->characters.size - 1] = 0;
+ array_push(&self->slices, slice);
+ return self->slices.size - 1;
+}
+
+/*********
+ * Query
+ *********/
+
+// The `pattern_map` contains a mapping from TSSymbol values to indices in the
+// `steps` array. For a given syntax node, the `pattern_map` makes it possible
+// to quickly find the starting steps of all of the patterns whose root matches
+// that node. Each entry has two fields: a `pattern_index`, which identifies one
+// of the patterns in the query, and a `step_index`, which indicates the start
+// offset of that pattern's steps pattern within the `steps` array.
+//
+// The entries are sorted by the patterns' root symbols, and lookups use a
+// binary search. This ensures that the cost of this initial lookup step
+// scales logarithmically with the number of patterns in the query.
+//
+// This returns `true` if the symbol is present and `false` otherwise.
+// If the symbol is not present `*result` is set to the index where the
+// symbol should be inserted.
+static inline bool ts_query__pattern_map_search(
+ const TSQuery *self,
+ TSSymbol needle,
+ uint32_t *result
+) {
+ uint32_t base_index = self->wildcard_root_pattern_count;
+ uint32_t size = self->pattern_map.size - base_index;
+ if (size == 0) {
+ *result = base_index;
+ return false;
+ }
+ while (size > 1) {
+ uint32_t half_size = size / 2;
+ uint32_t mid_index = base_index + half_size;
+ TSSymbol mid_symbol = self->steps.contents[
+ self->pattern_map.contents[mid_index].step_index
+ ].symbol;
+ if (needle > mid_symbol) base_index = mid_index;
+ size -= half_size;
+ }
+
+ TSSymbol symbol = self->steps.contents[
+ self->pattern_map.contents[base_index].step_index
+ ].symbol;
+
+ if (needle > symbol) {
+ base_index++;
+ if (base_index < self->pattern_map.size) {
+ symbol = self->steps.contents[
+ self->pattern_map.contents[base_index].step_index
+ ].symbol;
+ }
+ }
+
+ *result = base_index;
+ return needle == symbol;
+}
+
+// Insert a new pattern's start index into the pattern map, maintaining
+// the pattern map's ordering invariant.
+static inline void ts_query__pattern_map_insert(
+ TSQuery *self,
+ TSSymbol symbol,
+ uint32_t start_step_index
+) {
+ uint32_t index;
+ ts_query__pattern_map_search(self, symbol, &index);
+ array_insert(&self->pattern_map, index, ((PatternEntry) {
+ .step_index = start_step_index,
+ .pattern_index = self->pattern_map.size,
+ }));
+}
+
+static void ts_query__finalize_steps(TSQuery *self) {
+ for (unsigned i = 0; i < self->steps.size; i++) {
+ QueryStep *step = &self->steps.contents[i];
+ uint32_t depth = step->depth;
+ if (step->capture_id != NONE) {
+ step->contains_captures = true;
+ } else {
+ step->contains_captures = false;
+ for (unsigned j = i + 1; j < self->steps.size; j++) {
+ QueryStep *s = &self->steps.contents[j];
+ if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break;
+ if (s->capture_id != NONE) step->contains_captures = true;
+ }
+ }
+ }
+}
+
+// Parse a single predicate associated with a pattern, adding it to the
+// query's internal `predicate_steps` array. Predicates are arbitrary
+// S-expressions associated with a pattern which are meant to be handled at
+// a higher level of abstraction, such as the Rust/JavaScript bindings. They
+// can contain '@'-prefixed capture names, double-quoted strings, and bare
+// symbols, which also represent strings.
+static TSQueryError ts_query__parse_predicate(
+ TSQuery *self,
+ Stream *stream
+) {
+ if (stream->next == ')') return PARENT_DONE;
+ if (stream->next != '(') return TSQueryErrorSyntax;
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+
+ unsigned step_count = 0;
+ for (;;) {
+ if (stream->next == ')') {
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+ array_back(&self->predicates_by_pattern)->length++;
+ array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
+ .type = TSQueryPredicateStepTypeDone,
+ .value_id = 0,
+ }));
+ break;
+ }
+
+ // Parse an '@'-prefixed capture name
+ else if (stream->next == '@') {
+ stream_advance(stream);
+
+ // Parse the capture name
+ if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
+ const char *capture_name = stream->input;
+ stream_scan_identifier(stream);
+ uint32_t length = stream->input - capture_name;
+
+ // Add the capture id to the first step of the pattern
+ int capture_id = symbol_table_id_for_name(
+ &self->captures,
+ capture_name,
+ length
+ );
+ if (capture_id == -1) {
+ stream_reset(stream, capture_name);
+ return TSQueryErrorCapture;
+ }
+
+ array_back(&self->predicates_by_pattern)->length++;
+ array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
+ .type = TSQueryPredicateStepTypeCapture,
+ .value_id = capture_id,
+ }));
+ }
+
+ // Parse a string literal
+ else if (stream->next == '"') {
+ stream_advance(stream);
+
+ // Parse the string content
+ const char *string_content = stream->input;
+ while (stream->next != '"') {
+ if (stream->next == '\n' || !stream_advance(stream)) {
+ stream_reset(stream, string_content - 1);
+ return TSQueryErrorSyntax;
+ }
+ }
+ uint32_t length = stream->input - string_content;
+
+ // Add a step for the node
+ uint16_t id = symbol_table_insert_name(
+ &self->predicate_values,
+ string_content,
+ length
+ );
+ array_back(&self->predicates_by_pattern)->length++;
+ array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
+ .type = TSQueryPredicateStepTypeString,
+ .value_id = id,
+ }));
+
+ if (stream->next != '"') return TSQueryErrorSyntax;
+ stream_advance(stream);
+ }
+
+ // Parse a bare symbol
+ else if (stream_is_ident_start(stream)) {
+ const char *symbol_start = stream->input;
+ stream_scan_identifier(stream);
+ uint32_t length = stream->input - symbol_start;
+ uint16_t id = symbol_table_insert_name(
+ &self->predicate_values,
+ symbol_start,
+ length
+ );
+ array_back(&self->predicates_by_pattern)->length++;
+ array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
+ .type = TSQueryPredicateStepTypeString,
+ .value_id = id,
+ }));
+ }
+
+ else {
+ return TSQueryErrorSyntax;
+ }
+
+ step_count++;
+ stream_skip_whitespace(stream);
+ }
+
+ return 0;
+}
+
+// Read one S-expression pattern from the stream, and incorporate it into
+// the query's internal state machine representation. For nested patterns,
+// this function calls itself recursively.
+static TSQueryError ts_query__parse_pattern(
+ TSQuery *self,
+ Stream *stream,
+ uint32_t depth,
+ uint32_t *capture_count
+) {
+ uint16_t starting_step_index = self->steps.size;
+
+ if (stream->next == 0) return TSQueryErrorSyntax;
+
+ // Finish the parent S-expression
+ if (stream->next == ')') {
+ return PARENT_DONE;
+ }
+
+ // Parse a parenthesized node expression
+ else if (stream->next == '(') {
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+
+ // Parse a nested list, which represents a pattern followed by
+ // zero-or-more predicates.
+ if (stream->next == '(' && depth == 0) {
+ TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count);
+ if (e) return e;
+
+ // Parse the predicates.
+ stream_skip_whitespace(stream);
+ for (;;) {
+ TSQueryError e = ts_query__parse_predicate(self, stream);
+ if (e == PARENT_DONE) {
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+ return 0;
+ } else if (e) {
+ return e;
+ }
+ }
+ }
+
+ TSSymbol symbol;
+
+ // Parse the wildcard symbol
+ if (stream->next == '*') {
+ symbol = WILDCARD_SYMBOL;
+ stream_advance(stream);
+ }
+
+ // Parse a normal node name
+ else if (stream_is_ident_start(stream)) {
+ const char *node_name = stream->input;
+ stream_scan_identifier(stream);
+ uint32_t length = stream->input - node_name;
+ symbol = ts_language_symbol_for_name(
+ self->language,
+ node_name,
+ length,
+ true
+ );
+ if (!symbol) {
+ stream_reset(stream, node_name);
+ return TSQueryErrorNodeType;
+ }
+ } else {
+ return TSQueryErrorSyntax;
+ }
+
+ // Add a step for the node.
+ array_push(&self->steps, ((QueryStep) {
+ .depth = depth,
+ .symbol = symbol,
+ .field = 0,
+ .capture_id = NONE,
+ .contains_captures = false,
+ }));
+
+ // Parse the child patterns
+ stream_skip_whitespace(stream);
+ for (;;) {
+ TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count);
+ if (e == PARENT_DONE) {
+ stream_advance(stream);
+ break;
+ } else if (e) {
+ return e;
+ }
+ }
+ }
+
+ // Parse a double-quoted anonymous leaf node expression
+ else if (stream->next == '"') {
+ stream_advance(stream);
+
+ // Parse the string content
+ const char *string_content = stream->input;
+ while (stream->next != '"') {
+ if (!stream_advance(stream)) {
+ stream_reset(stream, string_content - 1);
+ return TSQueryErrorSyntax;
+ }
+ }
+ uint32_t length = stream->input - string_content;
+
+ // Add a step for the node
+ TSSymbol symbol = ts_language_symbol_for_name(
+ self->language,
+ string_content,
+ length,
+ false
+ );
+ if (!symbol) {
+ stream_reset(stream, string_content);
+ return TSQueryErrorNodeType;
+ }
+ array_push(&self->steps, ((QueryStep) {
+ .depth = depth,
+ .symbol = symbol,
+ .field = 0,
+ .capture_id = NONE,
+ .contains_captures = false,
+ }));
+
+ if (stream->next != '"') return TSQueryErrorSyntax;
+ stream_advance(stream);
+ }
+
+ // Parse a field-prefixed pattern
+ else if (stream_is_ident_start(stream)) {
+ // Parse the field name
+ const char *field_name = stream->input;
+ stream_scan_identifier(stream);
+ uint32_t length = stream->input - field_name;
+ stream_skip_whitespace(stream);
+
+ if (stream->next != ':') {
+ stream_reset(stream, field_name);
+ return TSQueryErrorSyntax;
+ }
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+
+ // Parse the pattern
+ uint32_t step_index = self->steps.size;
+ TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count);
+ if (e == PARENT_DONE) return TSQueryErrorSyntax;
+ if (e) return e;
+
+ // Add the field name to the first step of the pattern
+ TSFieldId field_id = ts_language_field_id_for_name(
+ self->language,
+ field_name,
+ length
+ );
+ if (!field_id) {
+ stream->input = field_name;
+ return TSQueryErrorField;
+ }
+ self->steps.contents[step_index].field = field_id;
+ }
+
+ // Parse a wildcard pattern
+ else if (stream->next == '*') {
+ stream_advance(stream);
+ stream_skip_whitespace(stream);
+
+ // Add a step that matches any kind of node
+ array_push(&self->steps, ((QueryStep) {
+ .depth = depth,
+ .symbol = WILDCARD_SYMBOL,
+ .field = 0,
+ .contains_captures = false,
+ }));
+ }
+
+ else {
+ return TSQueryErrorSyntax;
+ }
+
+ stream_skip_whitespace(stream);
+
+ // Parse an '@'-prefixed capture pattern
+ if (stream->next == '@') {
+ stream_advance(stream);
+
+ // Parse the capture name
+ if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
+ const char *capture_name = stream->input;
+ stream_scan_identifier(stream);
+ uint32_t length = stream->input - capture_name;
+
+ // Add the capture id to the first step of the pattern
+ uint16_t capture_id = symbol_table_insert_name(
+ &self->captures,
+ capture_name,
+ length
+ );
+ self->steps.contents[starting_step_index].capture_id = capture_id;
+ (*capture_count)++;
+
+ stream_skip_whitespace(stream);
+ }
+
+ return 0;
+}
+
+TSQuery *ts_query_new(
+ const TSLanguage *language,
+ const char *source,
+ uint32_t source_len,
+ uint32_t *error_offset,
+ TSQueryError *error_type
+) {
+ TSSymbol *symbol_map;
+ if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) {
+ symbol_map = NULL;
+ } else {
+ // Work around the fact that multiple symbols can currently be
+ // associated with the same name, due to "simple aliases".
+ // In the next language ABI version, this map will be contained
+ // in the language's `public_symbol_map` field.
+ uint32_t symbol_count = ts_language_symbol_count(language);
+ symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count);
+ for (unsigned i = 0; i < symbol_count; i++) {
+ const char *name = ts_language_symbol_name(language, i);
+ const TSSymbolType symbol_type = ts_language_symbol_type(language, i);
+
+ symbol_map[i] = i;
+
+ for (unsigned j = 0; j < i; j++) {
+ if (ts_language_symbol_type(language, j) == symbol_type) {
+ if (!strcmp(name, ts_language_symbol_name(language, j))) {
+ symbol_map[i] = j;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ TSQuery *self = ts_malloc(sizeof(TSQuery));
+ *self = (TSQuery) {
+ .steps = array_new(),
+ .pattern_map = array_new(),
+ .captures = symbol_table_new(),
+ .predicate_values = symbol_table_new(),
+ .predicate_steps = array_new(),
+ .predicates_by_pattern = array_new(),
+ .symbol_map = symbol_map,
+ .wildcard_root_pattern_count = 0,
+ .max_capture_count = 0,
+ .language = language,
+ };
+
+ // Parse all of the S-expressions in the given string.
+ Stream stream = stream_new(source, source_len);
+ stream_skip_whitespace(&stream);
+ uint32_t start_step_index;
+ while (stream.input < stream.end) {
+ start_step_index = self->steps.size;
+ uint32_t capture_count = 0;
+ array_push(&self->start_bytes_by_pattern, stream.input - source);
+ array_push(&self->predicates_by_pattern, ((Slice) {
+ .offset = self->predicate_steps.size,
+ .length = 0,
+ }));
+ *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count);
+ array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER }));
+
+ // If any pattern could not be parsed, then report the error information
+ // and terminate.
+ if (*error_type) {
+ *error_offset = stream.input - source;
+ ts_query_delete(self);
+ return NULL;
+ }
+
+ // Maintain a map that can look up patterns for a given root symbol.
+ ts_query__pattern_map_insert(
+ self,
+ self->steps.contents[start_step_index].symbol,
+ start_step_index
+ );
+ if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) {
+ self->wildcard_root_pattern_count++;
+ }
+
+ // Keep track of the maximum number of captures in pattern, because
+ // that numer determines how much space is needed to store each capture
+ // list.
+ if (capture_count > self->max_capture_count) {
+ self->max_capture_count = capture_count;
+ }
+ }
+
+ ts_query__finalize_steps(self);
+ return self;
+}
+
+void ts_query_delete(TSQuery *self) {
+ if (self) {
+ array_delete(&self->steps);
+ array_delete(&self->pattern_map);
+ array_delete(&self->predicate_steps);
+ array_delete(&self->predicates_by_pattern);
+ array_delete(&self->start_bytes_by_pattern);
+ symbol_table_delete(&self->captures);
+ symbol_table_delete(&self->predicate_values);
+ ts_free(self->symbol_map);
+ ts_free(self);
+ }
+}
+
+uint32_t ts_query_pattern_count(const TSQuery *self) {
+ return self->predicates_by_pattern.size;
+}
+
+uint32_t ts_query_capture_count(const TSQuery *self) {
+ return self->captures.slices.size;
+}
+
+uint32_t ts_query_string_count(const TSQuery *self) {
+ return self->predicate_values.slices.size;
+}
+
+const char *ts_query_capture_name_for_id(
+ const TSQuery *self,
+ uint32_t index,
+ uint32_t *length
+) {
+ return symbol_table_name_for_id(&self->captures, index, length);
+}
+
+const char *ts_query_string_value_for_id(
+ const TSQuery *self,
+ uint32_t index,
+ uint32_t *length
+) {
+ return symbol_table_name_for_id(&self->predicate_values, index, length);
+}
+
+const TSQueryPredicateStep *ts_query_predicates_for_pattern(
+ const TSQuery *self,
+ uint32_t pattern_index,
+ uint32_t *step_count
+) {
+ Slice slice = self->predicates_by_pattern.contents[pattern_index];
+ *step_count = slice.length;
+ return &self->predicate_steps.contents[slice.offset];
+}
+
+uint32_t ts_query_start_byte_for_pattern(
+ const TSQuery *self,
+ uint32_t pattern_index
+) {
+ return self->start_bytes_by_pattern.contents[pattern_index];
+}
+
+void ts_query_disable_capture(
+ TSQuery *self,
+ const char *name,
+ uint32_t length
+) {
+ int id = symbol_table_id_for_name(&self->captures, name, length);
+ if (id != -1) {
+ for (unsigned i = 0; i < self->steps.size; i++) {
+ QueryStep *step = &self->steps.contents[i];
+ if (step->capture_id == id) {
+ step->capture_id = NONE;
+ }
+ }
+ }
+ ts_query__finalize_steps(self);
+}
+
+/***************
+ * QueryCursor
+ ***************/
+
+TSQueryCursor *ts_query_cursor_new() {
+ TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
+ *self = (TSQueryCursor) {
+ .ascending = false,
+ .states = array_new(),
+ .finished_states = array_new(),
+ .capture_list_pool = capture_list_pool_new(),
+ .start_byte = 0,
+ .end_byte = UINT32_MAX,
+ .start_point = {0, 0},
+ .end_point = POINT_MAX,
+ };
+ array_reserve(&self->states, MAX_STATE_COUNT);
+ array_reserve(&self->finished_states, MAX_STATE_COUNT);
+ return self;
+}
+
+void ts_query_cursor_delete(TSQueryCursor *self) {
+ array_delete(&self->states);
+ array_delete(&self->finished_states);
+ ts_tree_cursor_delete(&self->cursor);
+ capture_list_pool_delete(&self->capture_list_pool);
+ ts_free(self);
+}
+
+void ts_query_cursor_exec(
+ TSQueryCursor *self,
+ const TSQuery *query,
+ TSNode node
+) {
+ array_clear(&self->states);
+ array_clear(&self->finished_states);
+ ts_tree_cursor_reset(&self->cursor, node);
+ capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count);
+ self->next_state_id = 0;
+ self->depth = 0;
+ self->ascending = false;
+ self->query = query;
+}
+
+void ts_query_cursor_set_byte_range(
+ TSQueryCursor *self,
+ uint32_t start_byte,
+ uint32_t end_byte
+) {
+ if (end_byte == 0) {
+ start_byte = 0;
+ end_byte = UINT32_MAX;
+ }
+ self->start_byte = start_byte;
+ self->end_byte = end_byte;
+}
+
+void ts_query_cursor_set_point_range(
+ TSQueryCursor *self,
+ TSPoint start_point,
+ TSPoint end_point
+) {
+ if (end_point.row == 0 && end_point.column == 0) {
+ start_point = POINT_ZERO;
+ end_point = POINT_MAX;
+ }
+ self->start_point = start_point;
+ self->end_point = end_point;
+}
+
+// Search through all of the in-progress states, and find the captured
+// node that occurs earliest in the document.
+static bool ts_query_cursor__first_in_progress_capture(
+ TSQueryCursor *self,
+ uint32_t *state_index,
+ uint32_t *byte_offset,
+ uint32_t *pattern_index
+) {
+ bool result = false;
+ for (unsigned i = 0; i < self->states.size; i++) {
+ const QueryState *state = &self->states.contents[i];
+ if (state->capture_count > 0) {
+ const TSQueryCapture *captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ uint32_t capture_byte = ts_node_start_byte(captures[0].node);
+ if (
+ !result ||
+ capture_byte < *byte_offset ||
+ (
+ capture_byte == *byte_offset &&
+ state->pattern_index < *pattern_index
+ )
+ ) {
+ result = true;
+ *state_index = i;
+ *byte_offset = capture_byte;
+ *pattern_index = state->pattern_index;
+ }
+ }
+ }
+ return result;
+}
+
+static bool ts_query__cursor_add_state(
+ TSQueryCursor *self,
+ const PatternEntry *slice
+) {
+ uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool);
+
+ // If there are no capture lists left in the pool, then terminate whichever
+ // state has captured the earliest node in the document, and steal its
+ // capture list.
+ if (list_id == NONE) {
+ uint32_t state_index, byte_offset, pattern_index;
+ if (ts_query_cursor__first_in_progress_capture(
+ self,
+ &state_index,
+ &byte_offset,
+ &pattern_index
+ )) {
+ LOG(
+ " abandon state. index:%u, pattern:%u, offset:%u.\n",
+ state_index, pattern_index, byte_offset
+ );
+ list_id = self->states.contents[state_index].capture_list_id;
+ array_erase(&self->states, state_index);
+ } else {
+ LOG(" too many finished states.\n");
+ return false;
+ }
+ }
+
+ LOG(" start state. pattern:%u\n", slice->pattern_index);
+ array_push(&self->states, ((QueryState) {
+ .capture_list_id = list_id,
+ .step_index = slice->step_index,
+ .pattern_index = slice->pattern_index,
+ .start_depth = self->depth,
+ .capture_count = 0,
+ .consumed_capture_count = 0,
+ }));
+ return true;
+}
+
+static QueryState *ts_query__cursor_copy_state(
+ TSQueryCursor *self,
+ const QueryState *state
+) {
+ uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool);
+ if (new_list_id == NONE) return NULL;
+ array_push(&self->states, *state);
+ QueryState *new_state = array_back(&self->states);
+ new_state->capture_list_id = new_list_id;
+ TSQueryCapture *old_captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ TSQueryCapture *new_captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ new_list_id
+ );
+ memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture));
+ return new_state;
+}
+
+// Walk the tree, processing patterns until at least one pattern finishes,
+// If one or more patterns finish, return `true` and store their states in the
+// `finished_states` array. Multiple patterns can finish on the same node. If
+// there are no more matches, return `false`.
+static inline bool ts_query_cursor__advance(TSQueryCursor *self) {
+ do {
+ if (self->ascending) {
+ LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor)));
+
+ // When leaving a node, remove any unfinished states whose next step
+ // needed to match something within that node.
+ uint32_t deleted_count = 0;
+ for (unsigned i = 0, n = self->states.size; i < n; i++) {
+ QueryState *state = &self->states.contents[i];
+ QueryStep *step = &self->query->steps.contents[state->step_index];
+
+ if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) {
+ LOG(
+ " failed to match. pattern:%u, step:%u\n",
+ state->pattern_index,
+ state->step_index
+ );
+
+ capture_list_pool_release(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ deleted_count++;
+ } else if (deleted_count > 0) {
+ self->states.contents[i - deleted_count] = *state;
+ }
+ }
+
+ self->states.size -= deleted_count;
+
+ if (ts_tree_cursor_goto_next_sibling(&self->cursor)) {
+ self->ascending = false;
+ } else if (ts_tree_cursor_goto_parent(&self->cursor)) {
+ self->depth--;
+ } else {
+ return self->finished_states.size > 0;
+ }
+ } else {
+ bool can_have_later_siblings;
+ bool can_have_later_siblings_with_this_field;
+ TSFieldId field_id = ts_tree_cursor_current_status(
+ &self->cursor,
+ &can_have_later_siblings,
+ &can_have_later_siblings_with_this_field
+ );
+ TSNode node = ts_tree_cursor_current_node(&self->cursor);
+ TSSymbol symbol = ts_node_symbol(node);
+ if (symbol != ts_builtin_sym_error && self->query->symbol_map) {
+ symbol = self->query->symbol_map[symbol];
+ }
+
+ // If this node is before the selected range, then avoid descending
+ // into it.
+ if (
+ ts_node_end_byte(node) <= self->start_byte ||
+ point_lte(ts_node_end_point(node), self->start_point)
+ ) {
+ if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) {
+ self->ascending = true;
+ }
+ continue;
+ }
+
+ // If this node is after the selected range, then stop walking.
+ if (
+ self->end_byte <= ts_node_start_byte(node) ||
+ point_lte(self->end_point, ts_node_start_point(node))
+ ) return false;
+
+ LOG(
+ "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n",
+ ts_node_type(node),
+ ts_language_field_name_for_id(self->query->language, field_id),
+ ts_node_start_point(node).row,
+ self->states.size,
+ self->finished_states.size,
+ can_have_later_siblings,
+ can_have_later_siblings_with_this_field
+ );
+
+ // Add new states for any patterns whose root node is a wildcard.
+ for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) {
+ PatternEntry *slice = &self->query->pattern_map.contents[i];
+ QueryStep *step = &self->query->steps.contents[slice->step_index];
+
+ // If this node matches the first step of the pattern, then add a new
+ // state at the start of this pattern.
+ if (step->field && field_id != step->field) continue;
+ if (!ts_query__cursor_add_state(self, slice)) break;
+ }
+
+ // Add new states for any patterns whose root node matches this node.
+ unsigned i;
+ if (ts_query__pattern_map_search(self->query, symbol, &i)) {
+ PatternEntry *slice = &self->query->pattern_map.contents[i];
+ QueryStep *step = &self->query->steps.contents[slice->step_index];
+ do {
+ // If this node matches the first step of the pattern, then add a new
+ // state at the start of this pattern.
+ if (step->field && field_id != step->field) continue;
+ if (!ts_query__cursor_add_state(self, slice)) break;
+
+ // Advance to the next pattern whose root node matches this node.
+ i++;
+ if (i == self->query->pattern_map.size) break;
+ slice = &self->query->pattern_map.contents[i];
+ step = &self->query->steps.contents[slice->step_index];
+ } while (step->symbol == symbol);
+ }
+
+ // Update all of the in-progress states with current node.
+ for (unsigned i = 0, n = self->states.size; i < n; i++) {
+ QueryState *state = &self->states.contents[i];
+ QueryStep *step = &self->query->steps.contents[state->step_index];
+
+ // Check that the node matches all of the criteria for the next
+ // step of the pattern.if (
+ if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue;
+
+ // Determine if this node matches this step of the pattern, and also
+ // if this node can have later siblings that match this step of the
+ // pattern.
+ bool node_does_match = !step->symbol || step->symbol == symbol;
+ bool later_sibling_can_match = can_have_later_siblings;
+ if (step->field) {
+ if (step->field == field_id) {
+ if (!can_have_later_siblings_with_this_field) {
+ later_sibling_can_match = false;
+ }
+ } else {
+ node_does_match = false;
+ }
+ }
+
+ if (!node_does_match) {
+ if (!later_sibling_can_match) {
+ LOG(
+ " discard state. pattern:%u, step:%u\n",
+ state->pattern_index,
+ state->step_index
+ );
+ capture_list_pool_release(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ array_erase(&self->states, i);
+ i--;
+ n--;
+ }
+ continue;
+ }
+
+ // Some patterns can match their root node in multiple ways,
+ // capturing different children. If this pattern step could match
+ // later children within the same parent, then this query state
+ // cannot simply be updated in place. It must be split into two
+ // states: one that matches this node, and one which skips over
+ // this node, to preserve the possibility of matching later
+ // siblings.
+ QueryState *next_state = state;
+ if (
+ step->depth > 0 &&
+ step->contains_captures &&
+ later_sibling_can_match
+ ) {
+ QueryState *copy = ts_query__cursor_copy_state(self, state);
+ if (copy) {
+ LOG(
+ " split state. pattern:%u, step:%u\n",
+ copy->pattern_index,
+ copy->step_index
+ );
+ next_state = copy;
+ } else {
+ LOG(" canot split state.\n");
+ }
+ }
+
+ LOG(
+ " advance state. pattern:%u, step:%u\n",
+ next_state->pattern_index,
+ next_state->step_index
+ );
+
+ // If the current node is captured in this pattern, add it to the
+ // capture list.
+ if (step->capture_id != NONE) {
+ LOG(
+ " capture node. pattern:%u, capture_id:%u\n",
+ next_state->pattern_index,
+ step->capture_id
+ );
+ TSQueryCapture *capture_list = capture_list_pool_get(
+ &self->capture_list_pool,
+ next_state->capture_list_id
+ );
+ capture_list[next_state->capture_count++] = (TSQueryCapture) {
+ node,
+ step->capture_id
+ };
+ }
+
+ // If the pattern is now done, then remove it from the list of
+ // in-progress states, and add it to the list of finished states.
+ next_state->step_index++;
+ QueryStep *next_step = step + 1;
+ if (next_step->depth == PATTERN_DONE_MARKER) {
+ LOG(" finish pattern %u\n", next_state->pattern_index);
+
+ next_state->id = self->next_state_id++;
+ array_push(&self->finished_states, *next_state);
+ if (next_state == state) {
+ array_erase(&self->states, i);
+ i--;
+ n--;
+ } else {
+ self->states.size--;
+ }
+ }
+ }
+
+ // Continue descending if possible.
+ if (ts_tree_cursor_goto_first_child(&self->cursor)) {
+ self->depth++;
+ } else {
+ self->ascending = true;
+ }
+ }
+ } while (self->finished_states.size == 0);
+
+ return true;
+}
+
+bool ts_query_cursor_next_match(
+ TSQueryCursor *self,
+ TSQueryMatch *match
+) {
+ if (self->finished_states.size == 0) {
+ if (!ts_query_cursor__advance(self)) {
+ return false;
+ }
+ }
+
+ QueryState *state = &self->finished_states.contents[0];
+ match->id = state->id;
+ match->pattern_index = state->pattern_index;
+ match->capture_count = state->capture_count;
+ match->captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
+ array_erase(&self->finished_states, 0);
+ return true;
+}
+
+void ts_query_cursor_remove_match(
+ TSQueryCursor *self,
+ uint32_t match_id
+) {
+ for (unsigned i = 0; i < self->finished_states.size; i++) {
+ const QueryState *state = &self->finished_states.contents[i];
+ if (state->id == match_id) {
+ capture_list_pool_release(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ array_erase(&self->finished_states, i);
+ return;
+ }
+ }
+}
+
+bool ts_query_cursor_next_capture(
+ TSQueryCursor *self,
+ TSQueryMatch *match,
+ uint32_t *capture_index
+) {
+ for (;;) {
+ // The goal here is to return captures in order, even though they may not
+ // be discovered in order, because patterns can overlap. If there are any
+ // finished patterns, then try to find one that contains a capture that
+ // is *definitely* before any capture in an *unfinished* pattern.
+ if (self->finished_states.size > 0) {
+ // First, identify the position of the earliest capture in an unfinished
+ // match. For a finished capture to be returned, it must be *before*
+ // this position.
+ uint32_t first_unfinished_capture_byte = UINT32_MAX;
+ uint32_t first_unfinished_pattern_index = UINT32_MAX;
+ uint32_t first_unfinished_state_index;
+ ts_query_cursor__first_in_progress_capture(
+ self,
+ &first_unfinished_state_index,
+ &first_unfinished_capture_byte,
+ &first_unfinished_pattern_index
+ );
+
+ // Find the earliest capture in a finished match.
+ int first_finished_state_index = -1;
+ uint32_t first_finished_capture_byte = first_unfinished_capture_byte;
+ uint32_t first_finished_pattern_index = first_unfinished_pattern_index;
+ for (unsigned i = 0; i < self->finished_states.size; i++) {
+ const QueryState *state = &self->finished_states.contents[i];
+ if (state->capture_count > state->consumed_capture_count) {
+ const TSQueryCapture *captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ uint32_t capture_byte = ts_node_start_byte(
+ captures[state->consumed_capture_count].node
+ );
+ if (
+ capture_byte < first_finished_capture_byte ||
+ (
+ capture_byte == first_finished_capture_byte &&
+ state->pattern_index < first_finished_pattern_index
+ )
+ ) {
+ first_finished_state_index = i;
+ first_finished_capture_byte = capture_byte;
+ first_finished_pattern_index = state->pattern_index;
+ }
+ } else {
+ capture_list_pool_release(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ array_erase(&self->finished_states, i);
+ i--;
+ }
+ }
+
+ // If there is finished capture that is clearly before any unfinished
+ // capture, then return its match, and its capture index. Internally
+ // record the fact that the capture has been 'consumed'.
+ if (first_finished_state_index != -1) {
+ QueryState *state = &self->finished_states.contents[
+ first_finished_state_index
+ ];
+ match->id = state->id;
+ match->pattern_index = state->pattern_index;
+ match->capture_count = state->capture_count;
+ match->captures = capture_list_pool_get(
+ &self->capture_list_pool,
+ state->capture_list_id
+ );
+ *capture_index = state->consumed_capture_count;
+ state->consumed_capture_count++;
+ return true;
+ }
+
+ if (capture_list_pool_is_empty(&self->capture_list_pool)) {
+ LOG(
+ " abandon state. index:%u, pattern:%u, offset:%u.\n",
+ first_unfinished_state_index,
+ first_unfinished_pattern_index,
+ first_unfinished_capture_byte
+ );
+ capture_list_pool_release(
+ &self->capture_list_pool,
+ self->states.contents[first_unfinished_state_index].capture_list_id
+ );
+ array_erase(&self->states, first_unfinished_state_index);
+ }
+ }
+
+ // If there are no finished matches that are ready to be returned, then
+ // continue finding more matches.
+ if (!ts_query_cursor__advance(self)) return false;
+ }
+}
+
+#undef LOG
diff --git a/src/tree_sitter/subtree.c b/src/tree_sitter/subtree.c
index e95733eb46..30144fa175 100644
--- a/src/tree_sitter/subtree.c
+++ b/src/tree_sitter/subtree.c
@@ -18,18 +18,9 @@ typedef struct {
Length new_end;
} Edit;
-#ifdef TREE_SITTER_TEST
-
-#define TS_MAX_INLINE_TREE_LENGTH 2
-#define TS_MAX_TREE_POOL_SIZE 0
-
-#else
-
#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX
#define TS_MAX_TREE_POOL_SIZE 32
-#endif
-
static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}};
// ExternalScannerState
@@ -775,10 +766,10 @@ Subtree ts_subtree_last_external_token(Subtree tree) {
}
static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) {
- if (c == 0)
- return snprintf(s, n, "EOF");
if (c == -1)
return snprintf(s, n, "INVALID");
+ else if (c == '\0')
+ return snprintf(s, n, "'\\0'");
else if (c == '\n')
return snprintf(s, n, "'\\n'");
else if (c == '\t')
diff --git a/src/tree_sitter/tree.c b/src/tree_sitter/tree.c
index 04cb1d242f..391fa7f592 100644
--- a/src/tree_sitter/tree.c
+++ b/src/tree_sitter/tree.c
@@ -84,12 +84,10 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) {
}
TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) {
- TSRange *result;
TreeCursor cursor1 = {NULL, array_new()};
TreeCursor cursor2 = {NULL, array_new()};
- TSNode root = ts_tree_root_node(self);
- ts_tree_cursor_init(&cursor1, root);
- ts_tree_cursor_init(&cursor2, root);
+ ts_tree_cursor_init(&cursor1, ts_tree_root_node(self));
+ ts_tree_cursor_init(&cursor2, ts_tree_root_node(other));
TSRangeArray included_range_differences = array_new();
ts_range_array_get_changed_ranges(
@@ -98,6 +96,7 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin
&included_range_differences
);
+ TSRange *result;
*count = ts_subtree_get_changed_ranges(
&self->root, &other->root, &cursor1, &cursor2,
self->language, &included_range_differences, &result
diff --git a/src/tree_sitter/tree_cursor.c b/src/tree_sitter/tree_cursor.c
index 7103fc411d..00b9679d73 100644
--- a/src/tree_sitter/tree_cursor.c
+++ b/src/tree_sitter/tree_cursor.c
@@ -244,6 +244,72 @@ TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) {
);
}
+TSFieldId ts_tree_cursor_current_status(
+ const TSTreeCursor *_self,
+ bool *can_have_later_siblings,
+ bool *can_have_later_siblings_with_this_field
+) {
+ const TreeCursor *self = (const TreeCursor *)_self;
+ TSFieldId result = 0;
+ *can_have_later_siblings = false;
+ *can_have_later_siblings_with_this_field = false;
+
+ // Walk up the tree, visiting the current node and its invisible ancestors,
+ // because fields can refer to nodes through invisible *wrapper* nodes,
+ for (unsigned i = self->stack.size - 1; i > 0; i--) {
+ TreeCursorEntry *entry = &self->stack.contents[i];
+ TreeCursorEntry *parent_entry = &self->stack.contents[i - 1];
+
+ // Stop walking up when a visible ancestor is found.
+ if (i != self->stack.size - 1) {
+ if (ts_subtree_visible(*entry->subtree)) break;
+ const TSSymbol *alias_sequence = ts_language_alias_sequence(
+ self->tree->language,
+ parent_entry->subtree->ptr->production_id
+ );
+ if (alias_sequence && alias_sequence[entry->structural_child_index]) {
+ break;
+ }
+ }
+
+ if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) {
+ *can_have_later_siblings = true;
+ }
+
+ if (ts_subtree_extra(*entry->subtree)) break;
+
+ const TSFieldMapEntry *field_map, *field_map_end;
+ ts_language_field_map(
+ self->tree->language,
+ parent_entry->subtree->ptr->production_id,
+ &field_map, &field_map_end
+ );
+
+ // Look for a field name associated with the current node.
+ if (!result) {
+ for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
+ if (!i->inherited && i->child_index == entry->structural_child_index) {
+ result = i->field_id;
+ *can_have_later_siblings_with_this_field = false;
+ break;
+ }
+ }
+ }
+
+ // Determine if there other later siblings with the same field name.
+ if (result) {
+ for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
+ if (i->field_id == result && i->child_index > entry->structural_child_index) {
+ *can_have_later_siblings_with_this_field = true;
+ break;
+ }
+ }
+ }
+ }
+
+ return result;
+}
+
TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
const TreeCursor *self = (const TreeCursor *)_self;
@@ -264,19 +330,18 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) {
}
}
+ if (ts_subtree_extra(*entry->subtree)) break;
+
const TSFieldMapEntry *field_map, *field_map_end;
ts_language_field_map(
self->tree->language,
parent_entry->subtree->ptr->production_id,
&field_map, &field_map_end
);
-
- while (field_map < field_map_end) {
- if (
- !field_map->inherited &&
- field_map->child_index == entry->structural_child_index
- ) return field_map->field_id;
- field_map++;
+ for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) {
+ if (!i->inherited && i->child_index == entry->structural_child_index) {
+ return i->field_id;
+ }
}
}
return 0;
diff --git a/src/tree_sitter/tree_cursor.h b/src/tree_sitter/tree_cursor.h
index 55bdad86da..5a39dd278c 100644
--- a/src/tree_sitter/tree_cursor.h
+++ b/src/tree_sitter/tree_cursor.h
@@ -16,5 +16,6 @@ typedef struct {
} TreeCursor;
void ts_tree_cursor_init(TreeCursor *, TSNode);
+TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *);
#endif // TREE_SITTER_TREE_CURSOR_H_
diff --git a/src/tree_sitter/unicode.h b/src/tree_sitter/unicode.h
new file mode 100644
index 0000000000..2ab51c2a3a
--- /dev/null
+++ b/src/tree_sitter/unicode.h
@@ -0,0 +1,50 @@
+#ifndef TREE_SITTER_UNICODE_H_
+#define TREE_SITTER_UNICODE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+#include <stdint.h>
+
+#define U_EXPORT
+#define U_EXPORT2
+#include "./unicode/utf8.h"
+#include "./unicode/utf16.h"
+
+static const int32_t TS_DECODE_ERROR = U_SENTINEL;
+
+// These functions read one unicode code point from the given string,
+// returning the number of bytes consumed.
+typedef uint32_t (*UnicodeDecodeFunction)(
+ const uint8_t *string,
+ uint32_t length,
+ int32_t *code_point
+);
+
+static inline uint32_t ts_decode_utf8(
+ const uint8_t *string,
+ uint32_t length,
+ int32_t *code_point
+) {
+ uint32_t i = 0;
+ U8_NEXT(string, i, length, *code_point);
+ return i;
+}
+
+static inline uint32_t ts_decode_utf16(
+ const uint8_t *string,
+ uint32_t length,
+ int32_t *code_point
+) {
+ uint32_t i = 0;
+ U16_NEXT(((uint16_t *)string), i, length, *code_point);
+ return i * 2;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TREE_SITTER_UNICODE_H_
diff --git a/src/tree_sitter/unicode/ICU_SHA b/src/tree_sitter/unicode/ICU_SHA
new file mode 100644
index 0000000000..3622283ba3
--- /dev/null
+++ b/src/tree_sitter/unicode/ICU_SHA
@@ -0,0 +1 @@
+552b01f61127d30d6589aa4bf99468224979b661
diff --git a/src/tree_sitter/unicode/LICENSE b/src/tree_sitter/unicode/LICENSE
new file mode 100644
index 0000000000..2e01e36876
--- /dev/null
+++ b/src/tree_sitter/unicode/LICENSE
@@ -0,0 +1,414 @@
+COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
+
+Copyright © 1991-2019 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+---------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+1. ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ # The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ # The BSD License
+ # http://opensource.org/licenses/bsd-license.php
+ # Copyright (C) 2006-2008, Google Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ # Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ # Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ # Neither the name of Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ # The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ # * Libtabe (Chinese)
+ # - https://sourceforge.net/project/?group_id=1519
+ # - Its license terms and conditions are shown below.
+ #
+ # * IPADIC (Japanese)
+ # - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ # - Its license terms and conditions are shown below.
+ #
+ # ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ # /*
+ # * Copyright (c) 1999 TaBE Project.
+ # * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ # * All rights reserved.
+ # *
+ # * Redistribution and use in source and binary forms, with or without
+ # * modification, are permitted provided that the following conditions
+ # * are met:
+ # *
+ # * . Redistributions of source code must retain the above copyright
+ # * notice, this list of conditions and the following disclaimer.
+ # * . Redistributions in binary form must reproduce the above copyright
+ # * notice, this list of conditions and the following disclaimer in
+ # * the documentation and/or other materials provided with the
+ # * distribution.
+ # * . Neither the name of the TaBE Project nor the names of its
+ # * contributors may be used to endorse or promote products derived
+ # * from this software without specific prior written permission.
+ # *
+ # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # * OF THE POSSIBILITY OF SUCH DAMAGE.
+ # */
+ #
+ # /*
+ # * Copyright (c) 1999 Computer Systems and Communication Lab,
+ # * Institute of Information Science, Academia
+ # * Sinica. All rights reserved.
+ # *
+ # * Redistribution and use in source and binary forms, with or without
+ # * modification, are permitted provided that the following conditions
+ # * are met:
+ # *
+ # * . Redistributions of source code must retain the above copyright
+ # * notice, this list of conditions and the following disclaimer.
+ # * . Redistributions in binary form must reproduce the above copyright
+ # * notice, this list of conditions and the following disclaimer in
+ # * the documentation and/or other materials provided with the
+ # * distribution.
+ # * . Neither the name of the Computer Systems and Communication Lab
+ # * nor the names of its contributors may be used to endorse or
+ # * promote products derived from this software without specific
+ # * prior written permission.
+ # *
+ # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # * OF THE POSSIBILITY OF SUCH DAMAGE.
+ # */
+ #
+ # Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ # University of Illinois
+ # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ # ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ # ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ # and Technology. All Rights Reserved.
+ #
+ # Use, reproduction, and distribution of this software is permitted.
+ # Any copy of this software, whether in its original form or modified,
+ # must include both the above copyright notice and the following
+ # paragraphs.
+ #
+ # Nara Institute of Science and Technology (NAIST),
+ # the copyright holders, disclaims all warranties with regard to this
+ # software, including all implied warranties of merchantability and
+ # fitness, in no event shall NAIST be liable for
+ # any special, indirect or consequential damages or any damages
+ # whatsoever resulting from loss of use, data or profits, whether in an
+ # action of contract, negligence or other tortuous action, arising out
+ # of or in connection with the use or performance of this software.
+ #
+ # A large portion of the dictionary entries
+ # originate from ICOT Free Software. The following conditions for ICOT
+ # Free Software applies to the current dictionary as well.
+ #
+ # Each User may also freely distribute the Program, whether in its
+ # original form or modified, to any third party or parties, PROVIDED
+ # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ # on, or be attached to, the Program, which is distributed substantially
+ # in the same form as set out herein and that such intended
+ # distribution, if actually made, will neither violate or otherwise
+ # contravene any of the laws and regulations of the countries having
+ # jurisdiction over the User or the intended distribution itself.
+ #
+ # NO WARRANTY
+ #
+ # The program was produced on an experimental basis in the course of the
+ # research and development conducted during the project and is provided
+ # to users as so produced on an experimental basis. Accordingly, the
+ # program is provided without any warranty whatsoever, whether express,
+ # implied, statutory or otherwise. The term "warranty" used herein
+ # includes, but is not limited to, any warranty of the quality,
+ # performance, merchantability and fitness for a particular purpose of
+ # the program and the nonexistence of any infringement or violation of
+ # any right of any third party.
+ #
+ # Each user of the program will agree and understand, and be deemed to
+ # have agreed and understood, that there is no warranty whatsoever for
+ # the program and, accordingly, the entire risk arising from or
+ # otherwise connected with the program is assumed by the user.
+ #
+ # Therefore, neither ICOT, the copyright holder, or any other
+ # organization that participated in or was otherwise related to the
+ # development of the program and their respective officials, directors,
+ # officers and other employees shall be held liable for any and all
+ # damages, including, without limitation, general, special, incidental
+ # and consequential damages, arising out of or otherwise in connection
+ # with the use or inability to use the program or any product, material
+ # or result produced or otherwise obtained by using the program,
+ # regardless of whether they have been advised of, or otherwise had
+ # knowledge of, the possibility of such damages at any time during the
+ # project or thereafter. Each user will be deemed to have agreed to the
+ # foregoing by his or her commencement of use of the program. The term
+ # "use" as used herein includes, but is not limited to, the use,
+ # modification, copying and distribution of the program and the
+ # production of secondary products from the program.
+ #
+ # In the case where the program, whether in its original form or
+ # modified, was distributed or delivered to or received by a user from
+ # any person, organization or entity other than ICOT, unless it makes or
+ # grants independently of ICOT any specific warranty to the user in
+ # writing, such person, organization or entity, will also be exempted
+ # from and not be held liable to the user for any such damages as noted
+ # above as far as the program is concerned.
+ #
+ # ---------------COPYING.ipadic-----END----------------------------------
+
+3. Lao Word Break Dictionary Data (laodict.txt)
+
+ # Copyright (c) 2013 International Business Machines Corporation
+ # and others. All Rights Reserved.
+ #
+ # Project: http://code.google.com/p/lao-dictionary/
+ # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
+ # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
+ # (copied below)
+ #
+ # This file is derived from the above dictionary, with slight
+ # modifications.
+ # ----------------------------------------------------------------------
+ # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification,
+ # are permitted provided that the following conditions are met:
+ #
+ #
+ # Redistributions of source code must retain the above copyright notice, this
+ # list of conditions and the following disclaimer. Redistributions in
+ # binary form must reproduce the above copyright notice, this list of
+ # conditions and the following disclaimer in the documentation and/or
+ # other materials provided with the distribution.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ # --------------------------------------------------------------------------
+
+4. Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ # Copyright (c) 2014 International Business Machines Corporation
+ # and others. All Rights Reserved.
+ #
+ # This list is part of a project hosted at:
+ # github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ # --------------------------------------------------------------------------
+ # Copyright (c) 2013, LeRoy Benjamin Sharon
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions
+ # are met: Redistributions of source code must retain the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer. Redistributions in binary form must reproduce the
+ # above copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided
+ # with the distribution.
+ #
+ # Neither the name Myanmar Karen Word Lists, nor the names of its
+ # contributors may be used to endorse or promote products derived
+ # from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ # --------------------------------------------------------------------------
+
+5. Time Zone Database
+
+ ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7. Database Ownership
+ #
+ # The TZ database itself is not an IETF Contribution or an IETF
+ # document. Rather it is a pre-existing and regularly updated work
+ # that is in the public domain, and is intended to remain in the
+ # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ # not apply to the TZ Database or contributions that individuals make
+ # to it. Should any claims be made and substantiated against the TZ
+ # Database, the organization that is providing the IANA
+ # Considerations defined in this RFC, under the memorandum of
+ # understanding with the IETF, currently ICANN, may act in accordance
+ # with all competent court orders. No ownership claims will be made
+ # by ICANN or the IETF Trust on the database or the code. Any person
+ # making a contribution to the database or code waives all rights to
+ # future claims in that contribution or in the TZ Database.
+
+6. Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/tree_sitter/unicode/README.md b/src/tree_sitter/unicode/README.md
new file mode 100644
index 0000000000..623b8e3843
--- /dev/null
+++ b/src/tree_sitter/unicode/README.md
@@ -0,0 +1,29 @@
+# ICU Parts
+
+This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
+
+### License
+
+The license for these files is contained in the `LICENSE` file within this directory.
+
+### Contents
+
+* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
+ * `utf8.h`
+ * `utf16.h`
+ * `umachine.h`
+* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
+ * `ptypes.h`
+ * `urename.h`
+ * `utf.h`
+* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
+* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
+* `README.md` - This text file.
+
+### Updating ICU
+
+To incorporate changes from the upstream `icu` repository:
+
+* Update `ICU_SHA` with the new Git SHA.
+* Update `LICENSE` with the license text from the directory mentioned above.
+* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.
diff --git a/src/tree_sitter/unicode/ptypes.h b/src/tree_sitter/unicode/ptypes.h
new file mode 100644
index 0000000000..ac79ad0f98
--- /dev/null
+++ b/src/tree_sitter/unicode/ptypes.h
@@ -0,0 +1 @@
+// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/src/tree_sitter/unicode/umachine.h b/src/tree_sitter/unicode/umachine.h
new file mode 100644
index 0000000000..bbf6ef9c8b
--- /dev/null
+++ b/src/tree_sitter/unicode/umachine.h
@@ -0,0 +1,448 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 1999-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: umachine.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep13
+* created by: Markus W. Scherer
+*
+* This file defines basic types and constants for ICU to be
+* platform-independent. umachine.h and utf.h are included into
+* utypes.h to provide all the general definitions for ICU.
+* All of these definitions used to be in utypes.h before
+* the UTF-handling macros made this unmaintainable.
+*/
+
+#ifndef __UMACHINE_H__
+#define __UMACHINE_H__
+
+
+/**
+ * \file
+ * \brief Basic types and constants for UTF
+ *
+ * <h2> Basic types and constants for UTF </h2>
+ * This file defines basic types and constants for utf.h to be
+ * platform-independent. umachine.h and utf.h are included into
+ * utypes.h to provide all the general definitions for ICU.
+ * All of these definitions used to be in utypes.h before
+ * the UTF-handling macros made this unmaintainable.
+ *
+ */
+/*==========================================================================*/
+/* Include platform-dependent definitions */
+/* which are contained in the platform-specific file platform.h */
+/*==========================================================================*/
+
+#include "./ptypes.h" /* platform.h is included in ptypes.h */
+
+/*
+ * ANSI C headers:
+ * stddef.h defines wchar_t
+ */
+#include <stddef.h>
+
+/*==========================================================================*/
+/* For C wrappers, we use the symbol U_STABLE. */
+/* This works properly if the includer is C or C++. */
+/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
+/*==========================================================================*/
+
+/**
+ * \def U_CFUNC
+ * This is used in a declaration of a library private ICU C function.
+ * @stable ICU 2.4
+ */
+
+/**
+ * \def U_CDECL_BEGIN
+ * This is used to begin a declaration of a library private ICU C API.
+ * @stable ICU 2.4
+ */
+
+/**
+ * \def U_CDECL_END
+ * This is used to end a declaration of a library private ICU C API
+ * @stable ICU 2.4
+ */
+
+#ifdef __cplusplus
+# define U_CFUNC extern "C"
+# define U_CDECL_BEGIN extern "C" {
+# define U_CDECL_END }
+#else
+# define U_CFUNC extern
+# define U_CDECL_BEGIN
+# define U_CDECL_END
+#endif
+
+#ifndef U_ATTRIBUTE_DEPRECATED
+/**
+ * \def U_ATTRIBUTE_DEPRECATED
+ * This is used for GCC specific attributes
+ * @internal
+ */
+#if U_GCC_MAJOR_MINOR >= 302
+# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
+/**
+ * \def U_ATTRIBUTE_DEPRECATED
+ * This is used for Visual C++ specific attributes
+ * @internal
+ */
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
+#else
+# define U_ATTRIBUTE_DEPRECATED
+#endif
+#endif
+
+/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
+#define U_CAPI U_CFUNC U_EXPORT
+/** This is used to declare a function as a stable public ICU C API*/
+#define U_STABLE U_CAPI
+/** This is used to declare a function as a draft public ICU C API */
+#define U_DRAFT U_CAPI
+/** This is used to declare a function as a deprecated public ICU C API */
+#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
+/** This is used to declare a function as an obsolete public ICU C API */
+#define U_OBSOLETE U_CAPI
+/** This is used to declare a function as an internal ICU C API */
+#define U_INTERNAL U_CAPI
+
+/**
+ * \def U_OVERRIDE
+ * Defined to the C++11 "override" keyword if available.
+ * Denotes a class or member which is an override of the base class.
+ * May result in an error if it applied to something not an override.
+ * @internal
+ */
+#ifndef U_OVERRIDE
+#define U_OVERRIDE override
+#endif
+
+/**
+ * \def U_FINAL
+ * Defined to the C++11 "final" keyword if available.
+ * Denotes a class or member which may not be overridden in subclasses.
+ * May result in an error if subclasses attempt to override.
+ * @internal
+ */
+#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
+#define U_FINAL final
+#endif
+
+// Before ICU 65, function-like, multi-statement ICU macros were just defined as
+// series of statements wrapped in { } blocks and the caller could choose to
+// either treat them as if they were actual functions and end the invocation
+// with a trailing ; creating an empty statement after the block or else omit
+// this trailing ; using the knowledge that the macro would expand to { }.
+//
+// But doing so doesn't work well with macros that look like functions and
+// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
+// switches to the standard solution of wrapping such macros in do { } while.
+//
+// This will however break existing code that depends on being able to invoke
+// these macros without a trailing ; so to be able to remain compatible with
+// such code the wrapper is itself defined as macros so that it's possible to
+// build ICU 65 and later with the old macro behaviour, like this:
+//
+// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
+// runConfigureICU ...
+
+/**
+ * \def UPRV_BLOCK_MACRO_BEGIN
+ * Defined as the "do" keyword by default.
+ * @internal
+ */
+#ifndef UPRV_BLOCK_MACRO_BEGIN
+#define UPRV_BLOCK_MACRO_BEGIN do
+#endif
+
+/**
+ * \def UPRV_BLOCK_MACRO_END
+ * Defined as "while (FALSE)" by default.
+ * @internal
+ */
+#ifndef UPRV_BLOCK_MACRO_END
+#define UPRV_BLOCK_MACRO_END while (FALSE)
+#endif
+
+/*==========================================================================*/
+/* limits for int32_t etc., like in POSIX inttypes.h */
+/*==========================================================================*/
+
+#ifndef INT8_MIN
+/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
+# define INT8_MIN ((int8_t)(-128))
+#endif
+#ifndef INT16_MIN
+/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
+# define INT16_MIN ((int16_t)(-32767-1))
+#endif
+#ifndef INT32_MIN
+/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
+# define INT32_MIN ((int32_t)(-2147483647-1))
+#endif
+
+#ifndef INT8_MAX
+/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
+# define INT8_MAX ((int8_t)(127))
+#endif
+#ifndef INT16_MAX
+/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
+# define INT16_MAX ((int16_t)(32767))
+#endif
+#ifndef INT32_MAX
+/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
+# define INT32_MAX ((int32_t)(2147483647))
+#endif
+
+#ifndef UINT8_MAX
+/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
+# define UINT8_MAX ((uint8_t)(255U))
+#endif
+#ifndef UINT16_MAX
+/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
+# define UINT16_MAX ((uint16_t)(65535U))
+#endif
+#ifndef UINT32_MAX
+/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
+# define UINT32_MAX ((uint32_t)(4294967295U))
+#endif
+
+#if defined(U_INT64_T_UNAVAILABLE)
+# error int64_t is required for decimal format and rule-based number format.
+#else
+# ifndef INT64_C
+/**
+ * Provides a platform independent way to specify a signed 64-bit integer constant.
+ * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
+ * @stable ICU 2.8
+ */
+# define INT64_C(c) c ## LL
+# endif
+# ifndef UINT64_C
+/**
+ * Provides a platform independent way to specify an unsigned 64-bit integer constant.
+ * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
+ * @stable ICU 2.8
+ */
+# define UINT64_C(c) c ## ULL
+# endif
+# ifndef U_INT64_MIN
+/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
+# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
+# endif
+# ifndef U_INT64_MAX
+/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
+# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
+# endif
+# ifndef U_UINT64_MAX
+/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
+# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
+# endif
+#endif
+
+/*==========================================================================*/
+/* Boolean data type */
+/*==========================================================================*/
+
+/** The ICU boolean type @stable ICU 2.0 */
+typedef int8_t UBool;
+
+#ifndef TRUE
+/** The TRUE value of a UBool @stable ICU 2.0 */
+# define TRUE 1
+#endif
+#ifndef FALSE
+/** The FALSE value of a UBool @stable ICU 2.0 */
+# define FALSE 0
+#endif
+
+
+/*==========================================================================*/
+/* Unicode data types */
+/*==========================================================================*/
+
+/* wchar_t-related definitions -------------------------------------------- */
+
+/*
+ * \def U_WCHAR_IS_UTF16
+ * Defined if wchar_t uses UTF-16.
+ *
+ * @stable ICU 2.0
+ */
+/*
+ * \def U_WCHAR_IS_UTF32
+ * Defined if wchar_t uses UTF-32.
+ *
+ * @stable ICU 2.0
+ */
+#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
+# ifdef __STDC_ISO_10646__
+# if (U_SIZEOF_WCHAR_T==2)
+# define U_WCHAR_IS_UTF16
+# elif (U_SIZEOF_WCHAR_T==4)
+# define U_WCHAR_IS_UTF32
+# endif
+# elif defined __UCS2__
+# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
+# define U_WCHAR_IS_UTF16
+# endif
+# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
+# if (U_SIZEOF_WCHAR_T==4)
+# define U_WCHAR_IS_UTF32
+# endif
+# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
+# define U_WCHAR_IS_UTF32
+# elif U_PLATFORM_HAS_WIN32_API
+# define U_WCHAR_IS_UTF16
+# endif
+#endif
+
+/* UChar and UChar32 definitions -------------------------------------------- */
+
+/** Number of bytes in a UChar. @stable ICU 2.0 */
+#define U_SIZEOF_UCHAR 2
+
+/**
+ * \def U_CHAR16_IS_TYPEDEF
+ * If 1, then char16_t is a typedef and not a real type (yet)
+ * @internal
+ */
+#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11)
+// for AIX, uchar.h needs to be included
+# include <uchar.h>
+# define U_CHAR16_IS_TYPEDEF 1
+#elif defined(_MSC_VER) && (_MSC_VER < 1900)
+// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
+// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
+# define U_CHAR16_IS_TYPEDEF 1
+#else
+# define U_CHAR16_IS_TYPEDEF 0
+#endif
+
+
+/**
+ * \var UChar
+ *
+ * The base type for UTF-16 code units and pointers.
+ * Unsigned 16-bit integer.
+ * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
+ *
+ * UChar is configurable by defining the macro UCHAR_TYPE
+ * on the preprocessor or compiler command line:
+ * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
+ * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
+ * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
+ *
+ * The default is UChar=char16_t.
+ *
+ * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
+ *
+ * In C, char16_t is a simple typedef of uint_least16_t.
+ * ICU requires uint_least16_t=uint16_t for data memory mapping.
+ * On macOS, char16_t is not available because the uchar.h standard header is missing.
+ *
+ * @stable ICU 4.4
+ */
+
+#if 1
+ // #if 1 is normal. UChar defaults to char16_t in C++.
+ // For configuration testing of UChar=uint16_t temporarily change this to #if 0.
+ // The intltest Makefile #defines UCHAR_TYPE=char16_t,
+ // so we only #define it to uint16_t if it is undefined so far.
+#elif !defined(UCHAR_TYPE)
+# define UCHAR_TYPE uint16_t
+#endif
+
+#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
+ defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
+ // Inside the ICU library code, never configurable.
+ typedef char16_t UChar;
+#elif defined(UCHAR_TYPE)
+ typedef UCHAR_TYPE UChar;
+#elif defined(__cplusplus)
+ typedef char16_t UChar;
+#else
+ typedef uint16_t UChar;
+#endif
+
+/**
+ * \var OldUChar
+ * Default ICU 58 definition of UChar.
+ * A base type for UTF-16 code units and pointers.
+ * Unsigned 16-bit integer.
+ *
+ * Define OldUChar to be wchar_t if that is 16 bits wide.
+ * If wchar_t is not 16 bits wide, then define UChar to be uint16_t.
+ *
+ * This makes the definition of OldUChar platform-dependent
+ * but allows direct string type compatibility with platforms with
+ * 16-bit wchar_t types.
+ *
+ * This is how UChar was defined in ICU 58, for transition convenience.
+ * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
+ * The current UChar responds to UCHAR_TYPE but OldUChar does not.
+ *
+ * @stable ICU 59
+ */
+#if U_SIZEOF_WCHAR_T==2
+ typedef wchar_t OldUChar;
+#elif defined(__CHAR16_TYPE__)
+ typedef __CHAR16_TYPE__ OldUChar;
+#else
+ typedef uint16_t OldUChar;
+#endif
+
+/**
+ * Define UChar32 as a type for single Unicode code points.
+ * UChar32 is a signed 32-bit integer (same as int32_t).
+ *
+ * The Unicode code point range is 0..0x10ffff.
+ * All other values (negative or >=0x110000) are illegal as Unicode code points.
+ * They may be used as sentinel values to indicate "done", "error"
+ * or similar non-code point conditions.
+ *
+ * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
+ * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
+ * or else to be uint32_t.
+ * That is, the definition of UChar32 was platform-dependent.
+ *
+ * @see U_SENTINEL
+ * @stable ICU 2.4
+ */
+typedef int32_t UChar32;
+
+/**
+ * This value is intended for sentinel values for APIs that
+ * (take or) return single code points (UChar32).
+ * It is outside of the Unicode code point range 0..0x10ffff.
+ *
+ * For example, a "done" or "error" value in a new API
+ * could be indicated with U_SENTINEL.
+ *
+ * ICU APIs designed before ICU 2.4 usually define service-specific "done"
+ * values, mostly 0xffff.
+ * Those may need to be distinguished from
+ * actual U+ffff text contents by calling functions like
+ * CharacterIterator::hasNext() or UnicodeString::length().
+ *
+ * @return -1
+ * @see UChar32
+ * @stable ICU 2.4
+ */
+#define U_SENTINEL (-1)
+
+#include "./urename.h"
+
+#endif
diff --git a/src/tree_sitter/unicode/urename.h b/src/tree_sitter/unicode/urename.h
new file mode 100644
index 0000000000..ac79ad0f98
--- /dev/null
+++ b/src/tree_sitter/unicode/urename.h
@@ -0,0 +1 @@
+// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/src/tree_sitter/unicode/utf.h b/src/tree_sitter/unicode/utf.h
new file mode 100644
index 0000000000..ac79ad0f98
--- /dev/null
+++ b/src/tree_sitter/unicode/utf.h
@@ -0,0 +1 @@
+// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/src/tree_sitter/unicode/utf16.h b/src/tree_sitter/unicode/utf16.h
new file mode 100644
index 0000000000..b547922441
--- /dev/null
+++ b/src/tree_sitter/unicode/utf16.h
@@ -0,0 +1,733 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: utf16.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep09
+* created by: Markus W. Scherer
+*/
+
+/**
+ * \file
+ * \brief C API: 16-bit Unicode handling macros
+ *
+ * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
+ *
+ * For more information see utf.h and the ICU User Guide Strings chapter
+ * (http://userguide.icu-project.org/strings).
+ *
+ * <em>Usage:</em>
+ * ICU coding guidelines for if() statements should be followed when using these macros.
+ * Compound statements (curly braces {}) must be used for if-else-while...
+ * bodies and all macro statements should be terminated with semicolon.
+ */
+
+#ifndef __UTF16_H__
+#define __UTF16_H__
+
+#include "./umachine.h"
+#ifndef __UTF_H__
+# include "./utf.h"
+#endif
+
+/* single-code point definitions -------------------------------------------- */
+
+/**
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
+
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
+
+/**
+ * Is this code unit a surrogate (U+d800..U+dfff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
+
+/**
+ * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
+ * is it a lead surrogate?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
+
+/**
+ * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
+ * is it a trail surrogate?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @stable ICU 4.2
+ */
+#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
+
+/**
+ * Helper constant for U16_GET_SUPPLEMENTARY.
+ * @internal
+ */
+#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+
+/**
+ * Get a supplementary code point value (U+10000..U+10ffff)
+ * from its lead and trail surrogates.
+ * The result is undefined if the input values are not
+ * lead and trail surrogates.
+ *
+ * @param lead lead surrogate (U+d800..U+dbff)
+ * @param trail trail surrogate (U+dc00..U+dfff)
+ * @return supplementary code point (U+10000..U+10ffff)
+ * @stable ICU 2.4
+ */
+#define U16_GET_SUPPLEMENTARY(lead, trail) \
+ (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
+
+
+/**
+ * Get the lead surrogate (0xd800..0xdbff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return lead surrogate (U+d800..U+dbff) for supplementary
+ * @stable ICU 2.4
+ */
+#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
+
+/**
+ * Get the trail surrogate (0xdc00..0xdfff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return trail surrogate (U+dc00..U+dfff) for supplementary
+ * @stable ICU 2.4
+ */
+#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
+
+/**
+ * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
+ * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
+ * @param c 32-bit code point
+ * @return 1 or 2
+ * @stable ICU 2.4
+ */
+#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+
+/**
+ * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+ * @return 2
+ * @stable ICU 2.4
+ */
+#define U16_MAX_LENGTH 2
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ * The result is undefined if the offset points to a single, unpaired surrogate.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U16_GET
+ * @stable ICU 2.4
+ */
+#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[i]; \
+ if(U16_IS_SURROGATE(c)) { \
+ if(U16_IS_SURROGATE_LEAD(c)) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
+ } else { \
+ (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to a single, unpaired surrogate, then
+ * c is set to that unpaired surrogate.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<=i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_GET_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[i]; \
+ if(U16_IS_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if(U16_IS_SURROGATE_LEAD(c)) { \
+ if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } else { \
+ if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to a single, unpaired surrogate, then
+ * c is set to U+FFFD.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<=i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_GET_UNSAFE
+ * @stable ICU 60
+ */
+#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[i]; \
+ if(U16_IS_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if(U16_IS_SURROGATE_LEAD(c)) { \
+ if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } else { \
+ (c)=0xfffd; \
+ } \
+ } else { \
+ if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } else { \
+ (c)=0xfffd; \
+ } \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/* definitions with forward iteration --------------------------------------- */
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate, then that itself
+ * will be returned as the code point.
+ * The result is undefined if the offset points to a single, unpaired lead surrogate.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U16_NEXT
+ * @stable ICU 2.4
+ */
+#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[(i)++]; \
+ if(U16_IS_LEAD(c)) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate or
+ * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
+ *
+ * @param s const UChar * string
+ * @param i string offset, must be i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[(i)++]; \
+ if(U16_IS_LEAD(c)) { \
+ uint16_t __c2; \
+ if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate or
+ * to a single, unpaired lead surrogate, then c is set to U+FFFD.
+ *
+ * @param s const UChar * string
+ * @param i string offset, must be i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @see U16_NEXT_UNSAFE
+ * @stable ICU 60
+ */
+#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[(i)++]; \
+ if(U16_IS_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
+ } else { \
+ (c)=0xfffd; \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Append a code point to a string, overwriting 1 or 2 code units.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const UChar * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see U16_APPEND
+ * @stable ICU 2.4
+ */
+#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ if((uint32_t)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16_t)(c); \
+ } else { \
+ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Append a code point to a string, overwriting 1 or 2 code units.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Safe" macro, checks for a valid code point.
+ * If a surrogate pair is written, checks for sufficient space in the string.
+ * If the code point is not valid or a trail surrogate does not fit,
+ * then isError is set to TRUE.
+ *
+ * @param s const UChar * string buffer
+ * @param i string offset, must be i<capacity
+ * @param capacity size of the string buffer
+ * @param c code point to append
+ * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
+ * @see U16_APPEND_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
+ if((uint32_t)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16_t)(c); \
+ } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
+ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } else /* c>0x10ffff or not enough space */ { \
+ (isError)=TRUE; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_FWD_1
+ * @stable ICU 2.4
+ */
+#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_LEAD((s)[(i)++])) { \
+ ++(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const UChar * string
+ * @param i string offset, must be i<length
+ * @param length string length
+ * @see U16_FWD_1_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U16_FWD_N
+ * @stable ICU 2.4
+ */
+#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ U16_FWD_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const UChar * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param n number of code points to skip
+ * @see U16_FWD_N_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
+ U16_FWD_1(s, i, length); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to the trail surrogate of a surrogate pair,
+ * then the offset is decremented.
+ * Otherwise, it is not modified.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_SET_CP_START
+ * @stable ICU 2.4
+ */
+#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_TRAIL((s)[i])) { \
+ --(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to the trail surrogate of a surrogate pair,
+ * then the offset is decremented.
+ * Otherwise, it is not modified.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<=i
+ * @see U16_SET_CP_START_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
+ --(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/* definitions with backward iteration -------------------------------------- */
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a trail surrogate unit
+ * for a supplementary code point, then the macro will read
+ * the preceding lead surrogate as well.
+ * If the offset is behind a lead surrogate, then that itself
+ * will be returned as the code point.
+ * The result is undefined if the offset is behind a single, unpaired trail surrogate.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U16_PREV
+ * @stable ICU 2.4
+ */
+#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[--(i)]; \
+ if(U16_IS_TRAIL(c)) { \
+ (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a trail surrogate unit
+ * for a supplementary code point, then the macro will read
+ * the preceding lead surrogate as well.
+ * If the offset is behind a lead surrogate or behind a single, unpaired
+ * trail surrogate, then c is set to that unpaired surrogate.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<i
+ * @param c output UChar32 variable
+ * @see U16_PREV_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[--(i)]; \
+ if(U16_IS_TRAIL(c)) { \
+ uint16_t __c2; \
+ if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ --(i); \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a trail surrogate unit
+ * for a supplementary code point, then the macro will read
+ * the preceding lead surrogate as well.
+ * If the offset is behind a lead surrogate or behind a single, unpaired
+ * trail surrogate, then c is set to U+FFFD.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<i
+ * @param c output UChar32 variable
+ * @see U16_PREV_UNSAFE
+ * @stable ICU 60
+ */
+#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(s)[--(i)]; \
+ if(U16_IS_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ --(i); \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } else { \
+ (c)=0xfffd; \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_BACK_1
+ * @stable ICU 2.4
+ */
+#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_TRAIL((s)[--(i)])) { \
+ --(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, must be start<i
+ * @see U16_BACK_1_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
+ --(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U16_BACK_N
+ * @stable ICU 2.4
+ */
+#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ U16_BACK_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start start of string
+ * @param i string offset, must be start<i
+ * @param n number of code points to skip
+ * @see U16_BACK_N_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0 && (i)>(start)) { \
+ U16_BACK_1(s, start, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind the lead surrogate of a surrogate pair,
+ * then the offset is incremented.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_SET_CP_LIMIT
+ * @stable ICU 2.4
+ */
+#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U16_IS_LEAD((s)[(i)-1])) { \
+ ++(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind the lead surrogate of a surrogate pair,
+ * then the offset is incremented.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const UChar * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, start<=i<=length
+ * @param length int32_t string length
+ * @see U16_SET_CP_LIMIT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
+ if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+#endif
diff --git a/src/tree_sitter/unicode/utf8.h b/src/tree_sitter/unicode/utf8.h
new file mode 100644
index 0000000000..3b37873e37
--- /dev/null
+++ b/src/tree_sitter/unicode/utf8.h
@@ -0,0 +1,881 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: utf8.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999sep13
+* created by: Markus W. Scherer
+*/
+
+/**
+ * \file
+ * \brief C API: 8-bit Unicode handling macros
+ *
+ * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
+ *
+ * For more information see utf.h and the ICU User Guide Strings chapter
+ * (http://userguide.icu-project.org/strings).
+ *
+ * <em>Usage:</em>
+ * ICU coding guidelines for if() statements should be followed when using these macros.
+ * Compound statements (curly braces {}) must be used for if-else-while...
+ * bodies and all macro statements should be terminated with semicolon.
+ */
+
+#ifndef __UTF8_H__
+#define __UTF8_H__
+
+#include "./umachine.h"
+#ifndef __UTF_H__
+# include "./utf.h"
+#endif
+
+/* internal definitions ----------------------------------------------------- */
+
+/**
+ * Counts the trail bytes for a UTF-8 lead byte.
+ * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is called by public macros in this file and thus must remain stable.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @internal
+ */
+#define U8_COUNT_TRAIL_BYTES(leadByte) \
+ (U8_IS_LEAD(leadByte) ? \
+ ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
+
+/**
+ * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
+ * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is called by public macros in this file and thus must remain stable.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
+ * @internal
+ */
+#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
+ (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
+
+/**
+ * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is called by public macros in this file and thus must remain stable.
+ * @internal
+ */
+#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+
+/**
+ * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * Lead byte E0..EF bits 3..0 are used as byte index,
+ * first trail byte bits 7..5 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD3_AND_T1
+ * @internal
+ */
+#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
+
+/**
+ * Internal 3-byte UTF-8 validity check.
+ * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
+
+/**
+ * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
+ * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
+ * First trail byte bits 7..4 are used as byte index,
+ * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
+ * @see U8_IS_VALID_LEAD4_AND_T1
+ * @internal
+ */
+#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
+
+/**
+ * Internal 4-byte UTF-8 validity check.
+ * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
+ * @internal
+ */
+#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
+
+/**
+ * Function for handling "next code point" with error-checking.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
+ * file and thus must remain stable, and should not be hidden when other internal
+ * functions are hidden (otherwise public macros would fail to compile).
+ * @internal
+ */
+U_STABLE UChar32 U_EXPORT2
+utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
+
+/**
+ * Function for handling "append code point" with error-checking.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
+ * file and thus must remain stable, and should not be hidden when other internal
+ * functions are hidden (otherwise public macros would fail to compile).
+ * @internal
+ */
+U_STABLE int32_t U_EXPORT2
+utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
+
+/**
+ * Function for handling "previous code point" with error-checking.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
+ * file and thus must remain stable, and should not be hidden when other internal
+ * functions are hidden (otherwise public macros would fail to compile).
+ * @internal
+ */
+U_STABLE UChar32 U_EXPORT2
+utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
+
+/**
+ * Function for handling "skip backward one code point" with error-checking.
+ *
+ * This is internal since it is not meant to be called directly by external clients;
+ * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
+ * file and thus must remain stable, and should not be hidden when other internal
+ * functions are hidden (otherwise public macros would fail to compile).
+ * @internal
+ */
+U_STABLE int32_t U_EXPORT2
+utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
+
+/* single-code point definitions -------------------------------------------- */
+
+/**
+ * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U8_IS_SINGLE(c) (((c)&0x80)==0)
+
+/**
+ * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
+// 0x32=0xf4-0xc2
+
+/**
+ * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @stable ICU 2.4
+ */
+#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
+
+/**
+ * How many code units (bytes) are used for the UTF-8 encoding
+ * of this Unicode code point?
+ * @param c 32-bit code point
+ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+ * @stable ICU 2.4
+ */
+#define U8_LENGTH(c) \
+ ((uint32_t)(c)<=0x7f ? 1 : \
+ ((uint32_t)(c)<=0x7ff ? 2 : \
+ ((uint32_t)(c)<=0xd7ff ? 3 : \
+ ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
+ ((uint32_t)(c)<=0xffff ? 3 : 4)\
+ ) \
+ ) \
+ ) \
+ )
+
+/**
+ * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+ * @return 4
+ * @stable ICU 2.4
+ */
+#define U8_MAX_LENGTH 4
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ * The result is undefined if the offset points to an illegal UTF-8
+ * byte sequence.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_GET
+ * @stable ICU 2.4
+ */
+#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t _u8_get_unsafe_index=(int32_t)(i); \
+ U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
+ U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to an illegal UTF-8 byte sequence, then
+ * c is set to a negative value.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset
+ * @param i int32_t string offset, must be start<=i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see U8_GET_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t _u8_get_index=(i); \
+ U8_SET_CP_START(s, start, _u8_get_index); \
+ U8_NEXT(s, _u8_get_index, length, c); \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * If the offset points to an illegal UTF-8 byte sequence, then
+ * c is set to U+FFFD.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_GET() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset
+ * @param i int32_t string offset, must be start<=i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_GET
+ * @stable ICU 51
+ */
+#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t _u8_get_index=(i); \
+ U8_SET_CP_START(s, start, _u8_get_index); \
+ U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
+} UPRV_BLOCK_MACRO_END
+
+/* definitions with forward iteration --------------------------------------- */
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * The result is undefined if the offset points to a trail byte
+ * or an illegal UTF-8 sequence.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_NEXT
+ * @stable ICU 2.4
+ */
+#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(uint8_t)(s)[(i)++]; \
+ if(!U8_IS_SINGLE(c)) { \
+ if((c)<0xe0) { \
+ (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
+ } else if((c)<0xf0) { \
+ /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
+ (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
+ (i)+=2; \
+ } else { \
+ (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
+ (i)+=3; \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to a negative value.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see U8_NEXT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
+
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to U+FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_NEXT() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_NEXT
+ * @stable ICU 51
+ */
+#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
+
+/** @internal */
+#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(uint8_t)(s)[(i)++]; \
+ if(!U8_IS_SINGLE(c)) { \
+ uint8_t __t = 0; \
+ if((i)!=(length) && \
+ /* fetch/validate/assemble all but last trail byte */ \
+ ((c)>=0xe0 ? \
+ ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
+ U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
+ (__t&=0x3f, 1) \
+ : /* U+10000..U+10FFFF */ \
+ ((c)-=0xf0)<=4 && \
+ U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
+ ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
+ (__t=(s)[i]-0x80)<=0x3f) && \
+ /* valid second-to-last trail byte */ \
+ ((c)=((c)<<6)|__t, ++(i)!=(length)) \
+ : /* U+0080..U+07FF */ \
+ (c)>=0xc2 && ((c)&=0x1f, 1)) && \
+ /* last trail byte */ \
+ (__t=(s)[i]-0x80)<=0x3f && \
+ ((c)=((c)<<6)|__t, ++(i), 1)) { \
+ } else { \
+ (c)=(sub); /* ill-formed*/ \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s const uint8_t * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see U8_APPEND
+ * @stable ICU 2.4
+ */
+#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ uint32_t __uc=(c); \
+ if(__uc<=0x7f) { \
+ (s)[(i)++]=(uint8_t)__uc; \
+ } else { \
+ if(__uc<=0x7ff) { \
+ (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
+ } else { \
+ if(__uc<=0xffff) { \
+ (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
+ } else { \
+ (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
+ (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
+ } \
+ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Safe" macro, checks for a valid code point.
+ * If a non-ASCII code point is written, checks for sufficient space in the string.
+ * If the code point is not valid or trail bytes do not fit,
+ * then isError is set to TRUE.
+ *
+ * @param s const uint8_t * string buffer
+ * @param i int32_t string offset, must be i<capacity
+ * @param capacity int32_t size of the string buffer
+ * @param c UChar32 code point to append
+ * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
+ * @see U8_APPEND_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
+ uint32_t __uc=(c); \
+ if(__uc<=0x7f) { \
+ (s)[(i)++]=(uint8_t)__uc; \
+ } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
+ (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
+ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
+ } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
+ (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
+ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
+ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
+ } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
+ (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
+ (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
+ (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
+ (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
+ } else { \
+ (isError)=TRUE; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_FWD_1
+ * @stable ICU 2.4
+ */
+#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @see U8_FWD_1_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
+ uint8_t __b=(s)[(i)++]; \
+ if(U8_IS_LEAD(__b) && (i)!=(length)) { \
+ uint8_t __t1=(s)[i]; \
+ if((0xe0<=__b && __b<0xf0)) { \
+ if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
+ } else if(__b<0xe0) { \
+ if(U8_IS_TRAIL(__t1)) { \
+ ++(i); \
+ } \
+ } else /* c>=0xf0 */ { \
+ if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
+ ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
+ ++(i); \
+ } \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U8_FWD_N
+ * @stable ICU 2.4
+ */
+#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ U8_FWD_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const uint8_t * string
+ * @param i int32_t string offset, must be i<length
+ * @param length int32_t string length
+ * @param n number of code points to skip
+ * @see U8_FWD_N_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
+ U8_FWD_1(s, i, length); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to a UTF-8 trail byte,
+ * then the offset is moved backward to the corresponding lead byte.
+ * Otherwise, it is not modified.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_SET_CP_START
+ * @stable ICU 2.4
+ */
+#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ while(U8_IS_TRAIL((s)[i])) { --(i); } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to a UTF-8 trail byte,
+ * then the offset is moved backward to the corresponding lead byte.
+ * Otherwise, it is not modified.
+ *
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<=i
+ * @see U8_SET_CP_START_UNSAFE
+ * @see U8_TRUNCATE_IF_INCOMPLETE
+ * @stable ICU 2.4
+ */
+#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U8_IS_TRAIL((s)[(i)])) { \
+ (i)=utf8_back1SafeBody(s, start, (i)); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * If the string ends with a UTF-8 byte sequence that is valid so far
+ * but incomplete, then reduce the length of the string to end before
+ * the lead byte of that incomplete sequence.
+ * For example, if the string ends with E1 80, the length is reduced by 2.
+ *
+ * In all other cases (the string ends with a complete sequence, or it is not
+ * possible for any further trail byte to extend the trailing sequence)
+ * the length remains unchanged.
+ *
+ * Useful for processing text split across multiple buffers
+ * (save the incomplete sequence for later)
+ * and for optimizing iteration
+ * (check for string length only once per character).
+ *
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ * Unlike U8_SET_CP_START(), this macro never reads s[length].
+ *
+ * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param length int32_t string length (usually start<=length)
+ * @see U8_SET_CP_START
+ * @stable ICU 61
+ */
+#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
+ if((length)>(start)) { \
+ uint8_t __b1=s[(length)-1]; \
+ if(U8_IS_SINGLE(__b1)) { \
+ /* common ASCII character */ \
+ } else if(U8_IS_LEAD(__b1)) { \
+ --(length); \
+ } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
+ uint8_t __b2=s[(length)-2]; \
+ if(0xe0<=__b2 && __b2<=0xf4) { \
+ if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
+ U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
+ (length)-=2; \
+ } \
+ } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
+ uint8_t __b3=s[(length)-3]; \
+ if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
+ (length)-=3; \
+ } \
+ } \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/* definitions with backward iteration -------------------------------------- */
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte, then that itself
+ * will be returned as the code point.
+ * The result is undefined if the offset is behind an illegal UTF-8 sequence.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_PREV
+ * @stable ICU 2.4
+ */
+#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(uint8_t)(s)[--(i)]; \
+ if(U8_IS_TRAIL(c)) { \
+ uint8_t __b, __count=1, __shift=6; \
+\
+ /* c is a trail byte */ \
+ (c)&=0x3f; \
+ for(;;) { \
+ __b=(s)[--(i)]; \
+ if(__b>=0xc0) { \
+ U8_MASK_LEAD_BYTE(__b, __count); \
+ (c)|=(UChar32)__b<<__shift; \
+ break; \
+ } else { \
+ (c)|=(UChar32)(__b&0x3f)<<__shift; \
+ ++__count; \
+ __shift+=6; \
+ } \
+ } \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte, then that itself
+ * will be returned as the code point.
+ * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<i
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see U8_PREV_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(uint8_t)(s)[--(i)]; \
+ if(!U8_IS_SINGLE(c)) { \
+ (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte, then that itself
+ * will be returned as the code point.
+ * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
+ *
+ * This macro does not distinguish between a real U+FFFD in the text
+ * and U+FFFD returned for an ill-formed sequence.
+ * Use U8_PREV() if that distinction is important.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<i
+ * @param c output UChar32 variable, set to U+FFFD in case of an error
+ * @see U8_PREV
+ * @stable ICU 51
+ */
+#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
+ (c)=(uint8_t)(s)[--(i)]; \
+ if(!U8_IS_SINGLE(c)) { \
+ (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_BACK_1
+ * @stable ICU 2.4
+ */
+#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ while(U8_IS_TRAIL((s)[--(i)])) {} \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<i
+ * @see U8_BACK_1_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
+ if(U8_IS_TRAIL((s)[--(i)])) { \
+ (i)=utf8_back1SafeBody(s, start, (i)); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U8_BACK_N
+ * @stable ICU 2.4
+ */
+#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ U8_BACK_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t index of the start of the string
+ * @param i int32_t string offset, must be start<i
+ * @param n number of code points to skip
+ * @see U8_BACK_N_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
+ int32_t __N=(n); \
+ while(__N>0 && (i)>(start)) { \
+ U8_BACK_1(s, start, i); \
+ --__N; \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind a partial multi-byte sequence,
+ * then the offset is incremented to behind the whole sequence.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_SET_CP_LIMIT
+ * @stable ICU 2.4
+ */
+#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
+ U8_BACK_1_UNSAFE(s, i); \
+ U8_FWD_1_UNSAFE(s, i); \
+} UPRV_BLOCK_MACRO_END
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind a partial multi-byte sequence,
+ * then the offset is incremented to behind the whole sequence.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The length can be negative for a NUL-terminated string.
+ *
+ * @param s const uint8_t * string
+ * @param start int32_t starting string offset (usually 0)
+ * @param i int32_t string offset, must be start<=i<=length
+ * @param length int32_t string length
+ * @see U8_SET_CP_LIMIT_UNSAFE
+ * @stable ICU 2.4
+ */
+#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
+ if((start)<(i) && ((i)<(length) || (length)<0)) { \
+ U8_BACK_1(s, start, i); \
+ U8_FWD_1(s, i, length); \
+ } \
+} UPRV_BLOCK_MACRO_END
+
+#endif
diff --git a/test/functional/api/highlight_spec.lua b/test/functional/api/highlight_spec.lua
index 5297c6454e..b6514a105c 100644
--- a/test/functional/api/highlight_spec.lua
+++ b/test/functional/api/highlight_spec.lua
@@ -4,6 +4,9 @@ local Screen = require('test.functional.ui.screen')
local eq, eval = helpers.eq, helpers.eval
local command = helpers.command
local meths = helpers.meths
+local funcs = helpers.funcs
+local pcall_err = helpers.pcall_err
+local ok = helpers.ok
describe('API: highlight',function()
local expected_rgb = {
@@ -110,4 +113,20 @@ describe('API: highlight',function()
meths.get_hl_by_name('cursorline', 0));
end)
+
+ it('nvim_get_hl_id_by_name', function()
+ -- precondition: use a hl group that does not yet exist
+ eq('Invalid highlight name: Shrubbery', pcall_err(meths.get_hl_by_name, "Shrubbery", true))
+ eq(0, funcs.hlID("Shrubbery"))
+
+ local hl_id = meths.get_hl_id_by_name("Shrubbery")
+ ok(hl_id > 0)
+ eq(hl_id, funcs.hlID("Shrubbery"))
+
+ command('hi Shrubbery guifg=#888888 guibg=#888888')
+ eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
+ meths.get_hl_by_id(hl_id, true))
+ eq({foreground=tonumber("0x888888"), background=tonumber("0x888888")},
+ meths.get_hl_by_name("Shrubbery", true))
+ end)
end)
diff --git a/test/functional/lua/treesitter_spec.lua b/test/functional/lua/treesitter_spec.lua
index 5a53ca1425..76e9899d34 100644
--- a/test/functional/lua/treesitter_spec.lua
+++ b/test/functional/lua/treesitter_spec.lua
@@ -1,5 +1,6 @@
-- Test suite for testing interactions with API bindings
local helpers = require('test.functional.helpers')(after_each)
+local Screen = require('test.functional.ui.screen')
local clear = helpers.clear
local eq = helpers.eq
@@ -26,122 +27,371 @@ describe('treesitter API', function()
pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')"))
end)
+end)
+
+describe('treesitter API with C parser', function()
local ts_path = os.getenv("TREE_SITTER_DIR")
- describe('with C parser', function()
- if ts_path == nil then
- it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
- return
- end
-
- before_each(function()
- local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
- exec_lua([[
- local path = ...
- vim.treesitter.add_language(path,'c')
- ]], path)
- end)
-
- it('parses buffer', function()
- insert([[
- int main() {
- int x = 3;
- }]])
-
- exec_lua([[
- parser = vim.treesitter.get_parser(0, "c")
- tree = parser:parse()
- root = tree:root()
- lang = vim.treesitter.inspect_language('c')
- ]])
-
- eq("<tree>", exec_lua("return tostring(tree)"))
- eq("<node translation_unit>", exec_lua("return tostring(root)"))
- eq({0,0,3,0}, exec_lua("return {root:range()}"))
-
- eq(1, exec_lua("return root:child_count()"))
- exec_lua("child = root:child(0)")
- eq("<node function_definition>", exec_lua("return tostring(child)"))
- eq({0,0,2,1}, exec_lua("return {child:range()}"))
-
- eq("function_definition", exec_lua("return child:type()"))
- eq(true, exec_lua("return child:named()"))
- eq("number", type(exec_lua("return child:symbol()")))
- eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
-
- exec_lua("anon = root:descendant_for_range(0,8,0,9)")
- eq("(", exec_lua("return anon:type()"))
- eq(false, exec_lua("return anon:named()"))
- eq("number", type(exec_lua("return anon:symbol()")))
- eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
-
- exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
- eq("<node declaration>", exec_lua("return tostring(descendant)"))
- eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
- eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
-
- eq(true, exec_lua("return child == child"))
- -- separate lua object, but represents same node
- eq(true, exec_lua("return child == root:child(0)"))
- eq(false, exec_lua("return child == descendant2"))
- eq(false, exec_lua("return child == nil"))
- eq(false, exec_lua("return child == tree"))
-
- feed("2G7|ay")
- exec_lua([[
- tree2 = parser:parse()
- root2 = tree2:root()
- descendant2 = root2:descendant_for_range(1,2,1,13)
- ]])
- eq(false, exec_lua("return tree2 == tree1"))
- eq(false, exec_lua("return root2 == root"))
- eq("<node declaration>", exec_lua("return tostring(descendant2)"))
- eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
-
- -- orginal tree did not change
- eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
-
- -- unchanged buffer: return the same tree
- eq(true, exec_lua("return parser:parse() == tree2"))
- end)
-
- it('inspects language', function()
- local keys, fields, symbols = unpack(exec_lua([[
- local lang = vim.treesitter.inspect_language('c')
- local keys, symbols = {}, {}
- for k,_ in pairs(lang) do
- keys[k] = true
- end
-
- -- symbols array can have "holes" and is thus not a valid msgpack array
- -- but we don't care about the numbers here (checked in the parser test)
- for _, v in pairs(lang.symbols) do
- table.insert(symbols, v)
- end
- return {keys, lang.fields, symbols}
- ]]))
-
- eq({fields=true, symbols=true}, keys)
-
- local fset = {}
- for _,f in pairs(fields) do
- eq("string", type(f))
- fset[f] = true
+ -- The tests after this requires an actual parser
+ if ts_path == nil then
+ it("works", function() pending("TREE_SITTER_PATH not set, skipping treesitter parser tests") end)
+ return
+ end
+
+ before_each(function()
+ local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
+ exec_lua([[
+ local path = ...
+ vim.treesitter.add_language(path,'c')
+ ]], path)
+ end)
+
+ it('parses buffer', function()
+ insert([[
+ int main() {
+ int x = 3;
+ }]])
+
+ exec_lua([[
+ parser = vim.treesitter.get_parser(0, "c")
+ tree = parser:parse()
+ root = tree:root()
+ lang = vim.treesitter.inspect_language('c')
+ ]])
+
+ eq("<tree>", exec_lua("return tostring(tree)"))
+ eq("<node translation_unit>", exec_lua("return tostring(root)"))
+ eq({0,0,3,0}, exec_lua("return {root:range()}"))
+
+ eq(1, exec_lua("return root:child_count()"))
+ exec_lua("child = root:child(0)")
+ eq("<node function_definition>", exec_lua("return tostring(child)"))
+ eq({0,0,2,1}, exec_lua("return {child:range()}"))
+
+ eq("function_definition", exec_lua("return child:type()"))
+ eq(true, exec_lua("return child:named()"))
+ eq("number", type(exec_lua("return child:symbol()")))
+ eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
+
+ exec_lua("anon = root:descendant_for_range(0,8,0,9)")
+ eq("(", exec_lua("return anon:type()"))
+ eq(false, exec_lua("return anon:named()"))
+ eq("number", type(exec_lua("return anon:symbol()")))
+ eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
+
+ exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
+ eq("<node declaration>", exec_lua("return tostring(descendant)"))
+ eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
+ eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
+
+ eq(true, exec_lua("return child == child"))
+ -- separate lua object, but represents same node
+ eq(true, exec_lua("return child == root:child(0)"))
+ eq(false, exec_lua("return child == descendant2"))
+ eq(false, exec_lua("return child == nil"))
+ eq(false, exec_lua("return child == tree"))
+
+ feed("2G7|ay")
+ exec_lua([[
+ tree2 = parser:parse()
+ root2 = tree2:root()
+ descendant2 = root2:descendant_for_range(1,2,1,13)
+ ]])
+ eq(false, exec_lua("return tree2 == tree1"))
+ eq(false, exec_lua("return root2 == root"))
+ eq("<node declaration>", exec_lua("return tostring(descendant2)"))
+ eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
+
+ -- orginal tree did not change
+ eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
+
+ -- unchanged buffer: return the same tree
+ eq(true, exec_lua("return parser:parse() == tree2"))
+ end)
+
+ local test_text = [[
+void ui_refresh(void)
+{
+ int width = INT_MAX, height = INT_MAX;
+ bool ext_widgets[kUIExtCount];
+ for (UIExtension i = 0; (int)i < kUIExtCount; i++) {
+ ext_widgets[i] = true;
+ }
+
+ bool inclusive = ui_override();
+ for (size_t i = 0; i < ui_count; i++) {
+ UI *ui = uis[i];
+ width = MIN(ui->width, width);
+ height = MIN(ui->height, height);
+ foo = BAR(ui->bazaar, bazaar);
+ for (UIExtension j = 0; (int)j < kUIExtCount; j++) {
+ ext_widgets[j] &= (ui->ui_ext[j] || inclusive);
+ }
+ }
+}]]
+
+ local query = [[
+ ((call_expression function: (identifier) @minfunc (argument_list (identifier) @min_id)) (eq? @minfunc "MIN"))
+ "for" @keyword
+ (primitive_type) @type
+ (field_expression argument: (identifier) @fieldarg)
+ ]]
+
+ it('support query and iter by capture', function()
+ insert(test_text)
+
+ local res = exec_lua([[
+ cquery = vim.treesitter.parse_query("c", ...)
+ parser = vim.treesitter.get_parser(0, "c")
+ tree = parser:parse()
+ res = {}
+ for cid, node in cquery:iter_captures(tree:root(), 0, 7, 14) do
+ -- can't transmit node over RPC. just check the name and range
+ table.insert(res, {cquery.captures[cid], node:type(), node:range()})
+ end
+ return res
+ ]], query)
+
+ eq({
+ { "type", "primitive_type", 8, 2, 8, 6 },
+ { "keyword", "for", 9, 2, 9, 5 },
+ { "type", "primitive_type", 9, 7, 9, 13 },
+ { "minfunc", "identifier", 11, 12, 11, 15 },
+ { "fieldarg", "identifier", 11, 16, 11, 18 },
+ { "min_id", "identifier", 11, 27, 11, 32 },
+ { "minfunc", "identifier", 12, 13, 12, 16 },
+ { "fieldarg", "identifier", 12, 17, 12, 19 },
+ { "min_id", "identifier", 12, 29, 12, 35 },
+ { "fieldarg", "identifier", 13, 14, 13, 16 }
+ }, res)
+ end)
+
+ it('support query and iter by match', function()
+ insert(test_text)
+
+ local res = exec_lua([[
+ cquery = vim.treesitter.parse_query("c", ...)
+ parser = vim.treesitter.get_parser(0, "c")
+ tree = parser:parse()
+ res = {}
+ for pattern, match in cquery:iter_matches(tree:root(), 0, 7, 14) do
+ -- can't transmit node over RPC. just check the name and range
+ local mrepr = {}
+ for cid,node in pairs(match) do
+ table.insert(mrepr, {cquery.captures[cid], node:type(), node:range()})
end
- eq(true, fset["directive"])
- eq(true, fset["initializer"])
-
- local has_named, has_anonymous
- for _,s in pairs(symbols) do
- eq("string", type(s[1]))
- eq("boolean", type(s[2]))
- if s[1] == "for_statement" and s[2] == true then
- has_named = true
- elseif s[1] == "|=" and s[2] == false then
- has_anonymous = true
- end
+ table.insert(res, {pattern, mrepr})
+ end
+ return res
+ ]], query)
+
+ eq({
+ { 3, { { "type", "primitive_type", 8, 2, 8, 6 } } },
+ { 2, { { "keyword", "for", 9, 2, 9, 5 } } },
+ { 3, { { "type", "primitive_type", 9, 7, 9, 13 } } },
+ { 4, { { "fieldarg", "identifier", 11, 16, 11, 18 } } },
+ { 1, { { "minfunc", "identifier", 11, 12, 11, 15 }, { "min_id", "identifier", 11, 27, 11, 32 } } },
+ { 4, { { "fieldarg", "identifier", 12, 17, 12, 19 } } },
+ { 1, { { "minfunc", "identifier", 12, 13, 12, 16 }, { "min_id", "identifier", 12, 29, 12, 35 } } },
+ { 4, { { "fieldarg", "identifier", 13, 14, 13, 16 } } }
+ }, res)
+ end)
+
+ it('supports highlighting', function()
+ local hl_text = [[
+/// Schedule Lua callback on main loop's event queue
+static int nlua_schedule(lua_State *const lstate)
+{
+ if (lua_type(lstate, 1) != LUA_TFUNCTION
+ || lstate != lstate) {
+ lua_pushliteral(lstate, "vim.schedule: expected function");
+ return lua_error(lstate);
+ }
+
+ LuaRef cb = nlua_ref(lstate, 1);
+
+ multiqueue_put(main_loop.events, nlua_schedule_event,
+ 1, (void *)(ptrdiff_t)cb);
+ return 0;
+}]]
+
+ local hl_query = [[
+(ERROR) @ErrorMsg
+
+"if" @keyword
+"else" @keyword
+"for" @keyword
+"return" @keyword
+
+"const" @type
+"static" @type
+"struct" @type
+"enum" @type
+"extern" @type
+
+(string_literal) @string
+
+(number_literal) @number
+(char_literal) @string
+
+; TODO(bfredl): overlapping matches are unreliable,
+; we need a proper priority mechanism
+;(type_identifier) @type
+((type_identifier) @Special (eq? @Special "LuaRef"))
+
+(primitive_type) @type
+(sized_type_specifier) @type
+
+((binary_expression left: (identifier) @WarningMsg.left right: (identifier) @WarningMsg.right) (eq? @WarningMsg.left @WarningMsg.right))
+
+(comment) @comment
+]]
+
+ local screen = Screen.new(65, 18)
+ screen:attach()
+ screen:set_default_attr_ids({
+ [1] = {bold = true, foreground = Screen.colors.Blue1},
+ [2] = {foreground = Screen.colors.Blue1},
+ [3] = {bold = true, foreground = Screen.colors.SeaGreen4},
+ [4] = {bold = true, foreground = Screen.colors.Brown},
+ [5] = {foreground = Screen.colors.Magenta},
+ [6] = {foreground = Screen.colors.Red},
+ [7] = {foreground = Screen.colors.SlateBlue},
+ [8] = {foreground = Screen.colors.Grey100, background = Screen.colors.Red},
+ [9] = {foreground = Screen.colors.Magenta, background = Screen.colors.Red},
+ [10] = {foreground = Screen.colors.Red, background = Screen.colors.Red},
+
+ })
+
+ insert(hl_text)
+ screen:expect{grid=[[
+ /// Schedule Lua callback on main loop's event queue |
+ static int nlua_schedule(lua_State *const lstate) |
+ { |
+ if (lua_type(lstate, 1) != LUA_TFUNCTION |
+ || lstate != lstate) { |
+ lua_pushliteral(lstate, "vim.schedule: expected function"); |
+ return lua_error(lstate); |
+ } |
+ |
+ LuaRef cb = nlua_ref(lstate, 1); |
+ |
+ multiqueue_put(main_loop.events, nlua_schedule_event, |
+ 1, (void *)(ptrdiff_t)cb); |
+ return 0; |
+ ^} |
+ {1:~ }|
+ {1:~ }|
+ |
+ ]]}
+
+ exec_lua([[
+ local TSHighlighter = vim.treesitter.TSHighlighter
+ local query = ...
+ test_hl = TSHighlighter.new(query, 0, "c")
+ ]], hl_query)
+ screen:expect{grid=[[
+ {2:/// Schedule Lua callback on main loop's event queue} |
+ {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
+ { |
+ {4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
+ || {6:lstate} != {6:lstate}) { |
+ lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
+ {4:return} lua_error(lstate); |
+ } |
+ |
+ {7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
+ |
+ multiqueue_put(main_loop.events, nlua_schedule_event, |
+ {5:1}, ({3:void} *)(ptrdiff_t)cb); |
+ {4:return} {5:0}; |
+ ^} |
+ {1:~ }|
+ {1:~ }|
+ |
+ ]]}
+
+ feed('7Go*/<esc>')
+ screen:expect{grid=[[
+ {2:/// Schedule Lua callback on main loop's event queue} |
+ {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
+ { |
+ {4:if} (lua_type(lstate, {5:1}) != LUA_TFUNCTION |
+ || {6:lstate} != {6:lstate}) { |
+ lua_pushliteral(lstate, {5:"vim.schedule: expected function"}); |
+ {4:return} lua_error(lstate); |
+ {8:*^/} |
+ } |
+ |
+ {7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
+ |
+ multiqueue_put(main_loop.events, nlua_schedule_event, |
+ {5:1}, ({3:void} *)(ptrdiff_t)cb); |
+ {4:return} {5:0}; |
+ } |
+ {1:~ }|
+ |
+ ]]}
+
+ feed('3Go/*<esc>')
+ screen:expect{grid=[[
+ {2:/// Schedule Lua callback on main loop's event queue} |
+ {3:static} {3:int} nlua_schedule(lua_State *{3:const} lstate) |
+ { |
+ {2:/^*} |
+ {2: if (lua_type(lstate, 1) != LUA_TFUNCTION} |
+ {2: || lstate != lstate) {} |
+ {2: lua_pushliteral(lstate, "vim.schedule: expected function");} |
+ {2: return lua_error(lstate);} |
+ {2:*/} |
+ } |
+ |
+ {7:LuaRef} cb = nlua_ref(lstate, {5:1}); |
+ |
+ multiqueue_put(main_loop.events, nlua_schedule_event, |
+ {5:1}, ({3:void} *)(ptrdiff_t)cb); |
+ {4:return} {5:0}; |
+ {8:}} |
+ |
+ ]]}
+ end)
+
+ it('inspects language', function()
+ local keys, fields, symbols = unpack(exec_lua([[
+ local lang = vim.treesitter.inspect_language('c')
+ local keys, symbols = {}, {}
+ for k,_ in pairs(lang) do
+ keys[k] = true
+ end
+
+ -- symbols array can have "holes" and is thus not a valid msgpack array
+ -- but we don't care about the numbers here (checked in the parser test)
+ for _, v in pairs(lang.symbols) do
+ table.insert(symbols, v)
+ end
+ return {keys, lang.fields, symbols}
+ ]]))
+
+ eq({fields=true, symbols=true}, keys)
+
+ local fset = {}
+ for _,f in pairs(fields) do
+ eq("string", type(f))
+ fset[f] = true
+ end
+ eq(true, fset["directive"])
+ eq(true, fset["initializer"])
+
+ local has_named, has_anonymous
+ for _,s in pairs(symbols) do
+ eq("string", type(s[1]))
+ eq("boolean", type(s[2]))
+ if s[1] == "for_statement" and s[2] == true then
+ has_named = true
+ elseif s[1] == "|=" and s[2] == false then
+ has_anonymous = true
end
- eq({true,true}, {has_named,has_anonymous})
- end)
+ end
+ eq({true,true}, {has_named,has_anonymous})
end)
end)