diff options
47 files changed, 9302 insertions, 3 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index f9bd87c085..8a4b21f4f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -375,6 +375,13 @@ include_directories(SYSTEM ${MSGPACK_INCLUDE_DIRS})  find_package(LibLUV 1.30.0 REQUIRED)  include_directories(SYSTEM ${LIBLUV_INCLUDE_DIRS}) +find_package(Utf8proc REQUIRED) +include_directories(SYSTEM ${UTF8PROC_INCLUDE_DIRS}) +if(WIN32) +  add_definitions(-DUTF8PROC_STATIC) +endif() + +  # Note: The test lib requires LuaJIT; it will be skipped if LuaJIT is missing.  option(PREFER_LUA "Prefer Lua over LuaJIT in the nvim executable." OFF) diff --git a/ci/build.ps1 b/ci/build.ps1 index d533d7b4e0..4e1a69376b 100644 --- a/ci/build.ps1 +++ b/ci/build.ps1 @@ -94,6 +94,28 @@ npm.cmd install -g neovim  Get-Command -CommandType Application neovim-node-host.cmd  npm.cmd link neovim +#npm.cmd install -g tree-sitter-cli +#npm.cmd link tree-sitter-cli + +mkdir c:\treesitter +$env:TREE_SITTER_DIR = "c:\treesitter" +#$env:PATH = "c:\treesitter;$env:PATH" +$client = new-object System.Net.WebClient +cd c:\treesitter + +if ($bits -eq 32) { +  $client.DownloadFile("https://github.com/tree-sitter/tree-sitter/releases/download/0.15.5/tree-sitter-windows-x86.gz","c:\treesitter\tree-sitter-cli.gz") +} +elseif ($bits -eq 64) { +  $client.DownloadFile("https://github.com/tree-sitter/tree-sitter/releases/download/0.15.5/tree-sitter-windows-x64.gz","c:\treesitter\tree-sitter-cli.gz") +} +python -c "import gzip, shutil; f1,f2 = gzip.open('tree-sitter-cli.gz', 'rb'),  open('tree-sitter.exe', 'wb'); shutil.copyfileobj(f1, f2); f2.close()" + +$client.DownloadFile("https://codeload.github.com/tree-sitter/tree-sitter-c/zip/v0.15.2","c:\treesitter\tree_sitter_c.zip") +Expand-Archive c:\treesitter\tree_sitter_c.zip -DestinationPath c:\treesitter\ +cd c:\treesitter\tree-sitter-c-0.15.2 +c:\treesitter\tree-sitter.exe test +  function convertToCmakeArgs($vars) {    return $vars.GetEnumerator() | foreach { "-D$($_.Key)=$($_.Value)" }  } diff --git 
a/ci/install.sh b/ci/install.sh index cda9a11f08..b96cf3c073 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -24,3 +24,23 @@ gem install --no-document --version ">= 0.8.0" neovim  echo "Install neovim npm package"  npm install -g neovim  npm link neovim + +echo "Install tree-sitter npm package" +npm install -g tree-sitter-cli +npm link tree-sitter-cli + +echo "Install tree-sitter c parser" +curl "https://codeload.github.com/tree-sitter/tree-sitter-c/tar.gz/v0.15.2" -o tree_sitter_c.tar.gz +tar xf tree_sitter_c.tar.gz +cd tree-sitter-c-0.15.2 +export TREE_SITTER_DIR=$HOME/tree-sitter-build/ +mkdir -p $TREE_SITTER_DIR/bin + +if [[ "$BUILD_32BIT" != "ON" ]]; then +  # builds c parser in $HOME/tree-sitter-build/bin/c.(so|dylib) +  tree-sitter test +else +  # no tree-sitter binary for 32bit linux, so fake it (no tree-sitter unit tests) +  cd src/ +  gcc -m32 -o $TREE_SITTER_DIR/bin/c.so -shared parser.c -I. +fi diff --git a/ci/run_tests.sh b/ci/run_tests.sh index c175910da5..6b2f69293c 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -19,6 +19,8 @@ exit_suite --continue  enter_suite tests +export TREE_SITTER_DIR=$HOME/tree-sitter-build/ +  if test "$CLANG_SANITIZER" != "TSAN" ; then    # Additional threads are only created when the builtin UI starts, which    # doesn't happen in the unit/functional tests diff --git a/cmake/FindUtf8proc.cmake b/cmake/FindUtf8proc.cmake new file mode 100644 index 0000000000..dc4f7016a1 --- /dev/null +++ b/cmake/FindUtf8proc.cmake @@ -0,0 +1,54 @@ +# - Try to find utf8proc +# Once done this will define +#  UTF8PROC_FOUND - System has utf8proc +#  UTF8PROC_INCLUDE_DIRS - The utf8proc include directories +#  UTF8PROC_LIBRARIES - The libraries needed to use utf8proc + +if(NOT USE_BUNDLED_UTF8PROC) +  find_package(PkgConfig) +  if (PKG_CONFIG_FOUND) +      pkg_check_modules(PC_UTF8PROC QUIET utf8proc) +  endif() +else() +  set(PC_UTF8PROC_INCLUDEDIR) +  set(PC_UTF8PROC_INCLUDE_DIRS) +  set(PC_UTF8PROC_LIBDIR) +  
set(PC_UTF8PROC_LIBRARY_DIRS) +  set(LIMIT_SEARCH NO_DEFAULT_PATH) +endif() + +set(UTF8PROC_DEFINITIONS ${PC_UTF8PROC_CFLAGS_OTHER}) + +find_path(UTF8PROC_INCLUDE_DIR utf8proc.h +          PATHS ${PC_UTF8PROC_INCLUDEDIR} ${PC_UTF8PROC_INCLUDE_DIRS} +          ${LIMIT_SEARCH}) + +# If we're asked to use static linkage, add libutf8proc.a as a preferred library name. +if(UTF8PROC_USE_STATIC) +  list(APPEND UTF8PROC_NAMES +    "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${CMAKE_STATIC_LIBRARY_SUFFIX}") +if(MSVC) +  list(APPEND UTF8PROC_NAMES +    "${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc_static${CMAKE_STATIC_LIBRARY_SUFFIX}") +endif() +endif() + +list(APPEND UTF8PROC_NAMES utf8proc) +if(MSVC) +  list(APPEND UTF8PROC_NAMES utf8proc_static) +endif() + +find_library(UTF8PROC_LIBRARY NAMES ${UTF8PROC_NAMES} +  HINTS ${PC_UTF8PROC_LIBDIR} ${PC_UTF8PROC_LIBRARY_DIRS} +  ${LIMIT_SEARCH}) + +set(UTF8PROC_LIBRARIES ${UTF8PROC_LIBRARY}) +set(UTF8PROC_INCLUDE_DIRS ${UTF8PROC_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set UTF8PROC_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(Utf8proc DEFAULT_MSG +  UTF8PROC_LIBRARY UTF8PROC_INCLUDE_DIR) + +mark_as_advanced(UTF8PROC_INCLUDE_DIR UTF8PROC_LIBRARY) diff --git a/codecov.yml b/codecov.yml index a83fd916ee..0f867db668 100644 --- a/codecov.yml +++ b/codecov.yml @@ -25,3 +25,6 @@ coverage:      changes: no  comment: off + +ignore: +    - "src/tree_sitter" diff --git a/runtime/doc/if_lua.txt b/runtime/doc/if_lua.txt index aa2d0a03c6..0ba35aeae6 100644 --- a/runtime/doc/if_lua.txt +++ b/runtime/doc/if_lua.txt @@ -446,6 +446,112 @@ Example: TCP echo-server				*tcp-server*      print('TCP echo-server listening on port: '..server:getsockname().port)  ------------------------------------------------------------------------------ +VIM.TREESITTER						*lua-treesitter* + +Nvim integrates the tree-sitter library for incremental parsing of buffers. 
+ +Currently Nvim does not provide the tree-sitter parsers; instead these must +be built separately, for instance using the tree-sitter utility. +The parser is loaded into Nvim using > + +    vim.treesitter.add_language("/path/to/c_parser.so", "c") + +<Create a parser for a buffer and a given language (if another plugin uses the +same buffer/language combination, it will be safely reused). Use > + +    parser = vim.treesitter.get_parser(bufnr, lang) + +<`bufnr=0` can be used for current buffer. `lang` will default to 'filetype' (this +doesn't work yet for some filetypes like "cpp"). Currently, the parser will be +retained for the lifetime of a buffer but this is subject to change. A plugin +should keep a reference to the parser object as long as it wants incremental +updates. + +Whenever you need to access the current syntax tree, parse the buffer: > + +    tstree = parser:parse() + +<This will return an immutable tree that represents the current state of the +buffer. When the plugin wants to access the state after a (possible) edit +it should call `parse()` again. If the buffer wasn't edited, the same tree will +be returned again without extra work. If the buffer was parsed before, +the changed parts will be parsed incrementally. + +NB: to use the parser directly inside a |nvim_buf_attach| lua callback, you must +call `get_parser()` before you register your callback. But preferably parsing +shouldn't be done directly in the change callback anyway as they will be very +frequent. Rather, a plugin that does any kind of analysis on a tree should use +a timer to throttle too frequent updates. + +Tree methods						*lua-treesitter-tree* + +tstree:root()						*tstree:root()* +	Return the root node of this tree. + + +Node methods						*lua-treesitter-node* + +tsnode:parent()						*tsnode:parent()* +	Get the node's immediate parent. + +tsnode:child_count()					*tsnode:child_count()* +	Get the node's number of children. 
+ +tsnode:child(N)						*tsnode:child()* +	Get the node's child at the given index, where zero represents the +	first child. + +tsnode:named_child_count()			*tsnode:named_child_count()* +	Get the node's number of named children. + +tsnode:named_child(N)					*tsnode:named_child()* +	Get the node's named child at the given index, where zero represents +	the first named child. + +tsnode:start()						*tsnode:start()* +	Get the node's start position. Return three values: the row, column +	and total byte count (all zero-based). + +tsnode:end_()						*tsnode:end_()* +	Get the node's end position. Return three values: the row, column +	and total byte count (all zero-based). + +tsnode:range()						*tsnode:range()* +	Get the range of the node. Return four values: the row, column +	of the start position, then the row, column of the end position. + +tsnode:type()						*tsnode:type()* +	Get the node's type as a string. + +tsnode:symbol()						*tsnode:symbol()* +	Get the node's type as a numerical id. + +tsnode:named()						*tsnode:named()* +	Check if the node is named. Named nodes correspond to named rules in +	the  grammar, whereas anonymous nodes correspond to string literals +	in the grammar. + +tsnode:missing()					*tsnode:missing()* +	Check if the node is missing. Missing nodes are inserted by the +	parser in order to recover from certain kinds of syntax errors. + +tsnode:has_error()					*tsnode:has_error()* +	Check if the node is a syntax error or contains any syntax errors. + +tsnode:sexpr()						*tsnode:sexpr()* +	Get an S-expression representing the node as a string. 
+ +tsnode:descendant_for_range(start_row, start_col, end_row, end_col) +						*tsnode:descendant_for_range()* +	Get the smallest node within this node that spans the given range of +	(row, column) positions + +tsnode:named_descendant_for_range(start_row, start_col, end_row, end_col) +					*tsnode:named_descendant_for_range()* +	Get the smallest named node within this node that spans the given +	range of (row, column) positions + +------------------------------------------------------------------------------  VIM							*lua-util*  vim.in_fast_event()					*vim.in_fast_event()* diff --git a/runtime/lua/vim/treesitter.lua b/runtime/lua/vim/treesitter.lua new file mode 100644 index 0000000000..e0202927bb --- /dev/null +++ b/runtime/lua/vim/treesitter.lua @@ -0,0 +1,73 @@ +local a = vim.api + +-- TODO(bfredl): currently we retain parsers for the lifetime of the buffer. +-- Consider use weak references to release parser if all plugins are done with +-- it. +local parsers = {} + +local Parser = {} +Parser.__index = Parser + +function Parser:parse() +  if self.valid then +    return self.tree +  end +  self.tree = self._parser:parse_buf(self.bufnr) +  self.valid = true +  return self.tree +end + +function Parser:_on_lines(bufnr, _, start_row, old_stop_row, stop_row, old_byte_size) +  local start_byte = a.nvim_buf_get_offset(bufnr,start_row) +  local stop_byte = a.nvim_buf_get_offset(bufnr,stop_row) +  local old_stop_byte = start_byte + old_byte_size +  self._parser:edit(start_byte,old_stop_byte,stop_byte, +                    start_row,0,old_stop_row,0,stop_row,0) +  self.valid = false +end + +local module = { +  add_language=vim._ts_add_language, +  inspect_language=vim._ts_inspect_language, +} + +function module.create_parser(bufnr, ft, id) +  if bufnr == 0 then +    bufnr = a.nvim_get_current_buf() +  end +  local self = setmetatable({bufnr=bufnr, valid=false}, Parser) +  self._parser = vim._create_ts_parser(ft) +  self:parse() +    -- TODO(bfredl): use weakref to self, 
so that the parser is freed if no plugin is +    -- using it. +  local function lines_cb(_, ...) +    return self:_on_lines(...) +  end +  local detach_cb = nil +  if id ~= nil then +    detach_cb = function() +      if parsers[id] == self then +        parsers[id] = nil +      end +    end +  end +  a.nvim_buf_attach(self.bufnr, false, {on_lines=lines_cb, on_detach=detach_cb}) +  return self +end + +function module.get_parser(bufnr, ft) +  if bufnr == nil or bufnr == 0 then +    bufnr = a.nvim_get_current_buf() +  end +  if ft == nil then +    ft = a.nvim_buf_get_option(bufnr, "filetype") +  end +  local id = tostring(bufnr)..'_'..ft + +  if parsers[id] == nil then +    parsers[id] = module.create_parser(bufnr, ft, id) +  end +  return parsers[id] +end + +return module diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt index aa8100873b..27977e3a40 100644 --- a/src/nvim/CMakeLists.txt +++ b/src/nvim/CMakeLists.txt @@ -85,6 +85,10 @@ file(GLOB NVIM_HEADERS *.h)  file(GLOB XDIFF_SOURCES xdiff/*.c)  file(GLOB XDIFF_HEADERS xdiff/*.h) +file(GLOB TREESITTER_SOURCES ../tree_sitter/*.c) +file(GLOB TS_SOURCE_AMALGAM ../tree_sitter/lib.c) +list(REMOVE_ITEM TREESITTER_SOURCES ${TS_SOURCE_AMALGAM}) +  foreach(subdir          os          api @@ -141,6 +145,7 @@ set(CONV_SOURCES    ex_cmds.c    ex_docmd.c    fileio.c +  lua/treesitter.c    mbyte.c    memline.c    message.c @@ -172,6 +177,9 @@ if(NOT MSVC)      set_source_files_properties(        eval.c PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -Wno-conversion")    endif() + +  # tree-sitter: inlined external project, we don't maintain it. 
#10124 +  set_source_files_properties(${TREESITTER_SOURCES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -Wno-conversion -Wno-pedantic -Wno-shadow -Wno-missing-prototypes -Wno-unused-variable")  endif()  if(NOT "${MIN_LOG_LEVEL}" MATCHES "^$") @@ -395,6 +403,7 @@ list(APPEND NVIM_LINK_LIBRARIES    ${LIBVTERM_LIBRARIES}    ${LIBTERMKEY_LIBRARIES}    ${UNIBILIUM_LIBRARIES} +  ${UTF8PROC_LIBRARIES}    ${CMAKE_THREAD_LIBS_INIT}  ) @@ -414,7 +423,7 @@ endif()  add_executable(nvim ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS}    ${NVIM_GENERATED_SOURCES} ${NVIM_SOURCES} ${NVIM_HEADERS} -  ${XDIFF_SOURCES} ${XDIFF_HEADERS}) +  ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES})  target_link_libraries(nvim ${NVIM_EXEC_LINK_LIBRARIES})  install_helper(TARGETS nvim) @@ -500,7 +509,7 @@ add_library(    EXCLUDE_FROM_ALL    ${NVIM_SOURCES} ${NVIM_GENERATED_SOURCES}    ${NVIM_HEADERS} ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS} -  ${XDIFF_SOURCES} ${XDIFF_HEADERS} +  ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES}  )  set_property(TARGET libnvim APPEND PROPERTY               INCLUDE_DIRECTORIES ${LUA_PREFERRED_INCLUDE_DIRS}) @@ -525,7 +534,7 @@ else()      EXCLUDE_FROM_ALL      ${NVIM_SOURCES} ${NVIM_GENERATED_SOURCES}      ${NVIM_HEADERS} ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS} -    ${XDIFF_SOURCES} ${XDIFF_HEADERS} +    ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES}      ${UNIT_TEST_FIXTURES}    )    target_link_libraries(nvim-test ${NVIM_TEST_LINK_LIBRARIES}) diff --git a/src/nvim/lua/executor.c b/src/nvim/lua/executor.c index f51aa3c6d4..127458fe39 100644 --- a/src/nvim/lua/executor.c +++ b/src/nvim/lua/executor.c @@ -31,6 +31,7 @@  #include "nvim/lua/executor.h"  #include "nvim/lua/converter.h" +#include "nvim/lua/treesitter.h"  #include "luv/luv.h" @@ -310,7 +311,11 @@ static int nlua_state_init(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL    lua_setfield(lstate, -2, "luv");    lua_pop(lstate, 3); +  
// internal vim._treesitter... API +  nlua_add_treesitter(lstate); +    lua_setglobal(lstate, "vim"); +    return 0;  } @@ -816,3 +821,27 @@ void ex_luafile(exarg_T *const eap)      return;    }  } + +static int create_tslua_parser(lua_State *L) +{ +  if (lua_gettop(L) < 1 || !lua_isstring(L, 1)) { +    return luaL_error(L, "string expected"); +  } + +  const char *lang_name = lua_tostring(L, 1); +  return tslua_push_parser(L, lang_name); +} + +static void nlua_add_treesitter(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL +{ +  tslua_init(lstate); + +  lua_pushcfunction(lstate, create_tslua_parser); +  lua_setfield(lstate, -2, "_create_ts_parser"); + +  lua_pushcfunction(lstate, tslua_register_lang); +  lua_setfield(lstate, -2, "_ts_add_language"); + +  lua_pushcfunction(lstate, tslua_inspect_lang); +  lua_setfield(lstate, -2, "_ts_inspect_language"); +} diff --git a/src/nvim/lua/treesitter.c b/src/nvim/lua/treesitter.c new file mode 100644 index 0000000000..d2072402bb --- /dev/null +++ b/src/nvim/lua/treesitter.c @@ -0,0 +1,652 @@ +// This is an open source non-commercial project. Dear PVS-Studio, please check +// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com + +// lua bindings for tree-sitter. 
+// NB: this file mostly contains a generic lua interface for tree-sitter +// trees and nodes, and could be broken out as a reusable lua package + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <inttypes.h> +#include <assert.h> + +#include <lua.h> +#include <lualib.h> +#include <lauxlib.h> + +#include "tree_sitter/api.h" + +#include "nvim/lua/treesitter.h" +#include "nvim/api/private/handle.h" +#include "nvim/memline.h" + +typedef struct { +  TSParser *parser; +  TSTree *tree;  // internal tree, used for editing/reparsing +} TSLua_parser; + +#ifdef INCLUDE_GENERATED_DECLARATIONS +# include "lua/treesitter.c.generated.h" +#endif + +static struct luaL_Reg parser_meta[] = { +  { "__gc", parser_gc }, +  { "__tostring", parser_tostring }, +  { "parse_buf", parser_parse_buf }, +  { "edit", parser_edit }, +  { "tree", parser_tree }, +  { NULL, NULL } +}; + +static struct luaL_Reg tree_meta[] = { +  { "__gc", tree_gc }, +  { "__tostring", tree_tostring }, +  { "root", tree_root }, +  { NULL, NULL } +}; + +static struct luaL_Reg node_meta[] = { +  { "__tostring", node_tostring }, +  { "__eq", node_eq }, +  { "__len", node_child_count }, +  { "range", node_range }, +  { "start", node_start }, +  { "end_", node_end }, +  { "type", node_type }, +  { "symbol", node_symbol }, +  { "named", node_named }, +  { "missing", node_missing }, +  { "has_error", node_has_error }, +  { "sexpr", node_sexpr }, +  { "child_count", node_child_count }, +  { "named_child_count", node_named_child_count }, +  { "child", node_child }, +  { "named_child", node_named_child }, +  { "descendant_for_range", node_descendant_for_range }, +  { "named_descendant_for_range", node_named_descendant_for_range }, +  { "parent", node_parent }, +  { NULL, NULL } +}; + +static PMap(cstr_t) *langs; + +static void build_meta(lua_State *L, const char *tname, const luaL_Reg *meta) +{ +  if (luaL_newmetatable(L, tname)) {  // [meta] +    for (size_t i = 0; meta[i].name != NULL; i++) { +      
lua_pushcfunction(L, meta[i].func);  // [meta, func] +      lua_setfield(L, -2, meta[i].name);  // [meta] +    } + +    lua_pushvalue(L, -1);  // [meta, meta] +    lua_setfield(L, -2, "__index");  // [meta] +  } +  lua_pop(L, 1);  // [] (don't use it now) +} + +/// init the tslua library +/// +/// all global state is stored in the regirstry of the lua_State +void tslua_init(lua_State *L) +{ +  langs = pmap_new(cstr_t)(); + +  // type metatables +  build_meta(L, "treesitter_parser", parser_meta); +  build_meta(L, "treesitter_tree", tree_meta); +  build_meta(L, "treesitter_node", node_meta); +} + +int tslua_register_lang(lua_State *L) +{ +  if (lua_gettop(L) < 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2)) { +    return luaL_error(L, "string expected"); +  } + +  const char *path = lua_tostring(L, 1); +  const char *lang_name = lua_tostring(L, 2); + +  if (pmap_has(cstr_t)(langs, lang_name)) { +    return 0; +  } + +#define BUFSIZE 128 +  char symbol_buf[BUFSIZE]; +  snprintf(symbol_buf, BUFSIZE, "tree_sitter_%s", lang_name); +#undef BUFSIZE + +  uv_lib_t lib; +  if (uv_dlopen(path, &lib)) { +    snprintf((char *)IObuff, IOSIZE, "Failed to load parser: uv_dlopen: %s", +             uv_dlerror(&lib)); +    uv_dlclose(&lib); +    lua_pushstring(L, (char *)IObuff); +    return lua_error(L); +  } + +  TSLanguage *(*lang_parser)(void); +  if (uv_dlsym(&lib, symbol_buf, (void **)&lang_parser)) { +    snprintf((char *)IObuff, IOSIZE, "Failed to load parser: uv_dlsym: %s", +             uv_dlerror(&lib)); +    uv_dlclose(&lib); +    lua_pushstring(L, (char *)IObuff); +    return lua_error(L); +  } + +  TSLanguage *lang = lang_parser(); +  if (lang == NULL) { +    return luaL_error(L, "Failed to load parser: internal error"); +  } + +  pmap_put(cstr_t)(langs, xstrdup(lang_name), lang); + +  lua_pushboolean(L, true); +  return 1; +} + +int tslua_inspect_lang(lua_State *L) +{ +  if (lua_gettop(L) < 1 || !lua_isstring(L, 1)) { +    return luaL_error(L, "string expected"); +  } 
+  const char *lang_name = lua_tostring(L, 1); + +  TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name); +  if (!lang) { +    return luaL_error(L, "no such language: %s", lang_name); +  } + +  lua_createtable(L, 0, 2);  // [retval] + +  size_t nsymbols = (size_t)ts_language_symbol_count(lang); + +  lua_createtable(L, nsymbols-1, 1);  // [retval, symbols] +  for (size_t i = 0; i < nsymbols; i++) { +    TSSymbolType t = ts_language_symbol_type(lang, i); +    if (t == TSSymbolTypeAuxiliary) { +      // not used by the API +      continue; +    } +    lua_createtable(L, 2, 0);  // [retval, symbols, elem] +    lua_pushstring(L, ts_language_symbol_name(lang, i)); +    lua_rawseti(L, -2, 1); +    lua_pushboolean(L, t == TSSymbolTypeRegular); +    lua_rawseti(L, -2, 2);  // [retval, symbols, elem] +    lua_rawseti(L, -2, i);  // [retval, symbols] +  } + +  lua_setfield(L, -2, "symbols");  // [retval] + +  size_t nfields = (size_t)ts_language_field_count(lang); +  lua_createtable(L, nfields-1, 1);  // [retval, fields] +  for (size_t i = 0; i < nfields; i++) { +    lua_pushstring(L, ts_language_field_name_for_id(lang, i)); +    lua_rawseti(L, -2, i);  // [retval, fields] +  } + +  lua_setfield(L, -2, "fields");  // [retval] +  return 1; +} + +int tslua_push_parser(lua_State *L, const char *lang_name) +{ +  TSLanguage *lang = pmap_get(cstr_t)(langs, lang_name); +  if (!lang) { +    return luaL_error(L, "no such language: %s", lang_name); +  } + +  TSParser *parser = ts_parser_new(); +  ts_parser_set_language(parser, lang); +  TSLua_parser *p = lua_newuserdata(L, sizeof(TSLua_parser));  // [udata] +  p->parser = parser; +  p->tree = NULL; + +  lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_parser");  // [udata, meta] +  lua_setmetatable(L, -2);  // [udata] +  return 1; +} + +static TSLua_parser *parser_check(lua_State *L) +{ +  return luaL_checkudata(L, 1, "treesitter_parser"); +} + +static int parser_gc(lua_State *L) +{ +  TSLua_parser *p = parser_check(L); +  if (!p) { +   
 return 0; +  } + +  ts_parser_delete(p->parser); +  if (p->tree) { +    ts_tree_delete(p->tree); +  } + +  return 0; +} + +static int parser_tostring(lua_State *L) +{ +  lua_pushstring(L, "<parser>"); +  return 1; +} + +/// Read callback handed to tree-sitter via TSInput. +/// +/// Copies the text of buffer line `position.row` starting at byte +/// `position.column` into a static chunk of at most BUFSIZE bytes, and stores +/// the number of bytes provided in `*bytes_read` (0 when there is nothing to +/// read at that position). `byte_index` is not used. +static const char *input_cb(void *payload, uint32_t byte_index, +                            TSPoint position, uint32_t *bytes_read) +{ +  buf_T *bp  = payload; +#define BUFSIZE 256 +  static char buf[BUFSIZE]; + +  if ((linenr_T)position.row >= bp->b_ml.ml_line_count) { +    *bytes_read = 0; +    return ""; +  } +  char_u *line = ml_get_buf(bp, position.row+1, false); +  size_t len = STRLEN(line); +  // A column past the end of the line would make the unsigned subtraction +  // below wrap around and memcpy() read out of bounds. +  if (position.column > len) { +    *bytes_read = 0; +    return ""; +  } +  size_t tocopy = MIN(len-position.column, BUFSIZE); + +  memcpy(buf, line+position.column, tocopy); +  // Translate embedded \n to NUL +  memchrsub(buf, '\n', '\0', tocopy); +  *bytes_read = (uint32_t)tocopy; +  if (tocopy < BUFSIZE) { +    // now add the final \n. If it didn't fit, input_cb will be called again +    // on the same line with advanced column. +    buf[tocopy] = '\n'; +    (*bytes_read)++; +  } +  return buf; +#undef BUFSIZE +} + +static int parser_parse_buf(lua_State *L) +{ +  TSLua_parser *p = parser_check(L); +  if (!p) { +    return 0; +  } + +  long bufnr = lua_tointeger(L, 2); +  void *payload = handle_get_buffer(bufnr); +  if (!payload) { +    // "%d" consumes an int, so don't pass the long through varargs as-is +    return luaL_error(L, "invalid buffer handle: %d", (int)bufnr); +  } +  TSInput input = { payload, input_cb, TSInputEncodingUTF8 }; +  TSTree *new_tree = ts_parser_parse(p->parser, p->tree, input); +  if (p->tree) { +    ts_tree_delete(p->tree); +  } +  p->tree = new_tree; + +  tslua_push_tree(L, p->tree); +  return 1; +} + +static int parser_tree(lua_State *L) +{ +  TSLua_parser *p = parser_check(L); +  if (!p) { +    return 0; +  } + +  tslua_push_tree(L, p->tree); +  return 1; +} + +static int parser_edit(lua_State *L) +{ +  if (lua_gettop(L) < 10) { +    lua_pushstring(L, "not enough args to parser:edit()"); +    return lua_error(L); +  } + +  TSLua_parser *p = parser_check(L); +  if (!p) { +  
  return 0; +  } + +  if (!p->tree) { +    return 0; +  } + +  long start_byte = lua_tointeger(L, 2); +  long old_end_byte = lua_tointeger(L, 3); +  long new_end_byte = lua_tointeger(L, 4); +  TSPoint start_point = { lua_tointeger(L, 5), lua_tointeger(L, 6) }; +  TSPoint old_end_point = { lua_tointeger(L, 7), lua_tointeger(L, 8) }; +  TSPoint new_end_point = { lua_tointeger(L, 9), lua_tointeger(L, 10) }; + +  TSInputEdit edit = { start_byte, old_end_byte, new_end_byte, +                       start_point, old_end_point, new_end_point }; + +  ts_tree_edit(p->tree, &edit); + +  return 0; +} + + +// Tree methods + +/// push tree interface on lua stack. +/// +/// This makes a copy of the tree, so ownership of the argument is unaffected. +void tslua_push_tree(lua_State *L, TSTree *tree) +{ +  if (tree == NULL) { +    lua_pushnil(L); +    return; +  } +  TSTree **ud = lua_newuserdata(L, sizeof(TSTree *));  // [udata] +  *ud = ts_tree_copy(tree); +  lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_tree");  // [udata, meta] +  lua_setmetatable(L, -2);  // [udata] + +  // table used for node wrappers to keep a reference to tree wrapper +  // NB: in lua 5.3 the uservalue for the node could just be the tree, but +  // in lua 5.1 the uservalue (fenv) must be a table. 
+  lua_createtable(L, 1, 0);  // [udata, reftable] +  lua_pushvalue(L, -2);  // [udata, reftable, udata] +  lua_rawseti(L, -2, 1);  // [udata, reftable] +  lua_setfenv(L, -2);  // [udata] +} + +static TSTree *tree_check(lua_State *L) +{ +  TSTree **ud = luaL_checkudata(L, 1, "treesitter_tree"); +  return *ud; +} + +static int tree_gc(lua_State *L) +{ +  TSTree *tree = tree_check(L); +  if (!tree) { +    return 0; +  } + +  ts_tree_delete(tree); +  return 0; +} + +static int tree_tostring(lua_State *L) +{ +  lua_pushstring(L, "<tree>"); +  return 1; +} + +static int tree_root(lua_State *L) +{ +  TSTree *tree = tree_check(L); +  if (!tree) { +    return 0; +  } +  TSNode root = ts_tree_root_node(tree); +  push_node(L, root); +  return 1; +} + +// Node methods + +/// push node interface on lua stack +/// +/// top of stack must either be the tree this node belongs to or another node +/// of the same tree! This value is not popped. Can only be called inside a +/// cfunction with the tslua environment. 
+static void push_node(lua_State *L, TSNode node) +{ +  if (ts_node_is_null(node)) { +    lua_pushnil(L);  // [src, nil] +    return; +  } +  TSNode *ud = lua_newuserdata(L, sizeof(TSNode));  // [src, udata] +  *ud = node; +  lua_getfield(L, LUA_REGISTRYINDEX, "treesitter_node");  // [src, udata, meta] +  lua_setmetatable(L, -2);  // [src, udata] +  lua_getfenv(L, -2);  // [src, udata, reftable] +  lua_setfenv(L, -2);  // [src, udata] +} + +static bool node_check(lua_State *L, TSNode *res) +{ +  TSNode *ud = luaL_checkudata(L, 1, "treesitter_node"); +  if (ud) { +    *res = *ud; +    return true; +  } +  return false; +} + + +static int node_tostring(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  lua_pushstring(L, "<node "); +  lua_pushstring(L, ts_node_type(node)); +  lua_pushstring(L, ">"); +  lua_concat(L, 3); +  return 1; +} + +static int node_eq(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  // This should only be called if both x and y in "x == y" has the +  // treesitter_node metatable. So it is ok to error out otherwise. 
+  TSNode *ud = luaL_checkudata(L, 2, "treesitter_node"); +  if (!ud) { +    return 0; +  } +  TSNode node2 = *ud; +  lua_pushboolean(L, ts_node_eq(node, node2)); +  return 1; +} + +static int node_range(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSPoint start = ts_node_start_point(node); +  TSPoint end = ts_node_end_point(node); +  lua_pushnumber(L, start.row); +  lua_pushnumber(L, start.column); +  lua_pushnumber(L, end.row); +  lua_pushnumber(L, end.column); +  return 4; +} + +static int node_start(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSPoint start = ts_node_start_point(node); +  uint32_t start_byte = ts_node_start_byte(node); +  lua_pushnumber(L, start.row); +  lua_pushnumber(L, start.column); +  lua_pushnumber(L, start_byte); +  return 3; +} + +static int node_end(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSPoint end = ts_node_end_point(node); +  uint32_t end_byte = ts_node_end_byte(node); +  lua_pushnumber(L, end.row); +  lua_pushnumber(L, end.column); +  lua_pushnumber(L, end_byte); +  return 3; +} + +static int node_child_count(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  uint32_t count = ts_node_child_count(node); +  lua_pushnumber(L, count); +  return 1; +} + +static int node_named_child_count(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  uint32_t count = ts_node_named_child_count(node); +  lua_pushnumber(L, count); +  return 1; +} + +static int node_type(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  lua_pushstring(L, ts_node_type(node)); +  return 1; +} + +static int node_symbol(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSSymbol symbol = ts_node_symbol(node); +  lua_pushnumber(L, symbol); +  return 1; +} + +static int node_named(lua_State *L) +{ 
+  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  lua_pushboolean(L, ts_node_is_named(node)); +  return 1; +} + +static int node_sexpr(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  char *allocated = ts_node_string(node); +  lua_pushstring(L, allocated); +  xfree(allocated); +  return 1; +} + +static int node_missing(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  lua_pushboolean(L, ts_node_is_missing(node)); +  return 1; +} + +static int node_has_error(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  lua_pushboolean(L, ts_node_has_error(node)); +  return 1; +} + +static int node_child(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  long num = lua_tointeger(L, 2); +  TSNode child = ts_node_child(node, (uint32_t)num); + +  lua_pushvalue(L, 1); +  push_node(L, child); +  return 1; +} + +static int node_named_child(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  long num = lua_tointeger(L, 2); +  TSNode child = ts_node_named_child(node, (uint32_t)num); + +  lua_pushvalue(L, 1); +  push_node(L, child); +  return 1; +} + +static int node_descendant_for_range(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSPoint start = { (uint32_t)lua_tointeger(L, 2), +                   (uint32_t)lua_tointeger(L, 3) }; +  TSPoint end = { (uint32_t)lua_tointeger(L, 4), +                 (uint32_t)lua_tointeger(L, 5) }; +  TSNode child = ts_node_descendant_for_point_range(node, start, end); + +  lua_pushvalue(L, 1); +  push_node(L, child); +  return 1; +} + +static int node_named_descendant_for_range(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSPoint start = { (uint32_t)lua_tointeger(L, 2), +                   (uint32_t)lua_tointeger(L, 3) }; +  TSPoint end = { 
(uint32_t)lua_tointeger(L, 4), +                 (uint32_t)lua_tointeger(L, 5) }; +  TSNode child = ts_node_named_descendant_for_point_range(node, start, end); + +  lua_pushvalue(L, 1); +  push_node(L, child); +  return 1; +} + +static int node_parent(lua_State *L) +{ +  TSNode node; +  if (!node_check(L, &node)) { +    return 0; +  } +  TSNode parent = ts_node_parent(node); +  push_node(L, parent); +  return 1; +} + diff --git a/src/nvim/lua/treesitter.h b/src/nvim/lua/treesitter.h new file mode 100644 index 0000000000..812166f67b --- /dev/null +++ b/src/nvim/lua/treesitter.h @@ -0,0 +1,14 @@ +#ifndef NVIM_LUA_TREESITTER_H +#define NVIM_LUA_TREESITTER_H + +#include <lua.h> +#include <lualib.h> +#include <lauxlib.h> + +#include "tree_sitter/api.h" + +#ifdef INCLUDE_GENERATED_DECLARATIONS +# include "lua/treesitter.h.generated.h" +#endif + +#endif  // NVIM_LUA_TREESITTER_H diff --git a/src/nvim/lua/vim.lua b/src/nvim/lua/vim.lua index b1a684b977..b67762e48e 100644 --- a/src/nvim/lua/vim.lua +++ b/src/nvim/lua/vim.lua @@ -232,6 +232,9 @@ local function __index(t, key)    if key == 'inspect' then      t.inspect = require('vim.inspect')      return t.inspect +  elseif key == 'treesitter' then +    t.treesitter = require('vim.treesitter') +    return t.treesitter    elseif require('vim.shared')[key] ~= nil then      -- Expose all `vim.shared` functions on the `vim` module.      
t[key] = require('vim.shared')[key] diff --git a/src/tree_sitter/LICENSE b/src/tree_sitter/LICENSE new file mode 100644 index 0000000000..971b81f9a8 --- /dev/null +++ b/src/tree_sitter/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2018 Max Brunsfeld + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/src/tree_sitter/alloc.h b/src/tree_sitter/alloc.h new file mode 100644 index 0000000000..2229995bd1 --- /dev/null +++ b/src/tree_sitter/alloc.h @@ -0,0 +1,94 @@ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdlib.h> +#include <stdbool.h> +#include <stdio.h> + +#include "nvim/memory.h" + +#if 1 + +static inline bool ts_toggle_allocation_recording(bool value) { +  return false; +} + +#define ts_malloc xmalloc +#define ts_calloc xcalloc +#define ts_realloc xrealloc +#define ts_free xfree + +#elif defined(TREE_SITTER_TEST) + +void *ts_record_malloc(size_t); +void *ts_record_calloc(size_t, size_t); +void *ts_record_realloc(void *, size_t); +void ts_record_free(void *); +bool ts_toggle_allocation_recording(bool); + +static inline void *ts_malloc(size_t size) { +  return ts_record_malloc(size); +} + +static inline void *ts_calloc(size_t count, size_t size) { +  return ts_record_calloc(count, size); +} + +static inline void *ts_realloc(void *buffer, size_t size) { +  return ts_record_realloc(buffer, size); +} + +static inline void ts_free(void *buffer) { +  ts_record_free(buffer); +} + +#else + +#include <stdlib.h> + +static inline bool ts_toggle_allocation_recording(bool value) { +  return false; +} + +static inline void *ts_malloc(size_t size) { +  void *result = malloc(size); +  if (size > 0 && !result) { +    fprintf(stderr, "tree-sitter failed to allocate %lu bytes", size); +    exit(1); +  } +  return result; +} + +static inline void *ts_calloc(size_t count, size_t size) { +  void *result = calloc(count, size); +  if (count > 0 && !result) { +    fprintf(stderr, "tree-sitter failed to allocate %lu bytes", count * size); +    exit(1); +  } +  return result; +} + +static inline void *ts_realloc(void *buffer, size_t size) { +  void *result = realloc(buffer, size); +  if (size > 0 && !result) { +    fprintf(stderr, "tree-sitter failed to reallocate %lu bytes", size); +    exit(1); +  } + 
 return result; +} + +static inline void ts_free(void *buffer) { +  free(buffer); +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_ALLOC_H_ diff --git a/src/tree_sitter/api.h b/src/tree_sitter/api.h new file mode 100644 index 0000000000..d39d0521ee --- /dev/null +++ b/src/tree_sitter/api.h @@ -0,0 +1,660 @@ +#ifndef TREE_SITTER_API_H_ +#define TREE_SITTER_API_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> + +/****************************/ +/* Section - ABI Versioning */ +/****************************/ + +#define TREE_SITTER_LANGUAGE_VERSION 11 +#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9 + +/*******************/ +/* Section - Types */ +/*******************/ + +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +typedef struct TSParser TSParser; +typedef struct TSTree TSTree; + +typedef enum { +  TSInputEncodingUTF8, +  TSInputEncodingUTF16, +} TSInputEncoding; + +typedef enum { +  TSSymbolTypeRegular, +  TSSymbolTypeAnonymous, +  TSSymbolTypeAuxiliary, +} TSSymbolType; + +typedef struct { +  uint32_t row; +  uint32_t column; +} TSPoint; + +typedef struct { +  TSPoint start_point; +  TSPoint end_point; +  uint32_t start_byte; +  uint32_t end_byte; +} TSRange; + +typedef struct { +  void *payload; +  const char *(*read)(void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read); +  TSInputEncoding encoding; +} TSInput; + +typedef enum { +  TSLogTypeParse, +  TSLogTypeLex, +} TSLogType; + +typedef struct { +  void *payload; +  void (*log)(void *payload, TSLogType, const char *); +} TSLogger; + +typedef struct { +  uint32_t start_byte; +  uint32_t old_end_byte; +  uint32_t new_end_byte; +  TSPoint start_point; +  TSPoint old_end_point; +  TSPoint new_end_point; +} TSInputEdit; + +typedef struct { +  uint32_t context[4]; +  const void *id; +  const TSTree *tree; +} TSNode; + +typedef struct { 
+  const void *tree; +  const void *id; +  uint32_t context[2]; +} TSTreeCursor; + +/********************/ +/* Section - Parser */ +/********************/ + +/** + * Create a new parser. + */ +TSParser *ts_parser_new(void); + +/** + * Delete the parser, freeing all of the memory that it used. + */ +void ts_parser_delete(TSParser *parser); + +/** + * Set the language that the parser should use for parsing. + * + * Returns a boolean indicating whether or not the language was successfully + * assigned. True means assignment succeeded. False means there was a version + * mismatch: the language was generated with an incompatible version of the + * Tree-sitter CLI. Check the language's version using `ts_language_version` + * and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION` and + * `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants. + */ +bool ts_parser_set_language(TSParser *self, const TSLanguage *language); + +/** + * Get the parser's current language. + */ +const TSLanguage *ts_parser_language(const TSParser *self); + +/** + * Set the spans of text that the parser should include when parsing. + * + * By default, the parser will always include entire documents. This function + * allows you to parse only a *portion* of a document but still return a syntax + * tree whose ranges match up with the document as a whole. You can also pass + * multiple disjoint ranges. + * + * The second and third parameters specify the location and length of an array + * of ranges. The parser does *not* take ownership of these ranges; it copies + * the data, so it doesn't matter how these ranges are allocated. + */ +void ts_parser_set_included_ranges( +  TSParser *self, +  const TSRange *ranges, +  uint32_t length +); + +/** + * Get the ranges of text that the parser will include when parsing. + * + * The returned pointer is owned by the parser. The caller should not free it + * or write to it. The length of the array will be written to the given + * `length` pointer. 
+ */ +const TSRange *ts_parser_included_ranges( +  const TSParser *self, +  uint32_t *length +); + +/** + * Use the parser to parse some source code and create a syntax tree. + * + * If you are parsing this document for the first time, pass `NULL` for the + * `old_tree` parameter. Otherwise, if you have already parsed an earlier + * version of this document and the document has since been edited, pass the + * previous syntax tree so that the unchanged parts of it can be reused. + * This will save time and memory. For this to work correctly, you must have + * already edited the old syntax tree using the `ts_tree_edit` function in a + * way that exactly matches the source code changes. + * + * The `TSInput` parameter lets you specify how to read the text. It has the + * following three fields: + * 1. `read`: A function to retrieve a chunk of text at a given byte offset + *    and (row, column) position. The function should return a pointer to the + *    text and write its length to the the `bytes_read` pointer. The parser + *    does not take ownership of this buffer; it just borrows it until it has + *    finished reading it. The function should write a zero value to the + *    `bytes_read` pointer to indicate the end of the document. + * 2. `payload`: An arbitrary pointer that will be passed to each invocation + *    of the `read` function. + * 3. `encoding`: An indication of how the text is encoded. Either + *    `TSInputEncodingUTF8` or `TSInputEncodingUTF16`. + * + * This function returns a syntax tree on success, and `NULL` on failure. There + * are three possible reasons for failure: + * 1. The parser does not have a language assigned. Check for this using the +      `ts_parser_language` function. + * 2. Parsing was cancelled due to a timeout that was set by an earlier call to + *    the `ts_parser_set_timeout_micros` function. You can resume parsing from + *    where the parser left out by calling `ts_parser_parse` again with the + *    same arguments. 
Or you can start parsing from scratch by first calling + *    `ts_parser_reset`. + * 3. Parsing was cancelled using a cancellation flag that was set by an + *    earlier call to `ts_parser_set_cancellation_flag`. You can resume parsing + *    from where the parser left out by calling `ts_parser_parse` again with + *    the same arguments. + */ +TSTree *ts_parser_parse( +  TSParser *self, +  const TSTree *old_tree, +  TSInput input +); + +/** + * Use the parser to parse some source code stored in one contiguous buffer. + * The first two parameters are the same as in the `ts_parser_parse` function + * above. The second two parameters indicate the location of the buffer and its + * length in bytes. + */ +TSTree *ts_parser_parse_string( +  TSParser *self, +  const TSTree *old_tree, +  const char *string, +  uint32_t length +); + +/** + * Use the parser to parse some source code stored in one contiguous buffer with + * a given encoding. The first four parameters work the same as in the + * `ts_parser_parse_string` method above. The final parameter indicates whether + * the text is encoded as UTF8 or UTF16. + */ +TSTree *ts_parser_parse_string_encoding( +  TSParser *self, +  const TSTree *old_tree, +  const char *string, +  uint32_t length, +  TSInputEncoding encoding +); + +/** + * Instruct the parser to start the next parse from the beginning. + * + * If the parser previously failed because of a timeout or a cancellation, then + * by default, it will resume where it left off on the next call to + * `ts_parser_parse` or other parsing functions. If you don't want to resume, + * and instead intend to use this parser to parse some other document, you must + * call this `ts_parser_reset` first. + */ +void ts_parser_reset(TSParser *self); + +/** + * Set the maximum duration in microseconds that parsing should be allowed to + * take before halting. If parsing takes longer than this, it will halt early, + * returning NULL. See `ts_parser_parse` for more information. 
+ */ +void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout); + +/** + * Get the duration in microseconds that parsing is allowed to take. + */ +uint64_t ts_parser_timeout_micros(const TSParser *self); + +/** + * Set the parser's current cancellation flag pointer. If a non-null pointer is + * assigned, then the parser will periodically read from this pointer during + * parsing. If it reads a non-zero value, it will halt early, returning NULL. + * See `ts_parser_parse` for more information. + */ +void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag); + +/** + * Get the parser's current cancellation flag pointer. + */ +const size_t *ts_parser_cancellation_flag(const TSParser *self); + +/** + * Set the logger that a parser should use during parsing. + * + * The parser does not take ownership over the logger payload. If a logger was + * previously assigned, the caller is responsible for releasing any memory + * owned by the previous logger. + */ +void ts_parser_set_logger(TSParser *self, TSLogger logger); + +/** + * Get the parser's current logger. + */ +TSLogger ts_parser_logger(const TSParser *self); + +/** + * Set the file descriptor to which the parser should write debugging graphs + * during parsing. The graphs are formatted in the DOT language. You may want + * to pipe these graphs directly to a `dot(1)` process in order to generate + * SVG output. You can turn off this logging by passing a negative number. + */ +void ts_parser_print_dot_graphs(TSParser *self, int file); + +/** + * Set whether or not the parser should halt immediately upon detecting an + * error. This will generally result in a syntax tree with an error at the + * root, and one or more partial syntax trees within the error. This behavior + * may not be supported long-term. + */ +void ts_parser_halt_on_error(TSParser *self, bool halt); + +/******************/ +/* Section - Tree */ +/******************/ + +/** + * Create a shallow copy of the syntax tree. 
This is very fast. + * + * You need to copy a syntax tree in order to use it on more than one thread at + * a time, as syntax trees are not thread safe. + */ +TSTree *ts_tree_copy(const TSTree *self); + +/** + * Delete the syntax tree, freeing all of the memory that it used. + */ +void ts_tree_delete(TSTree *self); + +/** + * Get the root node of the syntax tree. + */ +TSNode ts_tree_root_node(const TSTree *self); + +/** + * Get the language that was used to parse the syntax tree. + */ +const TSLanguage *ts_tree_language(const TSTree *); + +/** + * Edit the syntax tree to keep it in sync with source code that has been + * edited. + * + * You must describe the edit both in terms of byte offsets and in terms of + * (row, column) coordinates. + */ +void ts_tree_edit(TSTree *self, const TSInputEdit *edit); + +/** + * Compare a new syntax tree to a previous syntax tree representing the same + * document, returning an array of ranges whose syntactic structure has changed. + * + * For this to work correctly, the old syntax tree must have been edited such + * that its ranges match up to the new tree. Generally, you'll want to call + * this function right after calling one of the `ts_parser_parse` functions, + * passing in the new tree that was returned from `ts_parser_parse` and the old + * tree that was passed as a parameter. + * + * The returned array is allocated using `malloc` and the caller is responsible + * for freeing it using `free`. The length of the array will be written to the + * given `length` pointer. + */ +TSRange *ts_tree_get_changed_ranges( +  const TSTree *self, +  const TSTree *old_tree, +  uint32_t *length +); + +/** + * Write a DOT graph describing the syntax tree to the given file. + */ +void ts_tree_print_dot_graph(const TSTree *, FILE *); + +/******************/ +/* Section - Node */ +/******************/ + +/** + * Get the node's type as a null-terminated string. 
+ */ +const char *ts_node_type(TSNode); + +/** + * Get the node's type as a numerical id. + */ +TSSymbol ts_node_symbol(TSNode); + +/** + * Get the node's start byte. + */ +uint32_t ts_node_start_byte(TSNode); + +/** + * Get the node's start position in terms of rows and columns. + */ +TSPoint ts_node_start_point(TSNode); + +/** + * Get the node's end byte. + */ +uint32_t ts_node_end_byte(TSNode); + +/** + * Get the node's end position in terms of rows and columns. + */ +TSPoint ts_node_end_point(TSNode); + +/** + * Get an S-expression representing the node as a string. + * + * This string is allocated with `malloc` and the caller is responsible for + * freeing it using `free`. + */ +char *ts_node_string(TSNode); + +/** + * Check if the node is null. Functions like `ts_node_child` and + * `ts_node_next_sibling` will return a null node to indicate that no such node + * was found. + */ +bool ts_node_is_null(TSNode); + +/** + * Check if the node is *named*. Named nodes correspond to named rules in the + * grammar, whereas *anonymous* nodes correspond to string literals in the + * grammar. + */ +bool ts_node_is_named(TSNode); + +/** + * Check if the node is *missing*. Missing nodes are inserted by the parser in + * order to recover from certain kinds of syntax errors. + */ +bool ts_node_is_missing(TSNode); + +/** + * Check if the node is *extra*. Extra nodes represent things like comments, + * which are not required by the grammar, but can appear anywhere. + */ +bool ts_node_is_extra(TSNode); + +/** + * Check if a syntax node has been edited. + */ +bool ts_node_has_changes(TSNode); + +/** + * Check if the node is a syntax error or contains any syntax errors. + */ +bool ts_node_has_error(TSNode); + +/** + * Get the node's immediate parent. + */ +TSNode ts_node_parent(TSNode); + +/** + * Get the node's child at the given index, where zero represents the first + * child. + */ +TSNode ts_node_child(TSNode, uint32_t); + +/** + * Get the node's number of children. 
+ */ +uint32_t ts_node_child_count(TSNode); + +/** + * Get the node's *named* child at the given index. + * + * See also `ts_node_is_named`. + */ +TSNode ts_node_named_child(TSNode, uint32_t); + +/** + * Get the node's number of *named* children. + * + * See also `ts_node_is_named`. + */ +uint32_t ts_node_named_child_count(TSNode); + +/** + * Get the node's child with the given field name. + */ +TSNode ts_node_child_by_field_name( +  TSNode self, +  const char *field_name, +  uint32_t field_name_length +); + +/** + * Get the node's child with the given numerical field id. + * + * You can convert a field name to an id using the + * `ts_language_field_id_for_name` function. + */ +TSNode ts_node_child_by_field_id(TSNode, TSFieldId); + +/** + * Get the node's next / previous sibling. + */ +TSNode ts_node_next_sibling(TSNode); +TSNode ts_node_prev_sibling(TSNode); + +/** + * Get the node's next / previous *named* sibling. + */ +TSNode ts_node_next_named_sibling(TSNode); +TSNode ts_node_prev_named_sibling(TSNode); + +/** + * Get the node's first child that extends beyond the given byte offset. + */ +TSNode ts_node_first_child_for_byte(TSNode, uint32_t); + +/** + * Get the node's first named child that extends beyond the given byte offset. + */ +TSNode ts_node_first_named_child_for_byte(TSNode, uint32_t); + +/** + * Get the smallest node within this node that spans the given range of bytes + * or (row, column) positions. + */ +TSNode ts_node_descendant_for_byte_range(TSNode, uint32_t, uint32_t); +TSNode ts_node_descendant_for_point_range(TSNode, TSPoint, TSPoint); + +/** + * Get the smallest named node within this node that spans the given range of + * bytes or (row, column) positions. + */ +TSNode ts_node_named_descendant_for_byte_range(TSNode, uint32_t, uint32_t); +TSNode ts_node_named_descendant_for_point_range(TSNode, TSPoint, TSPoint); + +/** + * Edit the node to keep it in-sync with source code that has been edited. + * + * This function is only rarely needed. 
When you edit a syntax tree with the + * `ts_tree_edit` function, all of the nodes that you retrieve from the tree + * afterward will already reflect the edit. You only need to use `ts_node_edit` + * when you have a `TSNode` instance that you want to keep and continue to use + * after an edit. + */ +void ts_node_edit(TSNode *, const TSInputEdit *); + +/** + * Check if two nodes are identical. + */ +bool ts_node_eq(TSNode, TSNode); + +/************************/ +/* Section - TreeCursor */ +/************************/ + +/** + * Create a new tree cursor starting from the given node. + * + * A tree cursor allows you to walk a syntax tree more efficiently than is + * possible using the `TSNode` functions. It is a mutable object that is always + * on a certain syntax node, and can be moved imperatively to different nodes. + */ +TSTreeCursor ts_tree_cursor_new(TSNode); + +/** + * Delete a tree cursor, freeing all of the memory that it used. + */ +void ts_tree_cursor_delete(TSTreeCursor *); + +/** + * Re-initialize a tree cursor to start at a different node. + */ +void ts_tree_cursor_reset(TSTreeCursor *, TSNode); + +/** + * Get the tree cursor's current node. + */ +TSNode ts_tree_cursor_current_node(const TSTreeCursor *); + +/** + * Get the field name of the tree cursor's current node. + * + * This returns `NULL` if the current node doesn't have a field. + * See also `ts_node_child_by_field_name`. + */ +const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); + +/** + * Get the field id of the tree cursor's current node. + * + * This returns zero if the current node doesn't have a field. + * See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`. + */ +TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *); + +/** + * Move the cursor to the parent of its current node. + * + * This returns `true` if the cursor successfully moved, and returns `false` + * if there was no parent node (the cursor was already on the root node). 
+ */ +bool ts_tree_cursor_goto_parent(TSTreeCursor *); + +/** + * Move the cursor to the next sibling of its current node. + * + * This returns `true` if the cursor successfully moved, and returns `false` + * if there was no next sibling node. + */ +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); + +/** + * Move the cursor to the first child of its current node. + * + * This returns `true` if the cursor successfully moved, and returns `false` + * if there were no children. + */ +bool ts_tree_cursor_goto_first_child(TSTreeCursor *); + +/** + * Move the cursor to the first child of its current node that extends beyond + * the given byte offset. + * + * This returns the index of the child node if one was found, and returns -1 + * if no such child was found. + */ +int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t); + +TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *); + +/**********************/ +/* Section - Language */ +/**********************/ + +/** + * Get the number of distinct node types in the language. + */ +uint32_t ts_language_symbol_count(const TSLanguage *); + +/** + * Get a node type string for the given numerical id. + */ +const char *ts_language_symbol_name(const TSLanguage *, TSSymbol); + +/** + * Get the numerical id for the given node type string. + */ +TSSymbol ts_language_symbol_for_name(const TSLanguage *, const char *); + +/** + * Get the number of distinct field names in the language. + */ +uint32_t ts_language_field_count(const TSLanguage *); + +/** + * Get the field name string for the given numerical id. + */ +const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId); + +/** + * Get the numerical id for the given field name string. + */ +TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t); + +/** + * Check whether the given node type id belongs to named nodes, anonymous nodes, + * or hidden nodes. + * + * See also `ts_node_is_named`. 
Hidden nodes are never returned from the API. + */ +TSSymbolType ts_language_symbol_type(const TSLanguage *, TSSymbol); + +/** + * Get the ABI version number for this language. This version number is used + * to ensure that languages were generated by a compatible version of + * Tree-sitter. + * + * See also `ts_parser_set_language`. + */ +uint32_t ts_language_version(const TSLanguage *); + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_API_H_ diff --git a/src/tree_sitter/array.h b/src/tree_sitter/array.h new file mode 100644 index 0000000000..bc77e687bf --- /dev/null +++ b/src/tree_sitter/array.h @@ -0,0 +1,142 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> +#include <stdbool.h> +#include "./alloc.h" + +#define Array(T)     \ +  struct {           \ +    T *contents;     \ +    uint32_t size;     \ +    uint32_t capacity; \ +  } + +#define array_init(self) \ +  ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +#define array_new() \ +  { NULL, 0, 0 } + +#define array_get(self, index) \ +  (assert((uint32_t)index < (self)->size), &(self)->contents[index]) + +#define array_front(self) array_get(self, 0) + +#define array_back(self) array_get(self, (self)->size - 1) + +#define array_clear(self) ((self)->size = 0) + +#define array_reserve(self, new_capacity) \ +  array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity) + +#define array_erase(self, index) \ +  array__erase((VoidArray *)(self), array__elem_size(self), index) + +#define array_delete(self) array__delete((VoidArray *)self) + +#define array_push(self, element)                            \ +  (array__grow((VoidArray *)(self), 1, array__elem_size(self)), \ +   (self)->contents[(self)->size++] = (element)) + +#define array_grow_by(self, count) \ +  (array__grow((VoidArray *)(self), count, array__elem_size(self)), \ +   
memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \ +   (self)->size += (count)) + +#define array_push_all(self, other)                                       \ +  array_splice((self), (self)->size, 0, (other)->size, (other)->contents) + +#define array_splice(self, index, old_count, new_count, new_contents)          \ +  array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \ +                new_count, new_contents) + +#define array_insert(self, index, element) \ +  array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &element) + +#define array_pop(self) ((self)->contents[--(self)->size]) + +#define array_assign(self, other) \ +  array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self)) + +// Private + +typedef Array(void) VoidArray; + +#define array__elem_size(self) sizeof(*(self)->contents) + +static inline void array__delete(VoidArray *self) { +  ts_free(self->contents); +  self->contents = NULL; +  self->size = 0; +  self->capacity = 0; +} + +static inline void array__erase(VoidArray *self, size_t element_size, +                                uint32_t index) { +  assert(index < self->size); +  char *contents = (char *)self->contents; +  memmove(contents + index * element_size, contents + (index + 1) * element_size, +          (self->size - index - 1) * element_size); +  self->size--; +} + +static inline void array__reserve(VoidArray *self, size_t element_size, uint32_t new_capacity) { +  if (new_capacity > self->capacity) { +    if (self->contents) { +      self->contents = ts_realloc(self->contents, new_capacity * element_size); +    } else { +      self->contents = ts_calloc(new_capacity, element_size); +    } +    self->capacity = new_capacity; +  } +} + +static inline void array__assign(VoidArray *self, const VoidArray *other, size_t element_size) { +  array__reserve(self, element_size, other->size); +  self->size = other->size; +  memcpy(self->contents, 
other->contents, self->size * element_size); +} + +static inline void array__grow(VoidArray *self, size_t count, size_t element_size) { +  size_t new_size = self->size + count; +  if (new_size > self->capacity) { +    size_t new_capacity = self->capacity * 2; +    if (new_capacity < 8) new_capacity = 8; +    if (new_capacity < new_size) new_capacity = new_size; +    array__reserve(self, element_size, new_capacity); +  } +} + +static inline void array__splice(VoidArray *self, size_t element_size, +                                 uint32_t index, uint32_t old_count, +                                 uint32_t new_count, const void *elements) { +  uint32_t new_size = self->size + new_count - old_count; +  uint32_t old_end = index + old_count; +  uint32_t new_end = index + new_count; +  assert(old_end <= self->size); + +  array__reserve(self, element_size, new_size); + +  char *contents = (char *)self->contents; +  if (self->size > old_end) +    memmove(contents + new_end * element_size, contents + old_end * element_size, +            (self->size - old_end) * element_size); +  if (new_count > 0) +    memcpy((contents + index * element_size), elements, +           new_count * element_size); +  self->size += new_count - old_count; +} + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_ARRAY_H_ diff --git a/src/tree_sitter/atomic.h b/src/tree_sitter/atomic.h new file mode 100644 index 0000000000..301ee36700 --- /dev/null +++ b/src/tree_sitter/atomic.h @@ -0,0 +1,42 @@ +#ifndef TREE_SITTER_ATOMIC_H_ +#define TREE_SITTER_ATOMIC_H_ + +#include <stdint.h> + +#ifdef _WIN32 + +#include <windows.h> + +static inline size_t atomic_load(const volatile size_t *p) { +  return *p; +} + +static inline uint32_t atomic_inc(volatile uint32_t *p) { +  return InterlockedIncrement(p); +} + +static inline uint32_t atomic_dec(volatile uint32_t *p) { +  return InterlockedDecrement(p); +} + +#else + +static inline size_t atomic_load(const volatile size_t *p) { +#ifdef __ATOMIC_RELAXED +  
return __atomic_load_n(p, __ATOMIC_RELAXED); +#else +  return __sync_fetch_and_add((volatile size_t *)p, 0); +#endif +} + +static inline uint32_t atomic_inc(volatile uint32_t *p) { +  return __sync_add_and_fetch(p, 1u); +} + +static inline uint32_t atomic_dec(volatile uint32_t *p) { +  return __sync_sub_and_fetch(p, 1u); +} + +#endif + +#endif  // TREE_SITTER_ATOMIC_H_ diff --git a/src/tree_sitter/clock.h b/src/tree_sitter/clock.h new file mode 100644 index 0000000000..94545f3566 --- /dev/null +++ b/src/tree_sitter/clock.h @@ -0,0 +1,141 @@ +#ifndef TREE_SITTER_CLOCK_H_ +#define TREE_SITTER_CLOCK_H_ + +#include <stdint.h> + +typedef uint64_t TSDuration; + +#ifdef _WIN32 + +// Windows: +// * Represent a time as a performance counter value. +// * Represent a duration as a number of performance counter ticks. + +#include <windows.h> +typedef uint64_t TSClock; + +static inline TSDuration duration_from_micros(uint64_t micros) { +  LARGE_INTEGER frequency; +  QueryPerformanceFrequency(&frequency); +  return micros * (uint64_t)frequency.QuadPart / 1000000; +} + +static inline uint64_t duration_to_micros(TSDuration self) { +  LARGE_INTEGER frequency; +  QueryPerformanceFrequency(&frequency); +  return self * 1000000 / (uint64_t)frequency.QuadPart; +} + +static inline TSClock clock_null(void) { +  return 0; +} + +static inline TSClock clock_now(void) { +  LARGE_INTEGER result; +  QueryPerformanceCounter(&result); +  return (uint64_t)result.QuadPart; +} + +static inline TSClock clock_after(TSClock base, TSDuration duration) { +  return base + duration; +} + +static inline bool clock_is_null(TSClock self) { +  return !self; +} + +static inline bool clock_is_gt(TSClock self, TSClock other) { +  return self > other; +} + +#elif defined(CLOCK_MONOTONIC) && !defined(__APPLE__) + +// POSIX with monotonic clock support (Linux) +// * Represent a time as a monotonic (seconds, nanoseconds) pair. +// * Represent a duration as a number of microseconds. 
+// +// On these platforms, parse timeouts will correspond accurately to +// real time, regardless of what other processes are running. + +#include <time.h> +typedef struct timespec TSClock; + +static inline TSDuration duration_from_micros(uint64_t micros) { +  return micros; +} + +static inline uint64_t duration_to_micros(TSDuration self) { +  return self; +} + +static inline TSClock clock_now(void) { +  TSClock result; +  clock_gettime(CLOCK_MONOTONIC, &result); +  return result; +} + +static inline TSClock clock_null(void) { +  return (TSClock) {0, 0}; +} + +static inline TSClock clock_after(TSClock base, TSDuration duration) { +  TSClock result = base; +  result.tv_sec += duration / 1000000; +  result.tv_nsec += (duration % 1000000) * 1000; +  return result; +} + +static inline bool clock_is_null(TSClock self) { +  return !self.tv_sec; +} + +static inline bool clock_is_gt(TSClock self, TSClock other) { +  if (self.tv_sec > other.tv_sec) return true; +  if (self.tv_sec < other.tv_sec) return false; +  return self.tv_nsec > other.tv_nsec; +} + +#else + +// macOS or POSIX without monotonic clock support +// * Represent a time as a process clock value. +// * Represent a duration as a number of process clock ticks. +// +// On these platforms, parse timeouts may be affected by other processes, +// which is not ideal, but is better than using a non-monotonic time API +// like `gettimeofday`. 
+ +#include <time.h> +typedef uint64_t TSClock; + +static inline TSDuration duration_from_micros(uint64_t micros) { +  return micros * (uint64_t)CLOCKS_PER_SEC / 1000000; +} + +static inline uint64_t duration_to_micros(TSDuration self) { +  return self * 1000000 / (uint64_t)CLOCKS_PER_SEC; +} + +static inline TSClock clock_null(void) { +  return 0; +} + +static inline TSClock clock_now(void) { +  return (uint64_t)clock(); +} + +static inline TSClock clock_after(TSClock base, TSDuration duration) { +  return base + duration; +} + +static inline bool clock_is_null(TSClock self) { +  return !self; +} + +static inline bool clock_is_gt(TSClock self, TSClock other) { +  return self > other; +} + +#endif + +#endif  // TREE_SITTER_CLOCK_H_ diff --git a/src/tree_sitter/error_costs.h b/src/tree_sitter/error_costs.h new file mode 100644 index 0000000000..32d3666a66 --- /dev/null +++ b/src/tree_sitter/error_costs.h @@ -0,0 +1,11 @@ +#ifndef TREE_SITTER_ERROR_COSTS_H_ +#define TREE_SITTER_ERROR_COSTS_H_ + +#define ERROR_STATE 0 +#define ERROR_COST_PER_RECOVERY 500 +#define ERROR_COST_PER_MISSING_TREE 110 +#define ERROR_COST_PER_SKIPPED_TREE 100 +#define ERROR_COST_PER_SKIPPED_LINE 30 +#define ERROR_COST_PER_SKIPPED_CHAR 1 + +#endif diff --git a/src/tree_sitter/get_changed_ranges.c b/src/tree_sitter/get_changed_ranges.c new file mode 100644 index 0000000000..5bd1d814bd --- /dev/null +++ b/src/tree_sitter/get_changed_ranges.c @@ -0,0 +1,482 @@ +#include "./get_changed_ranges.h" +#include "./subtree.h" +#include "./language.h" +#include "./error_costs.h" +#include "./tree_cursor.h" +#include <assert.h> + +// #define DEBUG_GET_CHANGED_RANGES + +static void ts_range_array_add(TSRangeArray *self, Length start, Length end) { +  if (self->size > 0) { +    TSRange *last_range = array_back(self); +    if (start.bytes <= last_range->end_byte) { +      last_range->end_byte = end.bytes; +      last_range->end_point = end.extent; +      return; +    } +  } + +  if (start.bytes < end.bytes) { 
+    TSRange range = { start.extent, end.extent, start.bytes, end.bytes }; +    array_push(self, range); +  } +} + +bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index, +                               uint32_t start_byte, uint32_t end_byte) { +  for (unsigned i = start_index; i < self->size; i++) { +    TSRange *range = &self->contents[i]; +    if (range->end_byte > start_byte) { +      if (range->start_byte >= end_byte) break; +      return true; +    } +  } +  return false; +} + +void ts_range_array_get_changed_ranges( +  const TSRange *old_ranges, unsigned old_range_count, +  const TSRange *new_ranges, unsigned new_range_count, +  TSRangeArray *differences +) { +  unsigned new_index = 0; +  unsigned old_index = 0; +  Length current_position = length_zero(); +  bool in_old_range = false; +  bool in_new_range = false; + +  while (old_index < old_range_count || new_index < new_range_count) { +    const TSRange *old_range = &old_ranges[old_index]; +    const TSRange *new_range = &new_ranges[new_index]; + +    Length next_old_position; +    if (in_old_range) { +      next_old_position = (Length) {old_range->end_byte, old_range->end_point}; +    } else if (old_index < old_range_count) { +      next_old_position = (Length) {old_range->start_byte, old_range->start_point}; +    } else { +      next_old_position = LENGTH_MAX; +    } + +    Length next_new_position; +    if (in_new_range) { +      next_new_position = (Length) {new_range->end_byte, new_range->end_point}; +    } else if (new_index < new_range_count) { +      next_new_position = (Length) {new_range->start_byte, new_range->start_point}; +    } else { +      next_new_position = LENGTH_MAX; +    } + +    if (next_old_position.bytes < next_new_position.bytes) { +      if (in_old_range != in_new_range) { +        ts_range_array_add(differences, current_position, next_old_position); +      } +      if (in_old_range) old_index++; +      current_position = next_old_position; +      
in_old_range = !in_old_range; +    } else if (next_new_position.bytes < next_old_position.bytes) { +      if (in_old_range != in_new_range) { +        ts_range_array_add(differences, current_position, next_new_position); +      } +      if (in_new_range) new_index++; +      current_position = next_new_position; +      in_new_range = !in_new_range; +    } else { +      if (in_old_range != in_new_range) { +        ts_range_array_add(differences, current_position, next_new_position); +      } +      if (in_old_range) old_index++; +      if (in_new_range) new_index++; +      in_old_range = !in_old_range; +      in_new_range = !in_new_range; +      current_position = next_new_position; +    } +  } +} + +typedef struct { +  TreeCursor cursor; +  const TSLanguage *language; +  unsigned visible_depth; +  bool in_padding; +} Iterator; + +static Iterator iterator_new(TreeCursor *cursor, const Subtree *tree, const TSLanguage *language) { +  array_clear(&cursor->stack); +  array_push(&cursor->stack, ((TreeCursorEntry){ +    .subtree = tree, +    .position = length_zero(), +    .child_index = 0, +    .structural_child_index = 0, +  })); +  return (Iterator) { +    .cursor = *cursor, +    .language = language, +    .visible_depth = 1, +    .in_padding = false, +  }; +} + +static bool iterator_done(Iterator *self) { +  return self->cursor.stack.size == 0; +} + +static Length iterator_start_position(Iterator *self) { +  TreeCursorEntry entry = *array_back(&self->cursor.stack); +  if (self->in_padding) { +    return entry.position; +  } else { +    return length_add(entry.position, ts_subtree_padding(*entry.subtree)); +  } +} + +static Length iterator_end_position(Iterator *self) { +  TreeCursorEntry entry = *array_back(&self->cursor.stack); +  Length result = length_add(entry.position, ts_subtree_padding(*entry.subtree)); +  if (self->in_padding) { +    return result; +  } else { +    return length_add(result, ts_subtree_size(*entry.subtree)); +  } +} + +static bool 
iterator_tree_is_visible(const Iterator *self) { +  TreeCursorEntry entry = *array_back(&self->cursor.stack); +  if (ts_subtree_visible(*entry.subtree)) return true; +  if (self->cursor.stack.size > 1) { +    Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree; +    const TSSymbol *alias_sequence = ts_language_alias_sequence( +      self->language, +      parent.ptr->production_id +    ); +    return alias_sequence && alias_sequence[entry.structural_child_index] != 0; +  } +  return false; +} + +static void iterator_get_visible_state(const Iterator *self, Subtree *tree, +                                       TSSymbol *alias_symbol, uint32_t *start_byte) { +  uint32_t i = self->cursor.stack.size - 1; + +  if (self->in_padding) { +    if (i == 0) return; +    i--; +  } + +  for (; i + 1 > 0; i--) { +    TreeCursorEntry entry = self->cursor.stack.contents[i]; + +    if (i > 0) { +      const Subtree *parent = self->cursor.stack.contents[i - 1].subtree; +      const TSSymbol *alias_sequence = ts_language_alias_sequence( +        self->language, +        parent->ptr->production_id +      ); +      if (alias_sequence) { +        *alias_symbol = alias_sequence[entry.structural_child_index]; +      } +    } + +    if (ts_subtree_visible(*entry.subtree) || *alias_symbol) { +      *tree = *entry.subtree; +      *start_byte = entry.position.bytes; +      break; +    } +  } +} + +static void iterator_ascend(Iterator *self) { +  if (iterator_done(self)) return; +  if (iterator_tree_is_visible(self) && !self->in_padding) self->visible_depth--; +  if (array_back(&self->cursor.stack)->child_index > 0) self->in_padding = false; +  self->cursor.stack.size--; +} + +static bool iterator_descend(Iterator *self, uint32_t goal_position) { +  if (self->in_padding) return false; + +  bool did_descend; +  do { +    did_descend = false; +    TreeCursorEntry entry = *array_back(&self->cursor.stack); +    Length position = entry.position; +    uint32_t 
structural_child_index = 0; +    for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) { +      const Subtree *child = &entry.subtree->ptr->children[i]; +      Length child_left = length_add(position, ts_subtree_padding(*child)); +      Length child_right = length_add(child_left, ts_subtree_size(*child)); + +      if (child_right.bytes > goal_position) { +        array_push(&self->cursor.stack, ((TreeCursorEntry){ +          .subtree = child, +          .position = position, +          .child_index = i, +          .structural_child_index = structural_child_index, +        })); + +        if (iterator_tree_is_visible(self)) { +          if (child_left.bytes > goal_position) { +            self->in_padding = true; +          } else { +            self->visible_depth++; +          } +          return true; +        } + +        did_descend = true; +        break; +      } + +      position = child_right; +      if (!ts_subtree_extra(*child)) structural_child_index++; +    } +  } while (did_descend); + +  return false; +} + +static void iterator_advance(Iterator *self) { +  if (self->in_padding) { +    self->in_padding = false; +    if (iterator_tree_is_visible(self)) { +      self->visible_depth++; +    } else { +      iterator_descend(self, 0); +    } +    return; +  } + +  for (;;) { +    if (iterator_tree_is_visible(self)) self->visible_depth--; +    TreeCursorEntry entry = array_pop(&self->cursor.stack); +    if (iterator_done(self)) return; + +    const Subtree *parent = array_back(&self->cursor.stack)->subtree; +    uint32_t child_index = entry.child_index + 1; +    if (ts_subtree_child_count(*parent) > child_index) { +      Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); +      uint32_t structural_child_index = entry.structural_child_index; +      if (!ts_subtree_extra(*entry.subtree)) structural_child_index++; +      const Subtree *next_child = &parent->ptr->children[child_index]; + +      
array_push(&self->cursor.stack, ((TreeCursorEntry){ +        .subtree = next_child, +        .position = position, +        .child_index = child_index, +        .structural_child_index = structural_child_index, +      })); + +      if (iterator_tree_is_visible(self)) { +        if (ts_subtree_padding(*next_child).bytes > 0) { +          self->in_padding = true; +        } else { +          self->visible_depth++; +        } +      } else { +        iterator_descend(self, 0); +      } +      break; +    } +  } +} + +typedef enum { +  IteratorDiffers, +  IteratorMayDiffer, +  IteratorMatches, +} IteratorComparison; + +static IteratorComparison iterator_compare(const Iterator *old_iter, const Iterator *new_iter) { +  Subtree old_tree = NULL_SUBTREE; +  Subtree new_tree = NULL_SUBTREE; +  uint32_t old_start = 0; +  uint32_t new_start = 0; +  TSSymbol old_alias_symbol = 0; +  TSSymbol new_alias_symbol = 0; +  iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); +  iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); + +  if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; +  if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; + +  if ( +    old_alias_symbol == new_alias_symbol && +    ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) +  ) { +    if (old_start == new_start && +        !ts_subtree_has_changes(old_tree) && +        ts_subtree_symbol(old_tree) != ts_builtin_sym_error && +        ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && +        ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && +        ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && +        (ts_subtree_parse_state(old_tree) == ERROR_STATE) == +        (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { +      return IteratorMatches; +    } else { +      return IteratorMayDiffer; +    } +  } + +  return IteratorDiffers; +} + +#ifdef DEBUG_GET_CHANGED_RANGES +static inline void 
iterator_print_state(Iterator *self) { +  TreeCursorEntry entry = *array_back(&self->cursor.stack); +  TSPoint start = iterator_start_position(self).extent; +  TSPoint end = iterator_end_position(self).extent; +  const char *name = ts_language_symbol_name(self->language, ts_subtree_symbol(*entry.subtree)); +  printf( +    "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", +    name, self->in_padding ? "(p)" : "   ", +    self->visible_depth, +    start.row + 1, start.column, +    end.row + 1, end.column +  ); +} +#endif + +unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree, +                                       TreeCursor *cursor1, TreeCursor *cursor2, +                                       const TSLanguage *language, +                                       const TSRangeArray *included_range_differences, +                                       TSRange **ranges) { +  TSRangeArray results = array_new(); + +  Iterator old_iter = iterator_new(cursor1, old_tree, language); +  Iterator new_iter = iterator_new(cursor2, new_tree, language); + +  unsigned included_range_difference_index = 0; + +  Length position = iterator_start_position(&old_iter); +  Length next_position = iterator_start_position(&new_iter); +  if (position.bytes < next_position.bytes) { +    ts_range_array_add(&results, position, next_position); +    position = next_position; +  } else if (position.bytes > next_position.bytes) { +    ts_range_array_add(&results, next_position, position); +    next_position = position; +  } + +  do { +    #ifdef DEBUG_GET_CHANGED_RANGES +    printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); +    iterator_print_state(&old_iter); +    printf("\tvs\t"); +    iterator_print_state(&new_iter); +    puts(""); +    #endif + +    // Compare the old and new subtrees. 
+    IteratorComparison comparison = iterator_compare(&old_iter, &new_iter); + +    // Even if the two subtrees appear to be identical, they could differ +    // internally if they contain a range of text that was previously +    // excluded from the parse, and is now included, or vice-versa. +    if (comparison == IteratorMatches && ts_range_array_intersects( +      included_range_differences, +      included_range_difference_index, +      position.bytes, +      iterator_end_position(&old_iter).bytes +    )) { +      comparison = IteratorMayDiffer; +    } + +    bool is_changed = false; +    switch (comparison) { +      // If the subtrees are definitely identical, move to the end +      // of both subtrees. +      case IteratorMatches: +        next_position = iterator_end_position(&old_iter); +        break; + +      // If the subtrees might differ internally, descend into both +      // subtrees, finding the first child that spans the current position. +      case IteratorMayDiffer: +        if (iterator_descend(&old_iter, position.bytes)) { +          if (!iterator_descend(&new_iter, position.bytes)) { +            is_changed = true; +            next_position = iterator_end_position(&old_iter); +          } +        } else if (iterator_descend(&new_iter, position.bytes)) { +          is_changed = true; +          next_position = iterator_end_position(&new_iter); +        } else { +          next_position = length_min( +            iterator_end_position(&old_iter), +            iterator_end_position(&new_iter) +          ); +        } +        break; + +      // If the subtrees are different, record a change and then move +      // to the end of both subtrees. +      case IteratorDiffers: +        is_changed = true; +        next_position = length_min( +          iterator_end_position(&old_iter), +          iterator_end_position(&new_iter) +        ); +        break; +    } + +    // Ensure that both iterators are caught up to the current position. 
+    while ( +      !iterator_done(&old_iter) && +      iterator_end_position(&old_iter).bytes <= next_position.bytes +    ) iterator_advance(&old_iter); +    while ( +      !iterator_done(&new_iter) && +      iterator_end_position(&new_iter).bytes <= next_position.bytes +    ) iterator_advance(&new_iter); + +    // Ensure that both iterators are at the same depth in the tree. +    while (old_iter.visible_depth > new_iter.visible_depth) { +      iterator_ascend(&old_iter); +    } +    while (new_iter.visible_depth > old_iter.visible_depth) { +      iterator_ascend(&new_iter); +    } + +    if (is_changed) { +      #ifdef DEBUG_GET_CHANGED_RANGES +      printf( +        "  change: [[%u, %u] - [%u, %u]]\n", +        position.extent.row + 1, position.extent.column, +        next_position.extent.row + 1, next_position.extent.column +      ); +      #endif + +      ts_range_array_add(&results, position, next_position); +    } + +    position = next_position; + +    // Keep track of the current position in the included range differences +    // array in order to avoid scanning the entire array on each iteration. 
+    while (included_range_difference_index < included_range_differences->size) { +      const TSRange *range = &included_range_differences->contents[ +        included_range_difference_index +      ]; +      if (range->end_byte <= position.bytes) { +        included_range_difference_index++; +      } else { +        break; +      } +    } +  } while (!iterator_done(&old_iter) && !iterator_done(&new_iter)); + +  Length old_size = ts_subtree_total_size(*old_tree); +  Length new_size = ts_subtree_total_size(*new_tree); +  if (old_size.bytes < new_size.bytes) { +    ts_range_array_add(&results, old_size, new_size); +  } else if (new_size.bytes < old_size.bytes) { +    ts_range_array_add(&results, new_size, old_size); +  } + +  *cursor1 = old_iter.cursor; +  *cursor2 = new_iter.cursor; +  *ranges = results.contents; +  return results.size; +} diff --git a/src/tree_sitter/get_changed_ranges.h b/src/tree_sitter/get_changed_ranges.h new file mode 100644 index 0000000000..a1f1dbb430 --- /dev/null +++ b/src/tree_sitter/get_changed_ranges.h @@ -0,0 +1,36 @@ +#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_ +#define TREE_SITTER_GET_CHANGED_RANGES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./tree_cursor.h" +#include "./subtree.h" + +typedef Array(TSRange) TSRangeArray; + +void ts_range_array_get_changed_ranges( +  const TSRange *old_ranges, unsigned old_range_count, +  const TSRange *new_ranges, unsigned new_range_count, +  TSRangeArray *differences +); + +bool ts_range_array_intersects( +  const TSRangeArray *self, unsigned start_index, +  uint32_t start_byte, uint32_t end_byte +); + +unsigned ts_subtree_get_changed_ranges( +  const Subtree *old_tree, const Subtree *new_tree, +  TreeCursor *cursor1, TreeCursor *cursor2, +  const TSLanguage *language, +  const TSRangeArray *included_range_differences, +  TSRange **ranges +); + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_GET_CHANGED_RANGES_H_ diff --git a/src/tree_sitter/language.c 
b/src/tree_sitter/language.c new file mode 100644 index 0000000000..1bfb1a8d03 --- /dev/null +++ b/src/tree_sitter/language.c @@ -0,0 +1,107 @@ +#include "./language.h" +#include "./subtree.h" +#include "./error_costs.h" +#include <string.h> + +void ts_language_table_entry(const TSLanguage *self, TSStateId state, +                             TSSymbol symbol, TableEntry *result) { +  if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { +    result->action_count = 0; +    result->is_reusable = false; +    result->actions = NULL; +  } else { +    assert(symbol < self->token_count); +    uint32_t action_index = ts_language_lookup(self, state, symbol); +    const TSParseActionEntry *entry = &self->parse_actions[action_index]; +    result->action_count = entry->count; +    result->is_reusable = entry->reusable; +    result->actions = (const TSParseAction *)(entry + 1); +  } +} + +uint32_t ts_language_symbol_count(const TSLanguage *language) { +  return language->symbol_count + language->alias_count; +} + +uint32_t ts_language_version(const TSLanguage *language) { +  return language->version; +} + +TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *language, TSSymbol symbol) { +  if (symbol == ts_builtin_sym_error)  { +    return (TSSymbolMetadata){.visible = true, .named = true}; +  } else if (symbol == ts_builtin_sym_error_repeat) { +    return (TSSymbolMetadata){.visible = false, .named = false}; +  } else { +    return language->symbol_metadata[symbol]; +  } +} + +const char *ts_language_symbol_name(const TSLanguage *language, TSSymbol symbol) { +  if (symbol == ts_builtin_sym_error) { +    return "ERROR"; +  } else if (symbol == ts_builtin_sym_error_repeat) { +    return "_ERROR"; +  } else { +    return language->symbol_names[symbol]; +  } +} + +TSSymbol ts_language_symbol_for_name(const TSLanguage *self, const char *name) { +  if (!strcmp(name, "ERROR")) return ts_builtin_sym_error; + +  uint32_t count = 
ts_language_symbol_count(self); +  for (TSSymbol i = 0; i < count; i++) { +    if (!strcmp(self->symbol_names[i], name)) { +      return i; +    } +  } +  return 0; +} + +TSSymbolType ts_language_symbol_type(const TSLanguage *language, TSSymbol symbol) { +  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); +  if (metadata.named) { +    return TSSymbolTypeRegular; +  } else if (metadata.visible) { +    return TSSymbolTypeAnonymous; +  } else { +    return TSSymbolTypeAuxiliary; +  } +} + +uint32_t ts_language_field_count(const TSLanguage *self) { +  if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) { +    return self->field_count; +  } else { +    return 0; +  } +} + +const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id) { +  uint32_t count = ts_language_field_count(self); +  if (count) { +    return self->field_names[id]; +  } else { +    return NULL; +  } +} + +TSFieldId ts_language_field_id_for_name( +  const TSLanguage *self, +  const char *name, +  uint32_t name_length +) { +  uint32_t count = ts_language_field_count(self); +  for (TSSymbol i = 1; i < count + 1; i++) { +    switch (strncmp(name, self->field_names[i], name_length)) { +      case 0: +        return i; +      case -1: +        return 0; +      default: +        break; +    } +  } +  return 0; +} diff --git a/src/tree_sitter/language.h b/src/tree_sitter/language.h new file mode 100644 index 0000000000..0741486a1b --- /dev/null +++ b/src/tree_sitter/language.h @@ -0,0 +1,138 @@ +#ifndef TREE_SITTER_LANGUAGE_H_ +#define TREE_SITTER_LANGUAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./subtree.h" +#include "tree_sitter/parser.h" + +#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) +#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10 +#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11 + +typedef struct { +  const TSParseAction *actions; +  uint32_t action_count; +  bool is_reusable; +} TableEntry; + 
+void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); + +TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); + +static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { +  return 0 < symbol && symbol < self->external_token_count + 1; +} + +static inline const TSParseAction *ts_language_actions(const TSLanguage *self, +                                                       TSStateId state, +                                                       TSSymbol symbol, +                                                       uint32_t *count) { +  TableEntry entry; +  ts_language_table_entry(self, state, symbol, &entry); +  *count = entry.action_count; +  return entry.actions; +} + +static inline bool ts_language_has_actions(const TSLanguage *self, +                                           TSStateId state, +                                           TSSymbol symbol) { +  TableEntry entry; +  ts_language_table_entry(self, state, symbol, &entry); +  return entry.action_count > 0; +} + +static inline bool ts_language_has_reduce_action(const TSLanguage *self, +                                                 TSStateId state, +                                                 TSSymbol symbol) { +  TableEntry entry; +  ts_language_table_entry(self, state, symbol, &entry); +  return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; +} + +static inline uint16_t ts_language_lookup( +  const TSLanguage *self, +  TSStateId state, +  TSSymbol symbol +) { +  if ( +    self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES && +    state >= self->large_state_count +  ) { +    uint32_t index = self->small_parse_table_map[state - self->large_state_count]; +    const uint16_t *data = &self->small_parse_table[index]; +    uint16_t section_count = *(data++); +    for (unsigned i = 0; i < section_count; i++) { +      uint16_t section_value = *(data++); +      uint16_t 
symbol_count = *(data++); +      for (unsigned i = 0; i < symbol_count; i++) { +        if (*(data++) == symbol) return section_value; +      } +    } +    return 0; +  } else { +    return self->parse_table[state * self->symbol_count + symbol]; +  } +} + +static inline TSStateId ts_language_next_state(const TSLanguage *self, +                                               TSStateId state, +                                               TSSymbol symbol) { +  if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { +    return 0; +  } else if (symbol < self->token_count) { +    uint32_t count; +    const TSParseAction *actions = ts_language_actions(self, state, symbol, &count); +    if (count > 0) { +      TSParseAction action = actions[count - 1]; +      if (action.type == TSParseActionTypeShift || action.type == TSParseActionTypeRecover) { +        return action.params.state; +      } +    } +    return 0; +  } else { +    return ts_language_lookup(self, state, symbol); +  } +} + +static inline const bool * +ts_language_enabled_external_tokens(const TSLanguage *self, +                                    unsigned external_scanner_state) { +  if (external_scanner_state == 0) { +    return NULL; +  } else { +    return self->external_scanner.states + self->external_token_count * external_scanner_state; +  } +} + +static inline const TSSymbol * +ts_language_alias_sequence(const TSLanguage *self, uint32_t production_id) { +  return production_id > 0 ? 
+    self->alias_sequences + production_id * self->max_alias_sequence_length : +    NULL; +} + +static inline void ts_language_field_map( +  const TSLanguage *self, +  uint32_t production_id, +  const TSFieldMapEntry **start, +  const TSFieldMapEntry **end +) { +  if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS || self->field_count == 0) { +    *start = NULL; +    *end = NULL; +    return; +  } + +  TSFieldMapSlice slice = self->field_map_slices[production_id]; +  *start = &self->field_map_entries[slice.index]; +  *end = &self->field_map_entries[slice.index] + slice.length; +} + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_LANGUAGE_H_ diff --git a/src/tree_sitter/length.h b/src/tree_sitter/length.h new file mode 100644 index 0000000000..61de9fc1d5 --- /dev/null +++ b/src/tree_sitter/length.h @@ -0,0 +1,44 @@ +#ifndef TREE_SITTER_LENGTH_H_ +#define TREE_SITTER_LENGTH_H_ + +#include <stdlib.h> +#include <stdbool.h> +#include "./point.h" +#include "tree_sitter/api.h" + +typedef struct { +  uint32_t bytes; +  TSPoint extent; +} Length; + +static const Length LENGTH_UNDEFINED = {0, {0, 1}}; +static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}}; + +static inline bool length_is_undefined(Length length) { +  return length.bytes == 0 && length.extent.column != 0; +} + +static inline Length length_min(Length len1, Length len2) { +  return (len1.bytes < len2.bytes) ? 
len1 : len2; +} + +static inline Length length_add(Length len1, Length len2) { +  Length result; +  result.bytes = len1.bytes + len2.bytes; +  result.extent = point_add(len1.extent, len2.extent); +  return result; +} + +static inline Length length_sub(Length len1, Length len2) { +  Length result; +  result.bytes = len1.bytes - len2.bytes; +  result.extent = point_sub(len1.extent, len2.extent); +  return result; +} + +static inline Length length_zero(void) { +  Length result = {0, {0, 0}}; +  return result; +} + +#endif diff --git a/src/tree_sitter/lexer.c b/src/tree_sitter/lexer.c new file mode 100644 index 0000000000..fdc127466f --- /dev/null +++ b/src/tree_sitter/lexer.c @@ -0,0 +1,322 @@ +#include <stdio.h> +#include "./lexer.h" +#include "./subtree.h" +#include "./length.h" +#include "./utf16.h" +#include "utf8proc.h" + +#define LOG(...)                                                                      \ +  if (self->logger.log) {                                                             \ +    snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ +    self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer);         \ +  } + +#define LOG_CHARACTER(message, character) \ +  LOG(                                    \ +    32 <= character && character < 127 ?  
\ +    message " character:'%c'" :           \ +    message " character:%d", character    \ +  ) + +static const char empty_chunk[3] = { 0, 0 }; + +static const int32_t BYTE_ORDER_MARK = 0xFEFF; + +static void ts_lexer__get_chunk(Lexer *self) { +  self->chunk_start = self->current_position.bytes; +  self->chunk = self->input.read( +    self->input.payload, +    self->current_position.bytes, +    self->current_position.extent, +    &self->chunk_size +  ); +  if (!self->chunk_size) self->chunk = empty_chunk; +} + +typedef utf8proc_ssize_t (*DecodeFunction)( +  const utf8proc_uint8_t *, +  utf8proc_ssize_t, +  utf8proc_int32_t * +); + +static void ts_lexer__get_lookahead(Lexer *self) { +  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start; +  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; +  uint32_t size = self->chunk_size - position_in_chunk; + +  if (size == 0) { +    self->lookahead_size = 1; +    self->data.lookahead = '\0'; +    return; +  } + +  DecodeFunction decode = +    self->input.encoding == TSInputEncodingUTF8 ? utf8proc_iterate : utf16_iterate; + +  self->lookahead_size = decode(chunk, size, &self->data.lookahead); + +  // If this chunk ended in the middle of a multi-byte character, +  // try again with a fresh chunk. 
+  if (self->data.lookahead == -1 && size < 4) { +    ts_lexer__get_chunk(self); +    chunk = (const uint8_t *)self->chunk; +    size = self->chunk_size; +    self->lookahead_size = decode(chunk, size, &self->data.lookahead); +  } + +  if (self->data.lookahead == -1) { +    self->lookahead_size = 1; +  } +} + +static void ts_lexer__advance(TSLexer *payload, bool skip) { +  Lexer *self = (Lexer *)payload; +  if (self->chunk == empty_chunk) +    return; + +  if (self->lookahead_size) { +    self->current_position.bytes += self->lookahead_size; +    if (self->data.lookahead == '\n') { +      self->current_position.extent.row++; +      self->current_position.extent.column = 0; +    } else { +      self->current_position.extent.column += self->lookahead_size; +    } +  } + +  TSRange *current_range = &self->included_ranges[self->current_included_range_index]; +  if (self->current_position.bytes == current_range->end_byte) { +    self->current_included_range_index++; +    if (self->current_included_range_index == self->included_range_count) { +      self->data.lookahead = '\0'; +      self->lookahead_size = 1; +      return; +    } else { +      current_range++; +      self->current_position = (Length) { +        current_range->start_byte, +        current_range->start_point, +      }; +    } +  } + +  if (skip) { +    LOG_CHARACTER("skip", self->data.lookahead); +    self->token_start_position = self->current_position; +  } else { +    LOG_CHARACTER("consume", self->data.lookahead); +  } + +  if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { +    ts_lexer__get_chunk(self); +  } + +  ts_lexer__get_lookahead(self); +} + +static void ts_lexer__mark_end(TSLexer *payload) { +  Lexer *self = (Lexer *)payload; +  TSRange *current_included_range = &self->included_ranges[self->current_included_range_index]; +  if (self->current_included_range_index > 0 && +      self->current_position.bytes == current_included_range->start_byte) { +    TSRange 
*previous_included_range = current_included_range - 1; +    self->token_end_position = (Length) { +      previous_included_range->end_byte, +      previous_included_range->end_point, +    }; +  } else { +    self->token_end_position = self->current_position; +  } +} + +static uint32_t ts_lexer__get_column(TSLexer *payload) { +  Lexer *self = (Lexer *)payload; +  uint32_t goal_byte = self->current_position.bytes; + +  self->current_position.bytes -= self->current_position.extent.column; +  self->current_position.extent.column = 0; + +  if (self->current_position.bytes < self->chunk_start) { +    ts_lexer__get_chunk(self); +  } + +  uint32_t result = 0; +  while (self->current_position.bytes < goal_byte) { +    ts_lexer__advance(payload, false); +    result++; +  } + +  return result; +} + +static bool ts_lexer__is_at_included_range_start(TSLexer *payload) { +  const Lexer *self = (const Lexer *)payload; +  TSRange *current_range = &self->included_ranges[self->current_included_range_index]; +  return self->current_position.bytes == current_range->start_byte; +} + +// The lexer's methods are stored as a struct field so that generated +// parsers can call them without needing to be linked against this library. 
+ +void ts_lexer_init(Lexer *self) { +  *self = (Lexer) { +    .data = { +      .advance = ts_lexer__advance, +      .mark_end = ts_lexer__mark_end, +      .get_column = ts_lexer__get_column, +      .is_at_included_range_start = ts_lexer__is_at_included_range_start, +      .lookahead = 0, +      .result_symbol = 0, +    }, +    .chunk = NULL, +    .chunk_start = 0, +    .current_position = {UINT32_MAX, {0, 0}}, +    .logger = { +      .payload = NULL, +      .log = NULL +    }, +    .current_included_range_index = 0, +  }; + +  self->included_ranges = NULL; +  ts_lexer_set_included_ranges(self, NULL, 0); +  ts_lexer_reset(self, length_zero()); +} + +void ts_lexer_delete(Lexer *self) { +  ts_free(self->included_ranges); +} + +void ts_lexer_set_input(Lexer *self, TSInput input) { +  self->input = input; +  self->data.lookahead = 0; +  self->lookahead_size = 0; +  self->chunk = 0; +  self->chunk_start = 0; +  self->chunk_size = 0; +} + +static void ts_lexer_goto(Lexer *self, Length position) { +  bool found_included_range = false; +  for (unsigned i = 0; i < self->included_range_count; i++) { +    TSRange *included_range = &self->included_ranges[i]; +    if (included_range->end_byte > position.bytes) { +      if (included_range->start_byte > position.bytes) { +        position = (Length) { +          .bytes = included_range->start_byte, +          .extent = included_range->start_point, +        }; +      } + +      self->current_included_range_index = i; +      found_included_range = true; +      break; +    } +  } + +  if (!found_included_range) { +    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; +    position = (Length) { +      .bytes = last_included_range->end_byte, +      .extent = last_included_range->end_point, +    }; +    self->chunk = empty_chunk; +    self->chunk_start = position.bytes; +    self->chunk_size = 2; +  } + +  self->token_start_position = position; +  self->token_end_position = LENGTH_UNDEFINED; +  
self->current_position = position; + +  if (self->chunk && (position.bytes < self->chunk_start || +                      position.bytes >= self->chunk_start + self->chunk_size)) { +    self->chunk = 0; +    self->chunk_start = 0; +    self->chunk_size = 0; +  } + +  self->lookahead_size = 0; +  self->data.lookahead = 0; +} + +void ts_lexer_reset(Lexer *self, Length position) { +  if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position); +} + +void ts_lexer_start(Lexer *self) { +  self->token_start_position = self->current_position; +  self->token_end_position = LENGTH_UNDEFINED; +  self->data.result_symbol = 0; +  if (!self->chunk) ts_lexer__get_chunk(self); +  if (!self->lookahead_size) ts_lexer__get_lookahead(self); +  if ( +    self->current_position.bytes == 0 && +    self->data.lookahead == BYTE_ORDER_MARK +  ) ts_lexer__advance((TSLexer *)self, true); +} + +void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { +  if (length_is_undefined(self->token_end_position)) { +    ts_lexer__mark_end(&self->data); +  } + +  uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; + +  // In order to determine that a byte sequence is invalid UTF8 or UTF16, +  // the character decoding algorithm may have looked at the following byte. +  // Therefore, the next byte *after* the current (invalid) character +  // affects the interpretation of the current character. 
+  if (self->data.lookahead == -1) { +    current_lookahead_end_byte++; +  } + +  if (current_lookahead_end_byte > *lookahead_end_byte) { +    *lookahead_end_byte = current_lookahead_end_byte; +  } +} + +void ts_lexer_advance_to_end(Lexer *self) { +  while (self->data.lookahead != 0) { +    ts_lexer__advance((TSLexer *)self, false); +  } +} + +void ts_lexer_mark_end(Lexer *self) { +  ts_lexer__mark_end(&self->data); +} + +static const TSRange DEFAULT_RANGES[] = { +  { +    .start_point = { +      .row = 0, +      .column = 0, +    }, +    .end_point = { +      .row = UINT32_MAX, +      .column = UINT32_MAX, +    }, +    .start_byte = 0, +    .end_byte = UINT32_MAX +  } +}; + +void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count) { +  if (!ranges) { +    ranges = DEFAULT_RANGES; +    count = 1; +  } + +  size_t sz = count * sizeof(TSRange); +  self->included_ranges = ts_realloc(self->included_ranges, sz); +  memcpy(self->included_ranges, ranges, sz); +  self->included_range_count = count; +  ts_lexer_goto(self, self->current_position); +} + +TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) { +  *count = self->included_range_count; +  return self->included_ranges; +} + +#undef LOG diff --git a/src/tree_sitter/lexer.h b/src/tree_sitter/lexer.h new file mode 100644 index 0000000000..f523d88f65 --- /dev/null +++ b/src/tree_sitter/lexer.h @@ -0,0 +1,48 @@ +#ifndef TREE_SITTER_LEXER_H_ +#define TREE_SITTER_LEXER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./length.h" +#include "./subtree.h" +#include "tree_sitter/api.h" +#include "tree_sitter/parser.h" + +typedef struct { +  TSLexer data; +  Length current_position; +  Length token_start_position; +  Length token_end_position; + +  TSRange * included_ranges; +  size_t included_range_count; +  size_t current_included_range_index; + +  const char *chunk; +  uint32_t chunk_start; +  uint32_t chunk_size; +  uint32_t lookahead_size; + +  TSInput input; +  
TSLogger logger; +  char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; +} Lexer; + +void ts_lexer_init(Lexer *); +void ts_lexer_delete(Lexer *); +void ts_lexer_set_input(Lexer *, TSInput); +void ts_lexer_reset(Lexer *, Length); +void ts_lexer_start(Lexer *); +void ts_lexer_finish(Lexer *, uint32_t *); +void ts_lexer_advance_to_end(Lexer *); +void ts_lexer_mark_end(Lexer *); +void ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); +TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_LEXER_H_ diff --git a/src/tree_sitter/lib.c b/src/tree_sitter/lib.c new file mode 100644 index 0000000000..fc5fbc9210 --- /dev/null +++ b/src/tree_sitter/lib.c @@ -0,0 +1,20 @@ +// The Tree-sitter library can be built by compiling this one source file. +// +// The following directories must be added to the include path: +//   - include +//   - utf8proc + +#define _POSIX_C_SOURCE 200112L +#define UTF8PROC_STATIC + +#include "./get_changed_ranges.c" +#include "./language.c" +#include "./lexer.c" +#include "./node.c" +#include "./parser.c" +#include "./stack.c" +#include "./subtree.c" +#include "./tree_cursor.c" +#include "./tree.c" +#include "./utf16.c" +#include "utf8proc.c" diff --git a/src/tree_sitter/node.c b/src/tree_sitter/node.c new file mode 100644 index 0000000000..6b2be36ee5 --- /dev/null +++ b/src/tree_sitter/node.c @@ -0,0 +1,673 @@ +#include <stdbool.h> +#include "./subtree.h" +#include "./tree.h" +#include "./language.h" + +typedef struct { +  Subtree parent; +  const TSTree *tree; +  Length position; +  uint32_t child_index; +  uint32_t structural_child_index; +  const TSSymbol *alias_sequence; +} NodeChildIterator; + +// TSNode - constructors + +TSNode ts_node_new( +  const TSTree *tree, +  const Subtree *subtree, +  Length position, +  TSSymbol alias +) { +  return (TSNode) { +    {position.bytes, position.extent.row, position.extent.column, alias}, +    
subtree, +    tree, +  }; +} + +static inline TSNode ts_node__null(void) { +  return ts_node_new(NULL, NULL, length_zero(), 0); +} + +// TSNode - accessors + +uint32_t ts_node_start_byte(TSNode self) { +  return self.context[0]; +} + +TSPoint ts_node_start_point(TSNode self) { +  return (TSPoint) {self.context[1], self.context[2]}; +} + +static inline uint32_t ts_node__alias(const TSNode *self) { +  return self->context[3]; +} + +static inline Subtree ts_node__subtree(TSNode self) { +  return *(const Subtree *)self.id; +} + +// NodeChildIterator + +static inline NodeChildIterator ts_node_iterate_children(const TSNode *node) { +  Subtree subtree = ts_node__subtree(*node); +  if (ts_subtree_child_count(subtree) == 0) { +    return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL}; +  } +  const TSSymbol *alias_sequence = ts_language_alias_sequence( +    node->tree->language, +    subtree.ptr->production_id +  ); +  return (NodeChildIterator) { +    .tree = node->tree, +    .parent = subtree, +    .position = {ts_node_start_byte(*node), ts_node_start_point(*node)}, +    .child_index = 0, +    .structural_child_index = 0, +    .alias_sequence = alias_sequence, +  }; +} + +static inline bool ts_node_child_iterator_done(NodeChildIterator *self) { +  return self->child_index == self->parent.ptr->child_count; +} + +static inline bool ts_node_child_iterator_next( +  NodeChildIterator *self, +  TSNode *result +) { +  if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false; +  const Subtree *child = &self->parent.ptr->children[self->child_index]; +  TSSymbol alias_symbol = 0; +  if (!ts_subtree_extra(*child)) { +    if (self->alias_sequence) { +      alias_symbol = self->alias_sequence[self->structural_child_index]; +    } +    self->structural_child_index++; +  } +  if (self->child_index > 0) { +    self->position = length_add(self->position, ts_subtree_padding(*child)); +  } +  *result = ts_node_new( +    self->tree, +    child, +    
self->position, +    alias_symbol +  ); +  self->position = length_add(self->position, ts_subtree_size(*child)); +  self->child_index++; +  return true; +} + +// TSNode - private + +static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) { +  Subtree tree = ts_node__subtree(self); +  if (include_anonymous) { +    return ts_subtree_visible(tree) || ts_node__alias(&self); +  } else { +    TSSymbol alias = ts_node__alias(&self); +    if (alias) { +      return ts_language_symbol_metadata(self.tree->language, alias).named; +    } else { +      return ts_subtree_visible(tree) && ts_subtree_named(tree); +    } +  } +} + +static inline uint32_t ts_node__relevant_child_count( +  TSNode self, +  bool include_anonymous +) { +  Subtree tree = ts_node__subtree(self); +  if (ts_subtree_child_count(tree) > 0) { +    if (include_anonymous) { +      return tree.ptr->visible_child_count; +    } else { +      return tree.ptr->named_child_count; +    } +  } else { +    return 0; +  } +} + +static inline TSNode ts_node__child( +  TSNode self, +  uint32_t child_index, +  bool include_anonymous +) { +  TSNode result = self; +  bool did_descend = true; + +  while (did_descend) { +    did_descend = false; + +    TSNode child; +    uint32_t index = 0; +    NodeChildIterator iterator = ts_node_iterate_children(&result); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      if (ts_node__is_relevant(child, include_anonymous)) { +        if (index == child_index) { +          ts_tree_set_cached_parent(self.tree, &child, &self); +          return child; +        } +        index++; +      } else { +        uint32_t grandchild_index = child_index - index; +        uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous); +        if (grandchild_index < grandchild_count) { +          did_descend = true; +          result = child; +          child_index = grandchild_index; +          break; +        } +        index += 
grandchild_count; +      } +    } +  } + +  return ts_node__null(); +} + +static bool ts_subtree_has_trailing_empty_descendant( +  Subtree self, +  Subtree other +) { +  for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) { +    Subtree child = self.ptr->children[i]; +    if (ts_subtree_total_bytes(child) > 0) break; +    if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) { +      return true; +    } +  } +  return false; +} + +static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) { +  Subtree self_subtree = ts_node__subtree(self); +  bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0; +  uint32_t target_end_byte = ts_node_end_byte(self); + +  TSNode node = ts_node_parent(self); +  TSNode earlier_node = ts_node__null(); +  bool earlier_node_is_relevant = false; + +  while (!ts_node_is_null(node)) { +    TSNode earlier_child = ts_node__null(); +    bool earlier_child_is_relevant = false; +    bool found_child_containing_target = false; + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      if (child.id == self.id) break; +      if (iterator.position.bytes > target_end_byte) { +        found_child_containing_target = true; +        break; +      } + +      if (iterator.position.bytes == target_end_byte && +          (!self_is_empty || +           ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) { +        found_child_containing_target = true; +        break; +      } + +      if (ts_node__is_relevant(child, include_anonymous)) { +        earlier_child = child; +        earlier_child_is_relevant = true; +      } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { +        earlier_child = child; +        earlier_child_is_relevant = false; +      } +    } + +    if (found_child_containing_target) { +      if 
(!ts_node_is_null(earlier_child)) { +        earlier_node = earlier_child; +        earlier_node_is_relevant = earlier_child_is_relevant; +      } +      node = child; +    } else if (earlier_child_is_relevant) { +      return earlier_child; +    } else if (!ts_node_is_null(earlier_child)) { +      node = earlier_child; +    } else if (earlier_node_is_relevant) { +      return earlier_node; +    } else { +      node = earlier_node; +    } +  } + +  return ts_node__null(); +} + +static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) { +  uint32_t target_end_byte = ts_node_end_byte(self); + +  TSNode node = ts_node_parent(self); +  TSNode later_node = ts_node__null(); +  bool later_node_is_relevant = false; + +  while (!ts_node_is_null(node)) { +    TSNode later_child = ts_node__null(); +    bool later_child_is_relevant = false; +    TSNode child_containing_target = ts_node__null(); + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      if (iterator.position.bytes < target_end_byte) continue; +      if (ts_node_start_byte(child) <= ts_node_start_byte(self)) { +        if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) { +          child_containing_target = child; +        } +      } else if (ts_node__is_relevant(child, include_anonymous)) { +        later_child = child; +        later_child_is_relevant = true; +        break; +      } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { +        later_child = child; +        later_child_is_relevant = false; +        break; +      } +    } + +    if (!ts_node_is_null(child_containing_target)) { +      if (!ts_node_is_null(later_child)) { +        later_node = later_child; +        later_node_is_relevant = later_child_is_relevant; +      } +      node = child_containing_target; +    } else if (later_child_is_relevant) { +      return later_child; +    } else if 
(!ts_node_is_null(later_child)) { +      node = later_child; +    } else if (later_node_is_relevant) { +      return later_node; +    } else { +      node = later_node; +    } +  } + +  return ts_node__null(); +} + +static inline TSNode ts_node__first_child_for_byte( +  TSNode self, +  uint32_t goal, +  bool include_anonymous +) { +  TSNode node = self; +  bool did_descend = true; + +  while (did_descend) { +    did_descend = false; + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      if (ts_node_end_byte(child) > goal) { +        if (ts_node__is_relevant(child, include_anonymous)) { +          return child; +        } else if (ts_node_child_count(child) > 0) { +          did_descend = true; +          node = child; +          break; +        } +      } +    } +  } + +  return ts_node__null(); +} + +static inline TSNode ts_node__descendant_for_byte_range( +  TSNode self, +  uint32_t range_start, +  uint32_t range_end, +  bool include_anonymous +) { +  TSNode node = self; +  TSNode last_visible_node = self; + +  bool did_descend = true; +  while (did_descend) { +    did_descend = false; + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      uint32_t node_end = iterator.position.bytes; + +      // The end of this node must extend far enough forward to touch +      // the end of the range and exceed the start of the range. +      if (node_end < range_end) continue; +      if (node_end <= range_start) continue; + +      // The start of this node must extend far enough backward to +      // touch the start of the range. 
+      if (range_start < ts_node_start_byte(child)) break; + +      node = child; +      if (ts_node__is_relevant(node, include_anonymous)) { +        ts_tree_set_cached_parent(self.tree, &child, &last_visible_node); +        last_visible_node = node; +      } +      did_descend = true; +      break; +    } +  } + +  return last_visible_node; +} + +static inline TSNode ts_node__descendant_for_point_range( +  TSNode self, +  TSPoint range_start, +  TSPoint range_end, +  bool include_anonymous +) { +  TSNode node = self; +  TSNode last_visible_node = self; + +  bool did_descend = true; +  while (did_descend) { +    did_descend = false; + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      TSPoint node_end = iterator.position.extent; + +      // The end of this node must extend far enough forward to touch +      // the end of the range and exceed the start of the range. +      if (point_lt(node_end, range_end)) continue; +      if (point_lte(node_end, range_start)) continue; + +      // The start of this node must extend far enough backward to +      // touch the start of the range. +      if (point_lt(range_start, ts_node_start_point(child))) break; + +      node = child; +      if (ts_node__is_relevant(node, include_anonymous)) { +        ts_tree_set_cached_parent(self.tree, &child, &last_visible_node); +        last_visible_node = node; +      } +      did_descend = true; +      break; +    } +  } + +  return last_visible_node; +} + +// TSNode - public + +uint32_t ts_node_end_byte(TSNode self) { +  return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes; +} + +TSPoint ts_node_end_point(TSNode self) { +  return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent); +} + +TSSymbol ts_node_symbol(TSNode self) { +  return ts_node__alias(&self) +    ? 
ts_node__alias(&self) +    : ts_subtree_symbol(ts_node__subtree(self)); +} + +const char *ts_node_type(TSNode self) { +  return ts_language_symbol_name(self.tree->language, ts_node_symbol(self)); +} + +char *ts_node_string(TSNode self) { +  return ts_subtree_string(ts_node__subtree(self), self.tree->language, false); +} + +bool ts_node_eq(TSNode self, TSNode other) { +  return self.tree == other.tree && self.id == other.id; +} + +bool ts_node_is_null(TSNode self) { +  return self.id == 0; +} + +bool ts_node_is_extra(TSNode self) { +  return ts_subtree_extra(ts_node__subtree(self)); +} + +bool ts_node_is_named(TSNode self) { +  TSSymbol alias = ts_node__alias(&self); +  return alias +    ? ts_language_symbol_metadata(self.tree->language, alias).named +    : ts_subtree_named(ts_node__subtree(self)); +} + +bool ts_node_is_missing(TSNode self) { +  return ts_subtree_missing(ts_node__subtree(self)); +} + +bool ts_node_has_changes(TSNode self) { +  return ts_subtree_has_changes(ts_node__subtree(self)); +} + +bool ts_node_has_error(TSNode self) { +  return ts_subtree_error_cost(ts_node__subtree(self)) > 0; +} + +TSNode ts_node_parent(TSNode self) { +  TSNode node = ts_tree_get_cached_parent(self.tree, &self); +  if (node.id) return node; + +  node = ts_tree_root_node(self.tree); +  uint32_t end_byte = ts_node_end_byte(self); +  if (node.id == self.id) return ts_node__null(); + +  TSNode last_visible_node = node; +  bool did_descend = true; +  while (did_descend) { +    did_descend = false; + +    TSNode child; +    NodeChildIterator iterator = ts_node_iterate_children(&node); +    while (ts_node_child_iterator_next(&iterator, &child)) { +      if ( +        ts_node_start_byte(child) > ts_node_start_byte(self) || +        child.id == self.id +      ) break; +      if (iterator.position.bytes >= end_byte) { +        node = child; +        if (ts_node__is_relevant(child, true)) { +          ts_tree_set_cached_parent(self.tree, &node, &last_visible_node); +          
last_visible_node = node; +        } +        did_descend = true; +        break; +      } +    } +  } + +  return last_visible_node; +} + +TSNode ts_node_child(TSNode self, uint32_t child_index) { +  return ts_node__child(self, child_index, true); +} + +TSNode ts_node_named_child(TSNode self, uint32_t child_index) { +  return ts_node__child(self, child_index, false); +} + +TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) { +recur: +  if (!field_id || ts_node_child_count(self) == 0) return ts_node__null(); + +  const TSFieldMapEntry *field_map, *field_map_end; +  ts_language_field_map( +    self.tree->language, +    ts_node__subtree(self).ptr->production_id, +    &field_map, +    &field_map_end +  ); +  if (field_map == field_map_end) return ts_node__null(); + +  // The field mappings are sorted by their field id. Scan all +  // the mappings to find the ones for the given field id. +  while (field_map->field_id < field_id) { +    field_map++; +    if (field_map == field_map_end) return ts_node__null(); +  } +  while (field_map_end[-1].field_id > field_id) { +    field_map_end--; +    if (field_map == field_map_end) return ts_node__null(); +  } + +  TSNode child; +  NodeChildIterator iterator = ts_node_iterate_children(&self); +  while (ts_node_child_iterator_next(&iterator, &child)) { +    if (!ts_subtree_extra(ts_node__subtree(child))) { +      uint32_t index = iterator.structural_child_index - 1; +      if (index < field_map->child_index) continue; + +      // Hidden nodes' fields are "inherited" by their visible parent. +      if (field_map->inherited) { + +        // If this is the *last* possible child node for this field, +        // then perform a tail call to avoid recursion. +        if (field_map + 1 == field_map_end) { +          self = child; +          goto recur; +        } + +        // Otherwise, descend into this child, but if it doesn't contain +        // the field, continue searching subsequent children. 
+        else { +          TSNode result = ts_node_child_by_field_id(child, field_id); +          if (result.id) return result; +          field_map++; +          if (field_map == field_map_end) return ts_node__null(); +        } +      } + +      else if (ts_node__is_relevant(child, true)) { +        return child; +      } + +      // If the field refers to a hidden node, return its first visible +      // child. +      else { +        return ts_node_child(child, 0); +      } +    } +  } + +  return ts_node__null(); +} + +TSNode ts_node_child_by_field_name( +  TSNode self, +  const char *name, +  uint32_t name_length +) { +  TSFieldId field_id = ts_language_field_id_for_name( +    self.tree->language, +    name, +    name_length +  ); +  return ts_node_child_by_field_id(self, field_id); +} + +uint32_t ts_node_child_count(TSNode self) { +  Subtree tree = ts_node__subtree(self); +  if (ts_subtree_child_count(tree) > 0) { +    return tree.ptr->visible_child_count; +  } else { +    return 0; +  } +} + +uint32_t ts_node_named_child_count(TSNode self) { +  Subtree tree = ts_node__subtree(self); +  if (ts_subtree_child_count(tree) > 0) { +    return tree.ptr->named_child_count; +  } else { +    return 0; +  } +} + +TSNode ts_node_next_sibling(TSNode self) { +  return ts_node__next_sibling(self, true); +} + +TSNode ts_node_next_named_sibling(TSNode self) { +  return ts_node__next_sibling(self, false); +} + +TSNode ts_node_prev_sibling(TSNode self) { +  return ts_node__prev_sibling(self, true); +} + +TSNode ts_node_prev_named_sibling(TSNode self) { +  return ts_node__prev_sibling(self, false); +} + +TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) { +  return ts_node__first_child_for_byte(self, byte, true); +} + +TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) { +  return ts_node__first_child_for_byte(self, byte, false); +} + +TSNode ts_node_descendant_for_byte_range( +  TSNode self, +  uint32_t start, +  uint32_t end +) { +  return 
ts_node__descendant_for_byte_range(self, start, end, true); +} + +TSNode ts_node_named_descendant_for_byte_range( +  TSNode self, +  uint32_t start, +  uint32_t end +) { +  return ts_node__descendant_for_byte_range(self, start, end, false); +} + +TSNode ts_node_descendant_for_point_range( +  TSNode self, +  TSPoint start, +  TSPoint end +) { +  return ts_node__descendant_for_point_range(self, start, end, true); +} + +TSNode ts_node_named_descendant_for_point_range( +  TSNode self, +  TSPoint start, +  TSPoint end +) { +  return ts_node__descendant_for_point_range(self, start, end, false); +} + +void ts_node_edit(TSNode *self, const TSInputEdit *edit) { +  uint32_t start_byte = ts_node_start_byte(*self); +  TSPoint start_point = ts_node_start_point(*self); + +  if (start_byte >= edit->old_end_byte) { +    start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte); +    start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point)); +  } else if (start_byte > edit->start_byte) { +    start_byte = edit->new_end_byte; +    start_point = edit->new_end_point; +  } + +  self->context[0] = start_byte; +  self->context[1] = start_point.row; +  self->context[2] = start_point.column; +} diff --git a/src/tree_sitter/parser.c b/src/tree_sitter/parser.c new file mode 100644 index 0000000000..88b20845fd --- /dev/null +++ b/src/tree_sitter/parser.c @@ -0,0 +1,1887 @@ +#include <time.h> +#include <assert.h> +#include <stdio.h> +#include <limits.h> +#include <stdbool.h> +#include "tree_sitter/api.h" +#include "./alloc.h" +#include "./array.h" +#include "./atomic.h" +#include "./clock.h" +#include "./error_costs.h" +#include "./get_changed_ranges.h" +#include "./language.h" +#include "./length.h" +#include "./lexer.h" +#include "./reduce_action.h" +#include "./reusable_node.h" +#include "./stack.h" +#include "./subtree.h" +#include "./tree.h" + +#define LOG(...)                                                                            
\ +  if (self->lexer.logger.log || self->dot_graph_file) {                                     \ +    snprintf(self->lexer.debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ +    ts_parser__log(self);                                                                   \ +  } + +#define LOG_STACK()                                                              \ +  if (self->dot_graph_file) {                                                    \ +    ts_stack_print_dot_graph(self->stack, self->language, self->dot_graph_file); \ +    fputs("\n\n", self->dot_graph_file);                                         \ +  } + +#define LOG_TREE(tree)                                                      \ +  if (self->dot_graph_file) {                                               \ +    ts_subtree_print_dot_graph(tree, self->language, self->dot_graph_file); \ +    fputs("\n", self->dot_graph_file);                                      \ +  } + +#define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) + +#define TREE_NAME(tree) SYM_NAME(ts_subtree_symbol(tree)) + +static const unsigned MAX_VERSION_COUNT = 6; +static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4; +static const unsigned MAX_SUMMARY_DEPTH = 16; +static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; +static const unsigned OP_COUNT_PER_TIMEOUT_CHECK = 100; + +typedef struct { +  Subtree token; +  Subtree last_external_token; +  uint32_t byte_index; +} TokenCache; + +struct TSParser { +  Lexer lexer; +  Stack *stack; +  SubtreePool tree_pool; +  const TSLanguage *language; +  ReduceActionSet reduce_actions; +  Subtree finished_tree; +  SubtreeHeapData scratch_tree_data; +  MutableSubtree scratch_tree; +  TokenCache token_cache; +  ReusableNode reusable_node; +  void *external_scanner_payload; +  FILE *dot_graph_file; +  TSClock end_clock; +  TSDuration timeout_duration; +  unsigned accept_count; +  unsigned operation_count; +  const volatile size_t 
*cancellation_flag; +  bool halt_on_error; +  Subtree old_tree; +  TSRangeArray included_range_differences; +  unsigned included_range_difference_index; +}; + +typedef struct { +  unsigned cost; +  unsigned node_count; +  int dynamic_precedence; +  bool is_in_error; +} ErrorStatus; + +typedef enum { +  ErrorComparisonTakeLeft, +  ErrorComparisonPreferLeft, +  ErrorComparisonNone, +  ErrorComparisonPreferRight, +  ErrorComparisonTakeRight, +} ErrorComparison; + +typedef struct { +  const char *string; +  uint32_t length; +} TSStringInput; + +// StringInput + +static const char *ts_string_input_read( +  void *_self, +  uint32_t byte, +  TSPoint _, +  uint32_t *length +) { +  TSStringInput *self = (TSStringInput *)_self; +  if (byte >= self->length) { +    *length = 0; +    return ""; +  } else { +    *length = self->length - byte; +    return self->string + byte; +  } +} + +// Parser - Private + +static void ts_parser__log(TSParser *self) { +  if (self->lexer.logger.log) { +    self->lexer.logger.log( +      self->lexer.logger.payload, +      TSLogTypeParse, +      self->lexer.debug_buffer +    ); +  } + +  if (self->dot_graph_file) { +    fprintf(self->dot_graph_file, "graph {\nlabel=\""); +    for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) { +      if (*c == '"') fputc('\\', self->dot_graph_file); +      fputc(*c, self->dot_graph_file); +    } +    fprintf(self->dot_graph_file, "\"\n}\n\n"); +  } +} + +static bool ts_parser__breakdown_top_of_stack( +  TSParser *self, +  StackVersion version +) { +  bool did_break_down = false; +  bool pending = false; + +  do { +    StackSliceArray pop = ts_stack_pop_pending(self->stack, version); +    if (!pop.size) break; + +    did_break_down = true; +    pending = false; +    for (uint32_t i = 0; i < pop.size; i++) { +      StackSlice slice = pop.contents[i]; +      TSStateId state = ts_stack_state(self->stack, slice.version); +      Subtree parent = *array_front(&slice.subtrees); + +      for (uint32_t j = 0, n = 
ts_subtree_child_count(parent); j < n; j++) { +        Subtree child = parent.ptr->children[j]; +        pending = ts_subtree_child_count(child) > 0; + +        if (ts_subtree_is_error(child)) { +          state = ERROR_STATE; +        } else if (!ts_subtree_extra(child)) { +          state = ts_language_next_state(self->language, state, ts_subtree_symbol(child)); +        } + +        ts_subtree_retain(child); +        ts_stack_push(self->stack, slice.version, child, pending, state); +      } + +      for (uint32_t j = 1; j < slice.subtrees.size; j++) { +        Subtree tree = slice.subtrees.contents[j]; +        ts_stack_push(self->stack, slice.version, tree, false, state); +      } + +      ts_subtree_release(&self->tree_pool, parent); +      array_delete(&slice.subtrees); + +      LOG("breakdown_top_of_stack tree:%s", TREE_NAME(parent)); +      LOG_STACK(); +    } +  } while (pending); + +  return did_break_down; +} + +static void ts_parser__breakdown_lookahead( +  TSParser *self, +  Subtree *lookahead, +  TSStateId state, +  ReusableNode *reusable_node +) { +  bool did_descend = false; +  Subtree tree = reusable_node_tree(reusable_node); +  while (ts_subtree_child_count(tree) > 0 && ts_subtree_parse_state(tree) != state) { +    LOG("state_mismatch sym:%s", TREE_NAME(tree)); +    reusable_node_descend(reusable_node); +    tree = reusable_node_tree(reusable_node); +    did_descend = true; +  } + +  if (did_descend) { +    ts_subtree_release(&self->tree_pool, *lookahead); +    *lookahead = tree; +    ts_subtree_retain(*lookahead); +  } +} + +static ErrorComparison ts_parser__compare_versions( +  TSParser *self, +  ErrorStatus a, +  ErrorStatus b +) { +  if (!a.is_in_error && b.is_in_error) { +    if (a.cost < b.cost) { +      return ErrorComparisonTakeLeft; +    } else { +      return ErrorComparisonPreferLeft; +    } +  } + +  if (a.is_in_error && !b.is_in_error) { +    if (b.cost < a.cost) { +      return ErrorComparisonTakeRight; +    } else { +      return 
ErrorComparisonPreferRight; +    } +  } + +  if (a.cost < b.cost) { +    if ((b.cost - a.cost) * (1 + a.node_count) > MAX_COST_DIFFERENCE) { +      return ErrorComparisonTakeLeft; +    } else { +      return ErrorComparisonPreferLeft; +    } +  } + +  if (b.cost < a.cost) { +    if ((a.cost - b.cost) * (1 + b.node_count) > MAX_COST_DIFFERENCE) { +      return ErrorComparisonTakeRight; +    } else { +      return ErrorComparisonPreferRight; +    } +  } + +  if (a.dynamic_precedence > b.dynamic_precedence) return ErrorComparisonPreferLeft; +  if (b.dynamic_precedence > a.dynamic_precedence) return ErrorComparisonPreferRight; +  return ErrorComparisonNone; +} + +static ErrorStatus ts_parser__version_status( +  TSParser *self, +  StackVersion version +) { +  unsigned cost = ts_stack_error_cost(self->stack, version); +  bool is_paused = ts_stack_is_paused(self->stack, version); +  if (is_paused) cost += ERROR_COST_PER_SKIPPED_TREE; +  return (ErrorStatus) { +    .cost = cost, +    .node_count = ts_stack_node_count_since_error(self->stack, version), +    .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), +    .is_in_error = is_paused || ts_stack_state(self->stack, version) == ERROR_STATE +  }; +} + +static bool ts_parser__better_version_exists( +  TSParser *self, +  StackVersion version, +  bool is_in_error, +  unsigned cost +) { +  if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) <= cost) { +    return true; +  } + +  Length position = ts_stack_position(self->stack, version); +  ErrorStatus status = { +    .cost = cost, +    .is_in_error = is_in_error, +    .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), +    .node_count = ts_stack_node_count_since_error(self->stack, version), +  }; + +  for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { +    if (i == version || +        !ts_stack_is_active(self->stack, i) || +        ts_stack_position(self->stack, i).bytes < 
position.bytes) continue; +    ErrorStatus status_i = ts_parser__version_status(self, i); +    switch (ts_parser__compare_versions(self, status, status_i)) { +      case ErrorComparisonTakeRight: +        return true; +      case ErrorComparisonPreferRight: +        if (ts_stack_can_merge(self->stack, i, version)) return true; +      default: +        break; +    } +  } + +  return false; +} + +static void ts_parser__restore_external_scanner( +  TSParser *self, +  Subtree external_token +) { +  if (external_token.ptr) { +    self->language->external_scanner.deserialize( +      self->external_scanner_payload, +      ts_external_scanner_state_data(&external_token.ptr->external_scanner_state), +      external_token.ptr->external_scanner_state.length +    ); +  } else { +    self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); +  } +} + +static bool ts_parser__can_reuse_first_leaf( +  TSParser *self, +  TSStateId state, +  Subtree tree, +  TableEntry *table_entry +) { +  TSLexMode current_lex_mode = self->language->lex_modes[state]; +  TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree); +  TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); +  TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; + +  // If the token was created in a state with the same set of lookaheads, it is reusable. +  if ( +    table_entry->action_count > 0 && +    memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexMode)) == 0 && +    ( +      leaf_symbol != self->language->keyword_capture_token || +      (!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state) +    ) +  ) return true; + +  // Empty tokens are not reusable in states with different lookaheads. +  if (ts_subtree_size(tree).bytes == 0 && leaf_symbol != ts_builtin_sym_end) return false; + +  // If the current state allows external tokens or other tokens that conflict with this +  // token, this token is not reusable. 
+  return current_lex_mode.external_lex_state == 0 && table_entry->is_reusable; +} + +static Subtree ts_parser__lex( +  TSParser *self, +  StackVersion version, +  TSStateId parse_state +) { +  Length start_position = ts_stack_position(self->stack, version); +  Subtree external_token = ts_stack_last_external_token(self->stack, version); +  TSLexMode lex_mode = self->language->lex_modes[parse_state]; +  const bool *valid_external_tokens = ts_language_enabled_external_tokens( +    self->language, +    lex_mode.external_lex_state +  ); + +  bool found_external_token = false; +  bool error_mode = parse_state == ERROR_STATE; +  bool skipped_error = false; +  int32_t first_error_character = 0; +  Length error_start_position = length_zero(); +  Length error_end_position = length_zero(); +  uint32_t lookahead_end_byte = 0; +  ts_lexer_reset(&self->lexer, start_position); + +  for (;;) { +    Length current_position = self->lexer.current_position; + +    if (valid_external_tokens) { +      LOG( +        "lex_external state:%d, row:%u, column:%u", +        lex_mode.external_lex_state, +        current_position.extent.row + 1, +        current_position.extent.column +      ); +      ts_lexer_start(&self->lexer); +      ts_parser__restore_external_scanner(self, external_token); +      bool found_token = self->language->external_scanner.scan( +        self->external_scanner_payload, +        &self->lexer.data, +        valid_external_tokens +      ); +      ts_lexer_finish(&self->lexer, &lookahead_end_byte); + +      // Zero-length external tokens are generally allowed, but they're not +      // allowed right after a syntax error. This is for two reasons: +      // 1. After a syntax error, the lexer is looking for any possible token, +      //    as opposed to the specific set of tokens that are valid in some +      //    parse state. In this situation, it's very easy for an external +      //    scanner to produce unwanted zero-length tokens. +      // 2. 
The parser sometimes inserts *missing* tokens to recover from +      //    errors. These tokens are also zero-length. If we allow more +      //    zero-length tokens to be created after missing tokens, it +      //    can lead to infinite loops. Forbidding zero-length tokens +      //    right at the point of error recovery is a conservative strategy +      //    for preventing this kind of infinite loop. +      if (found_token && ( +        self->lexer.token_end_position.bytes > current_position.bytes || +        (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) +      )) { +        found_external_token = true; +        break; +      } + +      ts_lexer_reset(&self->lexer, current_position); +    } + +    LOG( +      "lex_internal state:%d, row:%u, column:%u", +      lex_mode.lex_state, +      current_position.extent.row + 1, +      current_position.extent.column +    ); +    ts_lexer_start(&self->lexer); +    bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); +    ts_lexer_finish(&self->lexer, &lookahead_end_byte); +    if (found_token) break; + +    if (!error_mode) { +      error_mode = true; +      lex_mode = self->language->lex_modes[ERROR_STATE]; +      valid_external_tokens = ts_language_enabled_external_tokens( +        self->language, +        lex_mode.external_lex_state +      ); +      ts_lexer_reset(&self->lexer, start_position); +      continue; +    } + +    if (!skipped_error) { +      LOG("skip_unrecognized_character"); +      skipped_error = true; +      error_start_position = self->lexer.token_start_position; +      error_end_position = self->lexer.token_start_position; +      first_error_character = self->lexer.data.lookahead; +    } + +    if (self->lexer.current_position.bytes == error_end_position.bytes) { +      if (self->lexer.data.lookahead == 0) { +        self->lexer.data.result_symbol = ts_builtin_sym_error; +        break; +      } +      self->lexer.data.advance(&self->lexer.data, 
false); +    } + +    error_end_position = self->lexer.current_position; +  } + +  Subtree result; +  if (skipped_error) { +    Length padding = length_sub(error_start_position, start_position); +    Length size = length_sub(error_end_position, error_start_position); +    uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes; +    result = ts_subtree_new_error( +      &self->tree_pool, +      first_error_character, +      padding, +      size, +      lookahead_bytes, +      parse_state, +      self->language +    ); + +    LOG( +      "lexed_lookahead sym:%s, size:%u, character:'%c'", +      SYM_NAME(ts_subtree_symbol(result)), +      ts_subtree_total_size(result).bytes, +      first_error_character +    ); +  } else { +    if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { +      self->lexer.token_start_position = self->lexer.token_end_position; +    } + +    bool is_keyword = false; +    TSSymbol symbol = self->lexer.data.result_symbol; +    Length padding = length_sub(self->lexer.token_start_position, start_position); +    Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); +    uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes; + +    if (found_external_token) { +      symbol = self->language->external_scanner.symbol_map[symbol]; +    } else if (symbol == self->language->keyword_capture_token && symbol != 0) { +      uint32_t end_byte = self->lexer.token_end_position.bytes; +      ts_lexer_reset(&self->lexer, self->lexer.token_start_position); +      ts_lexer_start(&self->lexer); +      if ( +        self->language->keyword_lex_fn(&self->lexer.data, 0) && +        self->lexer.token_end_position.bytes == end_byte && +        ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) +      ) { +        is_keyword = true; +        symbol = self->lexer.data.result_symbol; +      } +    } + +    result = 
ts_subtree_new_leaf( +      &self->tree_pool, +      symbol, +      padding, +      size, +      lookahead_bytes, +      parse_state, +      found_external_token, +      is_keyword, +      self->language +    ); + +    if (found_external_token) { +      unsigned length = self->language->external_scanner.serialize( +        self->external_scanner_payload, +        self->lexer.debug_buffer +      ); +      ts_external_scanner_state_init( +        &((SubtreeHeapData *)result.ptr)->external_scanner_state, +        self->lexer.debug_buffer, +        length +      ); +    } + +    LOG( +      "lexed_lookahead sym:%s, size:%u", +      SYM_NAME(ts_subtree_symbol(result)), +      ts_subtree_total_size(result).bytes +    ); +  } + +  return result; +} + +static Subtree ts_parser__get_cached_token( +  TSParser *self, +  TSStateId state, +  size_t position, +  Subtree last_external_token, +  TableEntry *table_entry +) { +  TokenCache *cache = &self->token_cache; +  if ( +    cache->token.ptr && cache->byte_index == position && +    ts_subtree_external_scanner_state_eq(cache->last_external_token, last_external_token) +  ) { +    ts_language_table_entry(self->language, state, ts_subtree_symbol(cache->token), table_entry); +    if (ts_parser__can_reuse_first_leaf(self, state, cache->token, table_entry)) { +      ts_subtree_retain(cache->token); +      return cache->token; +    } +  } +  return NULL_SUBTREE; +} + +static void ts_parser__set_cached_token( +  TSParser *self, +  size_t byte_index, +  Subtree last_external_token, +  Subtree token +) { +  TokenCache *cache = &self->token_cache; +  if (token.ptr) ts_subtree_retain(token); +  if (last_external_token.ptr) ts_subtree_retain(last_external_token); +  if (cache->token.ptr) ts_subtree_release(&self->tree_pool, cache->token); +  if (cache->last_external_token.ptr) ts_subtree_release(&self->tree_pool, cache->last_external_token); +  cache->token = token; +  cache->byte_index = byte_index; +  cache->last_external_token = 
last_external_token; +} + +static bool ts_parser__has_included_range_difference( +  const TSParser *self, +  uint32_t start_position, +  uint32_t end_position +) { +  return ts_range_array_intersects( +    &self->included_range_differences, +    self->included_range_difference_index, +    start_position, +    end_position +  ); +} + +static Subtree ts_parser__reuse_node( +  TSParser *self, +  StackVersion version, +  TSStateId *state, +  uint32_t position, +  Subtree last_external_token, +  TableEntry *table_entry +) { +  Subtree result; +  while ((result = reusable_node_tree(&self->reusable_node)).ptr) { +    uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node); +    uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result); + +    if (byte_offset > position) { +      LOG("before_reusable_node symbol:%s", TREE_NAME(result)); +      break; +    } + +    if (byte_offset < position) { +      LOG("past_reusable_node symbol:%s", TREE_NAME(result)); +      if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node)) { +        reusable_node_advance(&self->reusable_node); +      } +      continue; +    } + +    if (!ts_subtree_external_scanner_state_eq(self->reusable_node.last_external_token, last_external_token)) { +      LOG("reusable_node_has_different_external_scanner_state symbol:%s", TREE_NAME(result)); +      reusable_node_advance(&self->reusable_node); +      continue; +    } + +    const char *reason = NULL; +    if (ts_subtree_has_changes(result)) { +      reason = "has_changes"; +    } else if (ts_subtree_is_error(result)) { +      reason = "is_error"; +    } else if (ts_subtree_missing(result)) { +      reason = "is_missing"; +    } else if (ts_subtree_is_fragile(result)) { +      reason = "is_fragile"; +    } else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset)) { +      reason = "contains_different_included_range"; +    } + +    if (reason) { +      LOG("cant_reuse_node_%s 
tree:%s", reason, TREE_NAME(result)); +      if (!reusable_node_descend(&self->reusable_node)) { +        reusable_node_advance(&self->reusable_node); +        ts_parser__breakdown_top_of_stack(self, version); +        *state = ts_stack_state(self->stack, version); +      } +      continue; +    } + +    TSSymbol leaf_symbol = ts_subtree_leaf_symbol(result); +    ts_language_table_entry(self->language, *state, leaf_symbol, table_entry); +    if (!ts_parser__can_reuse_first_leaf(self, *state, result, table_entry)) { +      LOG( +        "cant_reuse_node symbol:%s, first_leaf_symbol:%s", +        TREE_NAME(result), +        SYM_NAME(leaf_symbol) +      ); +      reusable_node_advance_past_leaf(&self->reusable_node); +      break; +    } + +    LOG("reuse_node symbol:%s", TREE_NAME(result)); +    ts_subtree_retain(result); +    return result; +  } + +  return NULL_SUBTREE; +} + +static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) { +  if (!left.ptr) return true; +  if (!right.ptr) return false; + +  if (ts_subtree_error_cost(right) < ts_subtree_error_cost(left)) { +    LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); +    return true; +  } + +  if (ts_subtree_error_cost(left) < ts_subtree_error_cost(right)) { +    LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); +    return false; +  } + +  if (ts_subtree_dynamic_precedence(right) > ts_subtree_dynamic_precedence(left)) { +    LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", +        TREE_NAME(right), ts_subtree_dynamic_precedence(right), TREE_NAME(left), +        ts_subtree_dynamic_precedence(left)); +    return true; +  } + +  if (ts_subtree_dynamic_precedence(left) > ts_subtree_dynamic_precedence(right)) { +    LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", +        TREE_NAME(left), ts_subtree_dynamic_precedence(left), TREE_NAME(right), +    
    ts_subtree_dynamic_precedence(right)); +    return false; +  } + +  if (ts_subtree_error_cost(left) > 0) return true; + +  int comparison = ts_subtree_compare(left, right); +  switch (comparison) { +    case -1: +      LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); +      return false; +      break; +    case 1: +      LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); +      return true; +    default: +      LOG("select_existing symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); +      return false; +  } +} + +static void ts_parser__shift( +  TSParser *self, +  StackVersion version, +  TSStateId state, +  Subtree lookahead, +  bool extra +) { +  Subtree subtree_to_push; +  if (extra != ts_subtree_extra(lookahead)) { +    MutableSubtree result = ts_subtree_make_mut(&self->tree_pool, lookahead); +    ts_subtree_set_extra(&result); +    subtree_to_push = ts_subtree_from_mut(result); +  } else { +    subtree_to_push = lookahead; +  } + +  bool is_pending = ts_subtree_child_count(subtree_to_push) > 0; +  ts_stack_push(self->stack, version, subtree_to_push, is_pending, state); +  if (ts_subtree_has_external_tokens(subtree_to_push)) { +    ts_stack_set_last_external_token( +      self->stack, version, ts_subtree_last_external_token(subtree_to_push) +    ); +  } +} + +static bool ts_parser__replace_children( +  TSParser *self, +  MutableSubtree *tree, +  SubtreeArray *children +) { +  *self->scratch_tree.ptr = *tree->ptr; +  self->scratch_tree.ptr->child_count = 0; +  ts_subtree_set_children(self->scratch_tree, children->contents, children->size, self->language); +  if (ts_parser__select_tree(self, ts_subtree_from_mut(*tree), ts_subtree_from_mut(self->scratch_tree))) { +    *tree->ptr = *self->scratch_tree.ptr; +    return true; +  } else { +    return false; +  } +} + +static StackVersion ts_parser__reduce( +  TSParser *self, +  StackVersion version, +  TSSymbol symbol, +  uint32_t 
count, +  int dynamic_precedence, +  uint16_t production_id, +  bool fragile +) { +  uint32_t initial_version_count = ts_stack_version_count(self->stack); +  uint32_t removed_version_count = 0; +  StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + +  for (uint32_t i = 0; i < pop.size; i++) { +    StackSlice slice = pop.contents[i]; +    StackVersion slice_version = slice.version - removed_version_count; + +    // Error recovery can sometimes cause lots of stack versions to merge, +    // such that a single pop operation can produce lots of slices. +    // Avoid creating too many stack versions in that situation. +    if (i > 0 && slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { +      ts_stack_remove_version(self->stack, slice_version); +      ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); +      removed_version_count++; +      while (i + 1 < pop.size) { +        StackSlice next_slice = pop.contents[i + 1]; +        if (next_slice.version != slice.version) break; +        ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); +        i++; +      } +      continue; +    } + +    // Extra tokens on top of the stack should not be included in this new parent +    // node. They will be re-pushed onto the stack after the parent node is +    // created and pushed. +    SubtreeArray children = slice.subtrees; +    while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { +      children.size--; +    } + +    MutableSubtree parent = ts_subtree_new_node(&self->tree_pool, +      symbol, &children, production_id, self->language +    ); + +    // This pop operation may have caused multiple stack versions to collapse +    // into one, because they all diverged from a common state. In that case, +    // choose one of the arrays of trees to be the parent node's children, and +    // delete the rest of the tree arrays. 
+    while (i + 1 < pop.size) { +      StackSlice next_slice = pop.contents[i + 1]; +      if (next_slice.version != slice.version) break; +      i++; + +      SubtreeArray children = next_slice.subtrees; +      while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { +        children.size--; +      } + +      if (ts_parser__replace_children(self, &parent, &children)) { +        ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); +        slice = next_slice; +      } else { +        ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); +      } +    } + +    parent.ptr->dynamic_precedence += dynamic_precedence; +    parent.ptr->production_id = production_id; + +    TSStateId state = ts_stack_state(self->stack, slice_version); +    TSStateId next_state = ts_language_next_state(self->language, state, symbol); +    if (fragile || pop.size > 1 || initial_version_count > 1) { +      parent.ptr->fragile_left = true; +      parent.ptr->fragile_right = true; +      parent.ptr->parse_state = TS_TREE_STATE_NONE; +    } else { +      parent.ptr->parse_state = state; +    } + +    // Push the parent node onto the stack, along with any extra tokens that +    // were previously on top of the stack. +    ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), false, next_state); +    for (uint32_t j = parent.ptr->child_count; j < slice.subtrees.size; j++) { +      ts_stack_push(self->stack, slice_version, slice.subtrees.contents[j], false, next_state); +    } + +    for (StackVersion j = 0; j < slice_version; j++) { +      if (j == version) continue; +      if (ts_stack_merge(self->stack, j, slice_version)) { +        removed_version_count++; +        break; +      } +    } +  } + +  // Return the first new stack version that was created. +  return ts_stack_version_count(self->stack) > initial_version_count +    ? 
initial_version_count +    : STACK_VERSION_NONE; +} + +static void ts_parser__accept( +  TSParser *self, +  StackVersion version, +  Subtree lookahead +) { +  assert(ts_subtree_is_eof(lookahead)); +  ts_stack_push(self->stack, version, lookahead, false, 1); + +  StackSliceArray pop = ts_stack_pop_all(self->stack, version); +  for (uint32_t i = 0; i < pop.size; i++) { +    SubtreeArray trees = pop.contents[i].subtrees; + +    Subtree root = NULL_SUBTREE; +    for (uint32_t j = trees.size - 1; j + 1 > 0; j--) { +      Subtree child = trees.contents[j]; +      if (!ts_subtree_extra(child)) { +        assert(!child.data.is_inline); +        uint32_t child_count = ts_subtree_child_count(child); +        for (uint32_t k = 0; k < child_count; k++) { +          ts_subtree_retain(child.ptr->children[k]); +        } +        array_splice(&trees, j, 1, child_count, child.ptr->children); +        root = ts_subtree_from_mut(ts_subtree_new_node( +          &self->tree_pool, +          ts_subtree_symbol(child), +          &trees, +          child.ptr->production_id, +          self->language +        )); +        ts_subtree_release(&self->tree_pool, child); +        break; +      } +    } + +    assert(root.ptr); +    self->accept_count++; + +    if (self->finished_tree.ptr) { +      if (ts_parser__select_tree(self, self->finished_tree, root)) { +        ts_subtree_release(&self->tree_pool, self->finished_tree); +        self->finished_tree = root; +      } else { +        ts_subtree_release(&self->tree_pool, root); +      } +    } else { +      self->finished_tree = root; +    } +  } + +  ts_stack_remove_version(self->stack, pop.contents[0].version); +  ts_stack_halt(self->stack, version); +} + +static bool ts_parser__do_all_potential_reductions( +  TSParser *self, +  StackVersion starting_version, +  TSSymbol lookahead_symbol +) { +  uint32_t initial_version_count = ts_stack_version_count(self->stack); + +  bool can_shift_lookahead_symbol = false; +  StackVersion version = 
starting_version; +  for (unsigned i = 0; true; i++) { +    uint32_t version_count = ts_stack_version_count(self->stack); +    if (version >= version_count) break; + +    bool merged = false; +    for (StackVersion i = initial_version_count; i < version; i++) { +      if (ts_stack_merge(self->stack, i, version)) { +        merged = true; +        break; +      } +    } +    if (merged) continue; + +    TSStateId state = ts_stack_state(self->stack, version); +    bool has_shift_action = false; +    array_clear(&self->reduce_actions); + +    TSSymbol first_symbol, end_symbol; +    if (lookahead_symbol != 0) { +      first_symbol = lookahead_symbol; +      end_symbol = lookahead_symbol + 1; +    } else { +      first_symbol = 1; +      end_symbol = self->language->token_count; +    } + +    for (TSSymbol symbol = first_symbol; symbol < end_symbol; symbol++) { +      TableEntry entry; +      ts_language_table_entry(self->language, state, symbol, &entry); +      for (uint32_t i = 0; i < entry.action_count; i++) { +        TSParseAction action = entry.actions[i]; +        switch (action.type) { +          case TSParseActionTypeShift: +          case TSParseActionTypeRecover: +            if (!action.params.extra && !action.params.repetition) has_shift_action = true; +            break; +          case TSParseActionTypeReduce: +            if (action.params.child_count > 0) +              ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ +                .symbol = action.params.symbol, +                .count = action.params.child_count, +                .dynamic_precedence = action.params.dynamic_precedence, +                .production_id = action.params.production_id, +              }); +          default: +            break; +        } +      } +    } + +    StackVersion reduction_version = STACK_VERSION_NONE; +    for (uint32_t i = 0; i < self->reduce_actions.size; i++) { +      ReduceAction action = self->reduce_actions.contents[i]; + +      
reduction_version = ts_parser__reduce( +        self, version, action.symbol, action.count, +        action.dynamic_precedence, action.production_id, +        true +      ); +    } + +    if (has_shift_action) { +      can_shift_lookahead_symbol = true; +    } else if (reduction_version != STACK_VERSION_NONE && i < MAX_VERSION_COUNT) { +      ts_stack_renumber_version(self->stack, reduction_version, version); +      continue; +    } else if (lookahead_symbol != 0) { +      ts_stack_remove_version(self->stack, version); +    } + +    if (version == starting_version) { +      version = version_count; +    } else { +      version++; +    } +  } + +  return can_shift_lookahead_symbol; +} + +static void ts_parser__handle_error( +  TSParser *self, +  StackVersion version, +  TSSymbol lookahead_symbol +) { +  uint32_t previous_version_count = ts_stack_version_count(self->stack); + +  // Perform any reductions that can happen in this state, regardless of the lookahead. After +  // skipping one or more invalid tokens, the parser might find a token that would have allowed +  // a reduction to take place. +  ts_parser__do_all_potential_reductions(self, version, 0); +  uint32_t version_count = ts_stack_version_count(self->stack); +  Length position = ts_stack_position(self->stack, version); + +  // Push a discontinuity onto the stack. Merge all of the stack versions that +  // were created in the previous step. 
+  bool did_insert_missing_token = false; +  for (StackVersion v = version; v < version_count;) { +    if (!did_insert_missing_token) { +      TSStateId state = ts_stack_state(self->stack, v); +      for (TSSymbol missing_symbol = 1; +           missing_symbol < self->language->token_count; +           missing_symbol++) { +        TSStateId state_after_missing_symbol = ts_language_next_state( +          self->language, state, missing_symbol +        ); +        if (state_after_missing_symbol == 0) continue; + +        if (ts_language_has_reduce_action( +          self->language, +          state_after_missing_symbol, +          lookahead_symbol +        )) { +          // In case the parser is currently outside of any included range, the lexer will +          // snap to the beginning of the next included range. The missing token's padding +          // must be assigned to position it within the next included range. +          ts_lexer_reset(&self->lexer, position); +          ts_lexer_mark_end(&self->lexer); +          Length padding = length_sub(self->lexer.token_end_position, position); + +          StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); +          Subtree missing_tree = ts_subtree_new_missing_leaf( +            &self->tree_pool, missing_symbol, padding, self->language +          ); +          ts_stack_push( +            self->stack, version_with_missing_tree, +            missing_tree, false, +            state_after_missing_symbol +          ); + +          if (ts_parser__do_all_potential_reductions( +            self, version_with_missing_tree, +            lookahead_symbol +          )) { +            LOG( +              "recover_with_missing symbol:%s, state:%u", +              SYM_NAME(missing_symbol), +              ts_stack_state(self->stack, version_with_missing_tree) +            ); +            did_insert_missing_token = true; +            break; +          } +        } +      } +    } + +    
ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); +    v = (v == version) ? previous_version_count : v + 1; +  } + +  for (unsigned i = previous_version_count; i < version_count; i++) { +    bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); +    assert(did_merge); +  } + +  ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); +  LOG_STACK(); +} + +static void ts_parser__halt_parse(TSParser *self) { +  LOG("halting_parse"); +  LOG_STACK(); + +  ts_lexer_advance_to_end(&self->lexer); +  Length remaining_length = length_sub( +    self->lexer.current_position, +    ts_stack_position(self->stack, 0) +  ); + +  Subtree filler_node = ts_subtree_new_error( +    &self->tree_pool, +    0, +    length_zero(), +    remaining_length, +    remaining_length.bytes, +    0, +    self->language +  ); +  ts_subtree_to_mut_unsafe(filler_node).ptr->visible = false; +  ts_stack_push(self->stack, 0, filler_node, false, 0); + +  SubtreeArray children = array_new(); +  Subtree root_error = ts_subtree_new_error_node(&self->tree_pool, &children, false, self->language); +  ts_stack_push(self->stack, 0, root_error, false, 0); + +  Subtree eof = ts_subtree_new_leaf( +    &self->tree_pool, +    ts_builtin_sym_end, +    length_zero(), +    length_zero(), +    0, +    0, +    false, +    false, +    self->language +  ); +  ts_parser__accept(self, 0, eof); +} + +static bool ts_parser__recover_to_state( +  TSParser *self, +  StackVersion version, +  unsigned depth, +  TSStateId goal_state +) { +  StackSliceArray pop = ts_stack_pop_count(self->stack, version, depth); +  StackVersion previous_version = STACK_VERSION_NONE; + +  for (unsigned i = 0; i < pop.size; i++) { +    StackSlice slice = pop.contents[i]; + +    if (slice.version == previous_version) { +      ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); +      array_erase(&pop, i--); +      continue; +    } + +    if (ts_stack_state(self->stack, slice.version) != goal_state) { 
+      ts_stack_halt(self->stack, slice.version); +      ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); +      array_erase(&pop, i--); +      continue; +    } + +    SubtreeArray error_trees = ts_stack_pop_error(self->stack, slice.version); +    if (error_trees.size > 0) { +      assert(error_trees.size == 1); +      Subtree error_tree = error_trees.contents[0]; +      uint32_t error_child_count = ts_subtree_child_count(error_tree); +      if (error_child_count > 0) { +        array_splice(&slice.subtrees, 0, 0, error_child_count, error_tree.ptr->children); +        for (unsigned j = 0; j < error_child_count; j++) { +          ts_subtree_retain(slice.subtrees.contents[j]); +        } +      } +      ts_subtree_array_delete(&self->tree_pool, &error_trees); +    } + +    SubtreeArray trailing_extras = ts_subtree_array_remove_trailing_extras(&slice.subtrees); + +    if (slice.subtrees.size > 0) { +      Subtree error = ts_subtree_new_error_node(&self->tree_pool, &slice.subtrees, true, self->language); +      ts_stack_push(self->stack, slice.version, error, false, goal_state); +    } else { +      array_delete(&slice.subtrees); +    } + +    for (unsigned j = 0; j < trailing_extras.size; j++) { +      Subtree tree = trailing_extras.contents[j]; +      ts_stack_push(self->stack, slice.version, tree, false, goal_state); +    } + +    previous_version = slice.version; +    array_delete(&trailing_extras); +  } + +  return previous_version != STACK_VERSION_NONE; +} + +static void ts_parser__recover( +  TSParser *self, +  StackVersion version, +  Subtree lookahead +) { +  bool did_recover = false; +  unsigned previous_version_count = ts_stack_version_count(self->stack); +  Length position = ts_stack_position(self->stack, version); +  StackSummary *summary = ts_stack_get_summary(self->stack, version); +  unsigned node_count_since_error = ts_stack_node_count_since_error(self->stack, version); +  unsigned current_error_cost = ts_stack_error_cost(self->stack, 
version); + +  // When the parser is in the error state, there are two strategies for recovering with a +  // given lookahead token: +  // 1. Find a previous state on the stack in which that lookahead token would be valid. Then, +  //    create a new stack version that is in that state again. This entails popping all of the +  //    subtrees that have been pushed onto the stack since that previous state, and wrapping +  //    them in an ERROR node. +  // 2. Wrap the lookahead token in an ERROR node, push that ERROR node onto the stack, and +  //    move on to the next lookahead token, remaining in the error state. +  // +  // First, try the strategy 1. Upon entering the error state, the parser recorded a summary +  // of the previous parse states and their depths. Look at each state in the summary, to see +  // if the current lookahead token would be valid in that state. +  if (summary && !ts_subtree_is_error(lookahead)) { +    for (unsigned i = 0; i < summary->size; i++) { +      StackSummaryEntry entry = summary->contents[i]; + +      if (entry.state == ERROR_STATE) continue; +      if (entry.position.bytes == position.bytes) continue; +      unsigned depth = entry.depth; +      if (node_count_since_error > 0) depth++; + +      // Do not recover in ways that create redundant stack versions. +      bool would_merge = false; +      for (unsigned j = 0; j < previous_version_count; j++) { +        if ( +          ts_stack_state(self->stack, j) == entry.state && +          ts_stack_position(self->stack, j).bytes == position.bytes +        ) { +          would_merge = true; +          break; +        } +      } +      if (would_merge) continue; + +      // Do not recover if the result would clearly be worse than some existing stack version. 
+      unsigned new_cost = +        current_error_cost + +        entry.depth * ERROR_COST_PER_SKIPPED_TREE + +        (position.bytes - entry.position.bytes) * ERROR_COST_PER_SKIPPED_CHAR + +        (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE; +      if (ts_parser__better_version_exists(self, version, false, new_cost)) break; + +      // If the current lookahead token is valid in some previous state, recover to that state. +      // Then stop looking for further recoveries. +      if (ts_language_has_actions(self->language, entry.state, ts_subtree_symbol(lookahead))) { +        if (ts_parser__recover_to_state(self, version, depth, entry.state)) { +          did_recover = true; +          LOG("recover_to_previous state:%u, depth:%u", entry.state, depth); +          LOG_STACK(); +          break; +        } +      } +    } +  } + +  // In the process of attempting to recover, some stack versions may have been created +  // and subsequently halted. Remove those versions. +  for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { +    if (!ts_stack_is_active(self->stack, i)) { +      ts_stack_remove_version(self->stack, i--); +    } +  } + +  // If strategy 1 succeeded, a new stack version will have been created which is able to handle +  // the current lookahead token. Now, in addition, try strategy 2 described above: skip the +  // current lookahead token by wrapping it in an ERROR node. + +  // Don't pursue this additional strategy if there are already too many stack versions. +  if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { +    ts_stack_halt(self->stack, version); +    ts_subtree_release(&self->tree_pool, lookahead); +    return; +  } + +  // If the parser is still in the error state at the end of the file, just wrap everything +  // in an ERROR node and terminate. 
+  if (ts_subtree_is_eof(lookahead)) { +    LOG("recover_eof"); +    SubtreeArray children = array_new(); +    Subtree parent = ts_subtree_new_error_node(&self->tree_pool, &children, false, self->language); +    ts_stack_push(self->stack, version, parent, false, 1); +    ts_parser__accept(self, version, lookahead); +    return; +  } + +  // Do not recover if the result would clearly be worse than some existing stack version. +  unsigned new_cost = +    current_error_cost + ERROR_COST_PER_SKIPPED_TREE + +    ts_subtree_total_bytes(lookahead) * ERROR_COST_PER_SKIPPED_CHAR + +    ts_subtree_total_size(lookahead).extent.row * ERROR_COST_PER_SKIPPED_LINE; +  if (ts_parser__better_version_exists(self, version, false, new_cost)) { +    ts_stack_halt(self->stack, version); +    ts_subtree_release(&self->tree_pool, lookahead); +    return; +  } + +  // If the current lookahead token is an extra token, mark it as extra. This means it won't +  // be counted in error cost calculations. +  unsigned n; +  const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n); +  if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.extra) { +    MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); +    ts_subtree_set_extra(&mutable_lookahead); +    lookahead = ts_subtree_from_mut(mutable_lookahead); +  } + +  // Wrap the lookahead token in an ERROR. +  LOG("skip_token symbol:%s", TREE_NAME(lookahead)); +  SubtreeArray children = array_new(); +  array_reserve(&children, 1); +  array_push(&children, lookahead); +  MutableSubtree error_repeat = ts_subtree_new_node( +    &self->tree_pool, +    ts_builtin_sym_error_repeat, +    &children, +    0, +    self->language +  ); + +  // If other tokens have already been skipped, so there is already an ERROR at the top of the +  // stack, then pop that ERROR off the stack and wrap the two ERRORs together into one larger +  // ERROR. 
+  if (node_count_since_error > 0) { +    StackSliceArray pop = ts_stack_pop_count(self->stack, version, 1); + +    // TODO: Figure out how to make this condition occur. +    // See https://github.com/atom/atom/issues/18450#issuecomment-439579778 +    // If multiple stack versions have merged at this point, just pick one of the errors +    // arbitrarily and discard the rest. +    if (pop.size > 1) { +      for (unsigned i = 1; i < pop.size; i++) { +        ts_subtree_array_delete(&self->tree_pool, &pop.contents[i].subtrees); +      } +      while (ts_stack_version_count(self->stack) > pop.contents[0].version + 1) { +        ts_stack_remove_version(self->stack, pop.contents[0].version + 1); +      } +    } + +    ts_stack_renumber_version(self->stack, pop.contents[0].version, version); +    array_push(&pop.contents[0].subtrees, ts_subtree_from_mut(error_repeat)); +    error_repeat = ts_subtree_new_node( +      &self->tree_pool, +      ts_builtin_sym_error_repeat, +      &pop.contents[0].subtrees, +      0, +      self->language +    ); +  } + +  // Push the new ERROR onto the stack. +  ts_stack_push(self->stack, version, ts_subtree_from_mut(error_repeat), false, ERROR_STATE); +  if (ts_subtree_has_external_tokens(lookahead)) { +    ts_stack_set_last_external_token( +      self->stack, version, ts_subtree_last_external_token(lookahead) +    ); +  } +} + +static bool ts_parser__advance( +  TSParser *self, +  StackVersion version, +  bool allow_node_reuse +) { +  TSStateId state = ts_stack_state(self->stack, version); +  uint32_t position = ts_stack_position(self->stack, version).bytes; +  Subtree last_external_token = ts_stack_last_external_token(self->stack, version); + +  bool did_reuse = true; +  Subtree lookahead = NULL_SUBTREE; +  TableEntry table_entry = {.action_count = 0}; + +  // If possible, reuse a node from the previous syntax tree. 
+  if (allow_node_reuse) { +    lookahead = ts_parser__reuse_node( +      self, version, &state, position, last_external_token, &table_entry +    ); +  } + +  // If no node from the previous syntax tree could be reused, then try to +  // reuse the token previously returned by the lexer. +  if (!lookahead.ptr) { +    did_reuse = false; +    lookahead = ts_parser__get_cached_token( +      self, state, position, last_external_token, &table_entry +    ); +  } + +  // Otherwise, re-run the lexer. +  if (!lookahead.ptr) { +    lookahead = ts_parser__lex(self, version, state); +    ts_parser__set_cached_token(self, position, last_external_token, lookahead); +    ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); +  } + +  for (;;) { +    // If a cancellation flag or a timeout was provided, then check every +    // time a fixed number of parse actions has been processed. +    if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { +      self->operation_count = 0; +    } +    if ( +      self->operation_count == 0 && +      ((self->cancellation_flag && atomic_load(self->cancellation_flag)) || +       (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock))) +    ) { +      ts_subtree_release(&self->tree_pool, lookahead); +      return false; +    } + +    // Process each parse action for the current lookahead token in +    // the current state. If there are multiple actions, then this is +    // an ambiguous state. REDUCE actions always create a new stack +    // version, whereas SHIFT actions update the existing stack version +    // and terminate this loop. 
+    StackVersion last_reduction_version = STACK_VERSION_NONE; +    for (uint32_t i = 0; i < table_entry.action_count; i++) { +      TSParseAction action = table_entry.actions[i]; + +      switch (action.type) { +        case TSParseActionTypeShift: { +          if (action.params.repetition) break; +          TSStateId next_state; +          if (action.params.extra) { + +            // TODO: remove when TREE_SITTER_LANGUAGE_VERSION 9 is out. +            if (state == ERROR_STATE) continue; + +            next_state = state; +            LOG("shift_extra"); +          } else { +            next_state = action.params.state; +            LOG("shift state:%u", next_state); +          } + +          if (ts_subtree_child_count(lookahead) > 0) { +            ts_parser__breakdown_lookahead(self, &lookahead, state, &self->reusable_node); +            next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead)); +          } + +          ts_parser__shift(self, version, next_state, lookahead, action.params.extra); +          if (did_reuse) reusable_node_advance(&self->reusable_node); +          return true; +        } + +        case TSParseActionTypeReduce: { +          bool is_fragile = table_entry.action_count > 1; +          LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.symbol), action.params.child_count); +          StackVersion reduction_version = ts_parser__reduce( +            self, version, action.params.symbol, action.params.child_count, +            action.params.dynamic_precedence, action.params.production_id, +            is_fragile +          ); +          if (reduction_version != STACK_VERSION_NONE) { +            last_reduction_version = reduction_version; +          } +          break; +        } + +        case TSParseActionTypeAccept: { +          LOG("accept"); +          ts_parser__accept(self, version, lookahead); +          return true; +        } + +        case TSParseActionTypeRecover: { +          if 
(ts_subtree_child_count(lookahead) > 0) { +            ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node); +          } + +          ts_parser__recover(self, version, lookahead); +          if (did_reuse) reusable_node_advance(&self->reusable_node); +          return true; +        } +      } +    } + +    // If a reduction was performed, then replace the current stack version +    // with one of the stack versions created by a reduction, and continue +    // processing this version of the stack with the same lookahead symbol. +    if (last_reduction_version != STACK_VERSION_NONE) { +      ts_stack_renumber_version(self->stack, last_reduction_version, version); +      LOG_STACK(); +      state = ts_stack_state(self->stack, version); +      ts_language_table_entry( +        self->language, +        state, +        ts_subtree_leaf_symbol(lookahead), +        &table_entry +      ); +      continue; +    } + +    // If there were no parse actions for the current lookahead token, then +    // it is not valid in this state. If the current lookahead token is a +    // keyword, then switch to treating it as the normal word token if that +    // token is valid in this state. 
+    if ( +      ts_subtree_is_keyword(lookahead) && +      ts_subtree_symbol(lookahead) != self->language->keyword_capture_token +    ) { +      ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry); +      if (table_entry.action_count > 0) { +        LOG( +          "switch from_keyword:%s, to_word_token:%s", +          TREE_NAME(lookahead), +          SYM_NAME(self->language->keyword_capture_token) +        ); + +        MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); +        ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token, self->language); +        lookahead = ts_subtree_from_mut(mutable_lookahead); +        continue; +      } +    } + +    // If the current lookahead token is not valid and the parser is +    // already in the error state, restart the error recovery process. +    // TODO - can this be unified with the other `RECOVER` case above? +    if (state == ERROR_STATE) { +      ts_parser__recover(self, version, lookahead); +      return true; +    } + +    // If the current lookahead token is not valid and the previous +    // subtree on the stack was reused from an old tree, it isn't actually +    // valid to reuse it. Remove it from the stack, and in its place, +    // push each of its children. Then try again to process the current +    // lookahead. +    if (ts_parser__breakdown_top_of_stack(self, version)) { +      continue; +    } + +    // At this point, the current lookahead token is definitely not valid +    // for this parse stack version. Mark this version as paused and continue +    // processing any other stack versions that might exist. If some other +    // version advances successfully, then this version can simply be removed. +    // But if all versions end up paused, then error recovery is needed. 
+    LOG("detect_error"); +    ts_stack_pause(self->stack, version, ts_subtree_leaf_symbol(lookahead)); +    ts_subtree_release(&self->tree_pool, lookahead); +    return true; +  } +} + +static unsigned ts_parser__condense_stack(TSParser *self) { +  bool made_changes = false; +  unsigned min_error_cost = UINT_MAX; +  for (StackVersion i = 0; i < ts_stack_version_count(self->stack); i++) { +    // Prune any versions that have been marked for removal. +    if (ts_stack_is_halted(self->stack, i)) { +      ts_stack_remove_version(self->stack, i); +      i--; +      continue; +    } + +    // Keep track of the minimum error cost of any stack version so +    // that it can be returned. +    ErrorStatus status_i = ts_parser__version_status(self, i); +    if (!status_i.is_in_error && status_i.cost < min_error_cost) { +      min_error_cost = status_i.cost; +    } + +    // Examine each pair of stack versions, removing any versions that +    // are clearly worse than another version. Ensure that the versions +    // are ordered from most promising to least promising. 
+    for (StackVersion j = 0; j < i; j++) { +      ErrorStatus status_j = ts_parser__version_status(self, j); + +      switch (ts_parser__compare_versions(self, status_j, status_i)) { +        case ErrorComparisonTakeLeft: +          made_changes = true; +          ts_stack_remove_version(self->stack, i); +          i--; +          j = i; +          break; + +        case ErrorComparisonPreferLeft: +        case ErrorComparisonNone: +          if (ts_stack_merge(self->stack, j, i)) { +            made_changes = true; +            i--; +            j = i; +          } +          break; + +        case ErrorComparisonPreferRight: +          made_changes = true; +          if (ts_stack_merge(self->stack, j, i)) { +            i--; +            j = i; +          } else { +            ts_stack_swap_versions(self->stack, i, j); +          } +          break; + +        case ErrorComparisonTakeRight: +          made_changes = true; +          ts_stack_remove_version(self->stack, j); +          i--; +          j--; +          break; +      } +    } +  } + +  // Enforce a hard upper bound on the number of stack versions by +  // discarding the least promising versions. +  while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { +    ts_stack_remove_version(self->stack, MAX_VERSION_COUNT); +    made_changes = true; +  } + +  // If the best-performing stack version is currently paused, or all +  // versions are paused, then resume the best paused version and begin +  // the error recovery process. Otherwise, remove the paused versions. 
+  if (ts_stack_version_count(self->stack) > 0) { +    bool has_unpaused_version = false; +    for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { +      if (ts_stack_is_paused(self->stack, i)) { +        if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) { +          LOG("resume version:%u", i); +          min_error_cost = ts_stack_error_cost(self->stack, i); +          TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i); +          ts_parser__handle_error(self, i, lookahead_symbol); +          has_unpaused_version = true; +        } else { +          ts_stack_remove_version(self->stack, i); +          i--; +          n--; +        } +      } else { +        has_unpaused_version = true; +      } +    } +  } + +  if (made_changes) { +    LOG("condense"); +    LOG_STACK(); +  } + +  return min_error_cost; +} + +static bool ts_parser_has_outstanding_parse(TSParser *self) { +  return ( +    self->lexer.current_position.bytes > 0 || +    ts_stack_state(self->stack, 0) != 1 +  ); +} + +// Parser - Public + +TSParser *ts_parser_new(void) { +  TSParser *self = ts_calloc(1, sizeof(TSParser)); +  ts_lexer_init(&self->lexer); +  array_init(&self->reduce_actions); +  array_reserve(&self->reduce_actions, 4); +  self->tree_pool = ts_subtree_pool_new(32); +  self->stack = ts_stack_new(&self->tree_pool); +  self->finished_tree = NULL_SUBTREE; +  self->reusable_node = reusable_node_new(); +  self->dot_graph_file = NULL; +  self->halt_on_error = false; +  self->cancellation_flag = NULL; +  self->timeout_duration = 0; +  self->end_clock = clock_null(); +  self->operation_count = 0; +  self->old_tree = NULL_SUBTREE; +  self->scratch_tree.ptr = &self->scratch_tree_data; +  self->included_range_differences = (TSRangeArray) array_new(); +  self->included_range_difference_index = 0; +  ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); +  return self; +} + +void ts_parser_delete(TSParser *self) { +  if (!self) return; + 
+  ts_stack_delete(self->stack); +  if (self->reduce_actions.contents) { +    array_delete(&self->reduce_actions); +  } +  if (self->included_range_differences.contents) { +    array_delete(&self->included_range_differences); +  } +  if (self->old_tree.ptr) { +    ts_subtree_release(&self->tree_pool, self->old_tree); +    self->old_tree = NULL_SUBTREE; +  } +  ts_lexer_delete(&self->lexer); +  ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); +  ts_subtree_pool_delete(&self->tree_pool); +  reusable_node_delete(&self->reusable_node); +  ts_parser_set_language(self, NULL); +  ts_free(self); +} + +const TSLanguage *ts_parser_language(const TSParser *self) { +  return self->language; +} + +bool ts_parser_set_language(TSParser *self, const TSLanguage *language) { +  if (language) { +    if (language->version > TREE_SITTER_LANGUAGE_VERSION) return false; +    if (language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION) return false; +  } + +  if (self->external_scanner_payload && self->language->external_scanner.destroy) { +    self->language->external_scanner.destroy(self->external_scanner_payload); +  } + +  if (language && language->external_scanner.create) { +    self->external_scanner_payload = language->external_scanner.create(); +  } else { +    self->external_scanner_payload = NULL; +  } + +  self->language = language; +  return true; +} + +TSLogger ts_parser_logger(const TSParser *self) { +  return self->lexer.logger; +} + +void ts_parser_set_logger(TSParser *self, TSLogger logger) { +  self->lexer.logger = logger; +} + +void ts_parser_print_dot_graphs(TSParser *self, int fd) { +  if (self->dot_graph_file) { +    fclose(self->dot_graph_file); +  } + +  if (fd >= 0) { +    self->dot_graph_file = fdopen(fd, "a"); +  } else { +    self->dot_graph_file = NULL; +  } +} + +void ts_parser_halt_on_error(TSParser *self, bool should_halt_on_error) { +  self->halt_on_error = should_halt_on_error; +} + +const size_t *ts_parser_cancellation_flag(const 
TSParser *self) { +  return (const size_t *)self->cancellation_flag; +} + +void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag) { +  self->cancellation_flag = (const volatile size_t *)flag; +} + +uint64_t ts_parser_timeout_micros(const TSParser *self) { +  return duration_to_micros(self->timeout_duration); +} + +void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout_micros) { +  self->timeout_duration = duration_from_micros(timeout_micros); +} + +void ts_parser_set_included_ranges(TSParser *self, const TSRange *ranges, uint32_t count) { +  ts_lexer_set_included_ranges(&self->lexer, ranges, count); +} + +const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count) { +  return ts_lexer_included_ranges(&self->lexer, count); +} + +// Drop the state left over from a parse: the old tree, the reusable node, the lexer +// position, the stack, the cached token and any finished tree. +void ts_parser_reset(TSParser *self) { +  // The language may be NULL (ts_parser_parse checks for that as well), so only +  // touch the external scanner when a language is actually assigned. +  if (self->language && self->language->external_scanner.deserialize) { +    self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); +  } + +  if (self->old_tree.ptr) { +    ts_subtree_release(&self->tree_pool, self->old_tree); +    self->old_tree = NULL_SUBTREE; +  } + +  reusable_node_clear(&self->reusable_node); +  ts_lexer_reset(&self->lexer, length_zero()); +  ts_stack_clear(self->stack); +  ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); +  if (self->finished_tree.ptr) { +    ts_subtree_release(&self->tree_pool, self->finished_tree); +    self->finished_tree = NULL_SUBTREE; +  } +  self->accept_count = 0; +} + +TSTree *ts_parser_parse( +  TSParser *self, +  const TSTree *old_tree, +  TSInput input +) { +  if (!self->language || !input.read) return NULL; + +  ts_lexer_set_input(&self->lexer, input); + +  array_clear(&self->included_range_differences); +  self->included_range_difference_index = 0; + +  if (ts_parser_has_outstanding_parse(self)) { +    LOG("resume_parsing"); +  } else if (old_tree) { +    ts_subtree_retain(old_tree->root); +    self->old_tree = old_tree->root; +    ts_range_array_get_changed_ranges( +   
   old_tree->included_ranges, old_tree->included_range_count, +      self->lexer.included_ranges, self->lexer.included_range_count, +      &self->included_range_differences +    ); +    reusable_node_reset(&self->reusable_node, old_tree->root); +    LOG("parse_after_edit"); +    LOG_TREE(self->old_tree); +    for (unsigned i = 0; i < self->included_range_differences.size; i++) { +      TSRange *range = &self->included_range_differences.contents[i]; +      LOG("different_included_range %u - %u", range->start_byte, range->end_byte); +    } +  } else { +    reusable_node_clear(&self->reusable_node); +    LOG("new_parse"); +  } + +  uint32_t position = 0, last_position = 0, version_count = 0; +  self->operation_count = 0; +  if (self->timeout_duration) { +    self->end_clock = clock_after(clock_now(), self->timeout_duration); +  } else { +    self->end_clock = clock_null(); +  } + +  do { +    for (StackVersion version = 0; +         version_count = ts_stack_version_count(self->stack), version < version_count; +         version++) { +      bool allow_node_reuse = version_count == 1; +      while (ts_stack_is_active(self->stack, version)) { +        LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", +            version, ts_stack_version_count(self->stack), +            ts_stack_state(self->stack, version), +            ts_stack_position(self->stack, version).extent.row + 1, +            ts_stack_position(self->stack, version).extent.column); + +        if (!ts_parser__advance(self, version, allow_node_reuse)) return NULL; +        LOG_STACK(); + +        position = ts_stack_position(self->stack, version).bytes; +        if (position > last_position || (version > 0 && position == last_position)) { +          last_position = position; +          break; +        } +      } +    } + +    unsigned min_error_cost = ts_parser__condense_stack(self); +    if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) < min_error_cost) { +      
break; +    } else if (self->halt_on_error && min_error_cost > 0) { +      ts_parser__halt_parse(self); +      break; +    } + +    while (self->included_range_difference_index < self->included_range_differences.size) { +      TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index]; +      if (range->end_byte <= position) { +        self->included_range_difference_index++; +      } else { +        break; +      } +    } +  } while (version_count != 0); + +  ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); +  LOG("done"); +  LOG_TREE(self->finished_tree); + +  TSTree *result = ts_tree_new( +    self->finished_tree, +    self->language, +    self->lexer.included_ranges, +    self->lexer.included_range_count +  ); +  self->finished_tree = NULL_SUBTREE; +  ts_parser_reset(self); +  return result; +} + +TSTree *ts_parser_parse_string( +  TSParser *self, +  const TSTree *old_tree, +  const char *string, +  uint32_t length +) { +  return ts_parser_parse_string_encoding(self, old_tree, string, length, TSInputEncodingUTF8); +} + +TSTree *ts_parser_parse_string_encoding(TSParser *self, const TSTree *old_tree, +                                        const char *string, uint32_t length, TSInputEncoding encoding) { +  TSStringInput input = {string, length}; +  return ts_parser_parse(self, old_tree, (TSInput) { +    &input, +    ts_string_input_read, +    encoding, +  }); +} + +#undef LOG diff --git a/src/tree_sitter/parser.h b/src/tree_sitter/parser.h new file mode 100644 index 0000000000..974a7ca52f --- /dev/null +++ b/src/tree_sitter/parser.h @@ -0,0 +1,220 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t 
TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { +  TSFieldId field_id; +  uint8_t child_index; +  bool inherited; +} TSFieldMapEntry; + +typedef struct { +  uint16_t index; +  uint16_t length; +} TSFieldMapSlice; + +typedef uint16_t TSStateId; + +typedef struct { +  bool visible : 1; +  bool named : 1; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { +  int32_t lookahead; +  TSSymbol result_symbol; +  void (*advance)(TSLexer *, bool); +  void (*mark_end)(TSLexer *); +  uint32_t (*get_column)(TSLexer *); +  bool (*is_at_included_range_start)(TSLexer *); +}; + +typedef enum { +  TSParseActionTypeShift, +  TSParseActionTypeReduce, +  TSParseActionTypeAccept, +  TSParseActionTypeRecover, +} TSParseActionType; + +typedef struct { +  union { +    struct { +      TSStateId state; +      bool extra : 1; +      bool repetition : 1; +    }; +    struct { +      TSSymbol symbol; +      int16_t dynamic_precedence; +      uint8_t child_count; +      uint8_t production_id; +    }; +  } params; +  TSParseActionType type : 4; +} TSParseAction; + +typedef struct { +  uint16_t lex_state; +  uint16_t external_lex_state; +} TSLexMode; + +typedef union { +  TSParseAction action; +  struct { +    uint8_t count; +    bool reusable : 1; +  }; +} TSParseActionEntry; + +struct TSLanguage { +  uint32_t version; +  uint32_t symbol_count; +  uint32_t alias_count; +  uint32_t token_count; +  uint32_t external_token_count; +  const char **symbol_names; +  const TSSymbolMetadata *symbol_metadata; +  const uint16_t *parse_table; +  const TSParseActionEntry *parse_actions; +  const TSLexMode *lex_modes; +  const TSSymbol *alias_sequences; +  uint16_t max_alias_sequence_length; +  bool (*lex_fn)(TSLexer *, TSStateId); +  bool (*keyword_lex_fn)(TSLexer *, TSStateId); +  TSSymbol keyword_capture_token; +  struct { +    const bool *states; +    const TSSymbol *symbol_map; +    void *(*create)(void); +    void 
(*destroy)(void *); +    bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); +    unsigned (*serialize)(void *, char *); +    void (*deserialize)(void *, const char *, unsigned); +  } external_scanner; +  uint32_t field_count; +  const TSFieldMapSlice *field_map_slices; +  const TSFieldMapEntry *field_map_entries; +  const char **field_names; +  uint32_t large_state_count; +  const uint16_t *small_parse_table; +  const uint32_t *small_parse_table_map; +}; + +/* + *  Lexer Macros + */ + +#define START_LEXER()           \ +  bool result = false;          \ +  bool skip = false;            \ +  int32_t lookahead;            \ +  goto start;                   \ +  next_state:                   \ +  lexer->advance(lexer, skip);  \ +  start:                        \ +  skip = false;                 \ +  lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ +  {                          \ +    state = state_value;     \ +    goto next_state;         \ +  } + +#define SKIP(state_value) \ +  {                       \ +    skip = true;          \ +    state = state_value;  \ +    goto next_state;      \ +  } + +#define ACCEPT_TOKEN(symbol_value)     \ +  result = true;                       \ +  lexer->result_symbol = symbol_value; \ +  lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + *  Parse Table Macros + */ + +/* Fully parenthesized so the subtraction binds as one operand at every use site. */ +#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT) + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value)              \ +  {                                     \ +    {                                   \ +      .type = TSParseActionTypeShift,   \ +      .params = {.state = state_value}, \ +    }                                   \ +  } + +#define SHIFT_REPEAT(state_value)     \ +  {                                   \ +    {                                 \ +      .type = TSParseActionTypeShift, \ +      .params = {                     \ +        .state = state_value,         \ +        .repetition 
= true            \ +      },                              \ +    }                                 \ +  } + +#define RECOVER()                        \ +  {                                      \ +    { .type = TSParseActionTypeRecover } \ +  } + +#define SHIFT_EXTRA()                 \ +  {                                   \ +    {                                 \ +      .type = TSParseActionTypeShift, \ +      .params = {.extra = true}       \ +    }                                 \ +  } + +#define REDUCE(symbol_val, child_count_val, ...) \ +  {                                              \ +    {                                            \ +      .type = TSParseActionTypeReduce,           \ +      .params = {                                \ +        .symbol = symbol_val,                    \ +        .child_count = child_count_val,          \ +        __VA_ARGS__                              \ +      }                                          \ +    }                                            \ +  } + +#define ACCEPT_INPUT()                  \ +  {                                     \ +    { .type = TSParseActionTypeAccept } \ +  } + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_PARSER_H_ diff --git a/src/tree_sitter/point.h b/src/tree_sitter/point.h new file mode 100644 index 0000000000..4d0aed18ef --- /dev/null +++ b/src/tree_sitter/point.h @@ -0,0 +1,53 @@ +#ifndef TREE_SITTER_POINT_H_ +#define TREE_SITTER_POINT_H_ + +#include "tree_sitter/api.h" + +#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) + +static inline TSPoint point__new(unsigned row, unsigned column) { +  TSPoint result = {row, column}; +  return result; +} + +static inline TSPoint point_add(TSPoint a, TSPoint b) { +  if (b.row > 0) +    return point__new(a.row + b.row, b.column); +  else +    return point__new(a.row, a.column + b.column); +} + +static inline TSPoint point_sub(TSPoint a, TSPoint b) { +  if (a.row > b.row) +    return point__new(a.row - b.row, a.column); 
+  else +    return point__new(0, a.column - b.column); +} + +static inline bool point_lte(TSPoint a, TSPoint b) { +  return (a.row < b.row) || (a.row == b.row && a.column <= b.column); +} + +static inline bool point_lt(TSPoint a, TSPoint b) { +  return (a.row < b.row) || (a.row == b.row && a.column < b.column); +} + +static inline bool point_eq(TSPoint a, TSPoint b) { +  return a.row == b.row && a.column == b.column; +} + +static inline TSPoint point_min(TSPoint a, TSPoint b) { +  if (a.row < b.row || (a.row == b.row && a.column < b.column)) +    return a; +  else +    return b; +} + +static inline TSPoint point_max(TSPoint a, TSPoint b) { +  if (a.row > b.row || (a.row == b.row && a.column > b.column)) +    return a; +  else +    return b; +} + +#endif diff --git a/src/tree_sitter/reduce_action.h b/src/tree_sitter/reduce_action.h new file mode 100644 index 0000000000..72aff08d73 --- /dev/null +++ b/src/tree_sitter/reduce_action.h @@ -0,0 +1,34 @@ +#ifndef TREE_SITTER_REDUCE_ACTION_H_ +#define TREE_SITTER_REDUCE_ACTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./array.h" +#include "tree_sitter/api.h" + +typedef struct { +  uint32_t count; +  TSSymbol symbol; +  int dynamic_precedence; +  unsigned short production_id; +} ReduceAction; + +typedef Array(ReduceAction) ReduceActionSet; + +static inline void ts_reduce_action_set_add(ReduceActionSet *self, +                                            ReduceAction new_action) { +  for (uint32_t i = 0; i < self->size; i++) { +    ReduceAction action = self->contents[i]; +    if (action.symbol == new_action.symbol && action.count == new_action.count) +      return; +  } +  array_push(self, new_action); +} + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_REDUCE_ACTION_H_ diff --git a/src/tree_sitter/reusable_node.h b/src/tree_sitter/reusable_node.h new file mode 100644 index 0000000000..9cba951909 --- /dev/null +++ b/src/tree_sitter/reusable_node.h @@ -0,0 +1,88 @@ +#include "./subtree.h" + 
+typedef struct { +  Subtree tree; +  uint32_t child_index; +  uint32_t byte_offset; +} StackEntry; + +typedef struct { +  Array(StackEntry) stack; +  Subtree last_external_token; +} ReusableNode; + +static inline ReusableNode reusable_node_new(void) { +  return (ReusableNode) {array_new(), NULL_SUBTREE}; +} + +static inline void reusable_node_clear(ReusableNode *self) { +  array_clear(&self->stack); +  self->last_external_token = NULL_SUBTREE; +} + +static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { +  reusable_node_clear(self); +  array_push(&self->stack, ((StackEntry) { +    .tree = tree, +    .child_index = 0, +    .byte_offset = 0, +  })); +} + +static inline Subtree reusable_node_tree(ReusableNode *self) { +  return self->stack.size > 0 +    ? self->stack.contents[self->stack.size - 1].tree +    : NULL_SUBTREE; +} + +static inline uint32_t reusable_node_byte_offset(ReusableNode *self) { +  return self->stack.size > 0 +    ? self->stack.contents[self->stack.size - 1].byte_offset +    : UINT32_MAX; +} + +static inline void reusable_node_delete(ReusableNode *self) { +  array_delete(&self->stack); +} + +static inline void reusable_node_advance(ReusableNode *self) { +  StackEntry last_entry = *array_back(&self->stack); +  uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree); +  if (ts_subtree_has_external_tokens(last_entry.tree)) { +    self->last_external_token = ts_subtree_last_external_token(last_entry.tree); +  } + +  Subtree tree; +  uint32_t next_index; +  do { +    StackEntry popped_entry = array_pop(&self->stack); +    next_index = popped_entry.child_index + 1; +    if (self->stack.size == 0) return; +    tree = array_back(&self->stack)->tree; +  } while (ts_subtree_child_count(tree) <= next_index); + +  array_push(&self->stack, ((StackEntry) { +    .tree = tree.ptr->children[next_index], +    .child_index = next_index, +    .byte_offset = byte_offset, +  })); +} + +static inline bool 
reusable_node_descend(ReusableNode *self) { +  StackEntry last_entry = *array_back(&self->stack); +  if (ts_subtree_child_count(last_entry.tree) > 0) { +    array_push(&self->stack, ((StackEntry) { +      .tree = last_entry.tree.ptr->children[0], +      .child_index = 0, +      .byte_offset = last_entry.byte_offset, +    })); +    return true; +  } else { +    return false; +  } +} + +static inline void reusable_node_advance_past_leaf(ReusableNode *self) { +  while (reusable_node_descend(self)) {} +  reusable_node_advance(self); +} diff --git a/src/tree_sitter/stack.c b/src/tree_sitter/stack.c new file mode 100644 index 0000000000..3e842c99c3 --- /dev/null +++ b/src/tree_sitter/stack.c @@ -0,0 +1,846 @@ +#include "./alloc.h" +#include "./language.h" +#include "./subtree.h" +#include "./array.h" +#include "./stack.h" +#include "./length.h" +#include <assert.h> +#include <stdio.h> + +#define MAX_LINK_COUNT 8 +#define MAX_NODE_POOL_SIZE 50 +#define MAX_ITERATOR_COUNT 64 + +#ifdef _WIN32 +#define inline __forceinline +#else +#define inline static inline __attribute__((always_inline)) +#endif + +typedef struct StackNode StackNode; + +typedef struct { +  StackNode *node; +  Subtree subtree; +  bool is_pending; +} StackLink; + +struct StackNode { +  TSStateId state; +  Length position; +  StackLink links[MAX_LINK_COUNT]; +  short unsigned int link_count; +  uint32_t ref_count; +  unsigned error_cost; +  unsigned node_count; +  int dynamic_precedence; +}; + +typedef struct { +  StackNode *node; +  SubtreeArray subtrees; +  uint32_t subtree_count; +  bool is_pending; +} StackIterator; + +typedef struct { +  void *payload; +  StackIterateCallback callback; +} StackIterateSession; + +typedef Array(StackNode *) StackNodeArray; + +typedef enum { +  StackStatusActive, +  StackStatusPaused, +  StackStatusHalted, +} StackStatus; + +typedef struct { +  StackNode *node; +  Subtree last_external_token; +  StackSummary *summary; +  unsigned node_count_at_last_error; +  TSSymbol 
lookahead_when_paused; +  StackStatus status; +} StackHead; + +struct Stack { +  Array(StackHead) heads; +  StackSliceArray slices; +  Array(StackIterator) iterators; +  StackNodeArray node_pool; +  StackNode *base_node; +  SubtreePool *subtree_pool; +}; + +typedef unsigned StackAction; +enum { +  StackActionNone, +  StackActionStop = 1, +  StackActionPop = 2, +}; + +typedef StackAction (*StackCallback)(void *, const StackIterator *); + +static void stack_node_retain(StackNode *self) { +  if (!self) +    return; +  assert(self->ref_count > 0); +  self->ref_count++; +  assert(self->ref_count != 0); +} + +static void stack_node_release(StackNode *self, StackNodeArray *pool, SubtreePool *subtree_pool) { +recur: +  assert(self->ref_count != 0); +  self->ref_count--; +  if (self->ref_count > 0) return; + +  StackNode *first_predecessor = NULL; +  if (self->link_count > 0) { +    for (unsigned i = self->link_count - 1; i > 0; i--) { +      StackLink link = self->links[i]; +      if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); +      stack_node_release(link.node, pool, subtree_pool); +    } +    StackLink link = self->links[0]; +    if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); +    first_predecessor = self->links[0].node; +  } + +  if (pool->size < MAX_NODE_POOL_SIZE) { +    array_push(pool, self); +  } else { +    ts_free(self); +  } + +  if (first_predecessor) { +    self = first_predecessor; +    goto recur; +  } +} + +static StackNode *stack_node_new(StackNode *previous_node, Subtree subtree, +                                 bool is_pending, TSStateId state, StackNodeArray *pool) { +  StackNode *node = pool->size > 0 ? 
+    array_pop(pool) : +    ts_malloc(sizeof(StackNode)); +  *node = (StackNode){.ref_count = 1, .link_count = 0, .state = state}; + +  if (previous_node) { +    node->link_count = 1; +    node->links[0] = (StackLink){ +      .node = previous_node, +      .subtree = subtree, +      .is_pending = is_pending, +    }; + +    node->position = previous_node->position; +    node->error_cost = previous_node->error_cost; +    node->dynamic_precedence = previous_node->dynamic_precedence; +    node->node_count = previous_node->node_count; + +    if (subtree.ptr) { +      node->error_cost += ts_subtree_error_cost(subtree); +      node->position = length_add(node->position, ts_subtree_total_size(subtree)); +      node->node_count += ts_subtree_node_count(subtree); +      node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree); +    } +  } else { +    node->position = length_zero(); +    node->error_cost = 0; +  } + +  return node; +} + +static bool stack__subtree_is_equivalent(Subtree left, Subtree right) { +  return +    left.ptr == right.ptr || +    (left.ptr && right.ptr && +     ts_subtree_symbol(left) == ts_subtree_symbol(right) && +     ((ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) || +      (ts_subtree_padding(left).bytes == ts_subtree_padding(right).bytes && +       ts_subtree_size(left).bytes == ts_subtree_size(right).bytes && +       ts_subtree_child_count(left) == ts_subtree_child_count(right) && +       ts_subtree_extra(left) == ts_subtree_extra(right) && +       ts_subtree_external_scanner_state_eq(left, right)))); +} + +static void stack_node_add_link(StackNode *self, StackLink link, SubtreePool *subtree_pool) { +  if (link.node == self) return; + +  for (int i = 0; i < self->link_count; i++) { +    StackLink *existing_link = &self->links[i]; +    if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) { +      // In general, we preserve ambiguities until they are removed from the stack +      // during a pop 
operation where multiple paths lead to the same node. But in +      // the special case where two links directly connect the same pair of nodes, +      // we can safely remove the ambiguity ahead of time without changing behavior. +      if (existing_link->node == link.node) { +        if ( +          ts_subtree_dynamic_precedence(link.subtree) > +          ts_subtree_dynamic_precedence(existing_link->subtree) +        ) { +          ts_subtree_retain(link.subtree); +          ts_subtree_release(subtree_pool, existing_link->subtree); +          existing_link->subtree = link.subtree; +          self->dynamic_precedence = +            link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree); +        } +        return; +      } + +      // If the previous nodes are mergeable, merge them recursively. +      if (existing_link->node->state == link.node->state && +          existing_link->node->position.bytes == link.node->position.bytes) { +        for (int j = 0; j < link.node->link_count; j++) { +          stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool); +        } +        int32_t dynamic_precedence = link.node->dynamic_precedence; +        if (link.subtree.ptr) { +          dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); +        } +        if (dynamic_precedence > self->dynamic_precedence) { +          self->dynamic_precedence = dynamic_precedence; +        } +        return; +      } +    } +  } + +  if (self->link_count == MAX_LINK_COUNT) return; + +  stack_node_retain(link.node); +  unsigned node_count = link.node->node_count; +  int dynamic_precedence = link.node->dynamic_precedence; +  self->links[self->link_count++] = link; + +  if (link.subtree.ptr) { +    ts_subtree_retain(link.subtree); +    node_count += ts_subtree_node_count(link.subtree); +    dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); +  } + +  if (node_count > self->node_count) self->node_count = node_count; +  
if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence; +} + +static void stack_head_delete(StackHead *self, StackNodeArray *pool, SubtreePool *subtree_pool) { +  if (self->node) { +    if (self->last_external_token.ptr) { +      ts_subtree_release(subtree_pool, self->last_external_token); +    } +    if (self->summary) { +      array_delete(self->summary); +      ts_free(self->summary); +    } +    stack_node_release(self->node, pool, subtree_pool); +  } +} + +static StackVersion ts_stack__add_version(Stack *self, StackVersion original_version, +                                          StackNode *node) { +  StackHead head = { +    .node = node, +    .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, +    .last_external_token = self->heads.contents[original_version].last_external_token, +    .status = StackStatusActive, +    .lookahead_when_paused = 0, +  }; +  array_push(&self->heads, head); +  stack_node_retain(node); +  if (head.last_external_token.ptr) ts_subtree_retain(head.last_external_token); +  return (StackVersion)(self->heads.size - 1); +} + +static void ts_stack__add_slice(Stack *self, StackVersion original_version, +                                StackNode *node, SubtreeArray *subtrees) { +  for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { +    StackVersion version = self->slices.contents[i].version; +    if (self->heads.contents[version].node == node) { +      StackSlice slice = {*subtrees, version}; +      array_insert(&self->slices, i + 1, slice); +      return; +    } +  } + +  StackVersion version = ts_stack__add_version(self, original_version, node); +  StackSlice slice = { *subtrees, version }; +  array_push(&self->slices, slice); +} + +inline StackSliceArray stack__iter(Stack *self, StackVersion version, +                                   StackCallback callback, void *payload, +                                   int goal_subtree_count) { +  
array_clear(&self->slices); +  array_clear(&self->iterators); + +  StackHead *head = array_get(&self->heads, version); +  StackIterator iterator = { +    .node = head->node, +    .subtrees = array_new(), +    .subtree_count = 0, +    .is_pending = true, +  }; + +  bool include_subtrees = false; +  if (goal_subtree_count >= 0) { +    include_subtrees = true; +    array_reserve(&iterator.subtrees, goal_subtree_count); +  } + +  array_push(&self->iterators, iterator); + +  while (self->iterators.size > 0) { +    for (uint32_t i = 0, size = self->iterators.size; i < size; i++) { +      StackIterator *iterator = &self->iterators.contents[i]; +      StackNode *node = iterator->node; + +      StackAction action = callback(payload, iterator); +      bool should_pop = action & StackActionPop; +      bool should_stop = action & StackActionStop || node->link_count == 0; + +      if (should_pop) { +        SubtreeArray subtrees = iterator->subtrees; +        if (!should_stop) +          ts_subtree_array_copy(subtrees, &subtrees); +        ts_subtree_array_reverse(&subtrees); +        ts_stack__add_slice( +          self, +          version, +          node, +          &subtrees +        ); +      } + +      if (should_stop) { +        if (!should_pop) +          ts_subtree_array_delete(self->subtree_pool, &iterator->subtrees); +        array_erase(&self->iterators, i); +        i--, size--; +        continue; +      } + +      for (uint32_t j = 1; j <= node->link_count; j++) { +        StackIterator *next_iterator; +        StackLink link; +        if (j == node->link_count) { +          link = node->links[0]; +          next_iterator = &self->iterators.contents[i]; +        } else { +          if (self->iterators.size >= MAX_ITERATOR_COUNT) continue; +          link = node->links[j]; +          StackIterator current_iterator = self->iterators.contents[i]; +          array_push(&self->iterators, current_iterator); +          next_iterator = array_back(&self->iterators); +      
    ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees); +        } + +        next_iterator->node = link.node; +        if (link.subtree.ptr) { +          if (include_subtrees) { +            array_push(&next_iterator->subtrees, link.subtree); +            ts_subtree_retain(link.subtree); +          } + +          if (!ts_subtree_extra(link.subtree)) { +            next_iterator->subtree_count++; +            if (!link.is_pending) { +              next_iterator->is_pending = false; +            } +          } +        } else { +          next_iterator->subtree_count++; +          next_iterator->is_pending = false; +        } +      } +    } +  } + +  return self->slices; +} + +Stack *ts_stack_new(SubtreePool *subtree_pool) { +  Stack *self = ts_calloc(1, sizeof(Stack)); + +  array_init(&self->heads); +  array_init(&self->slices); +  array_init(&self->iterators); +  array_init(&self->node_pool); +  array_reserve(&self->heads, 4); +  array_reserve(&self->slices, 4); +  array_reserve(&self->iterators, 4); +  array_reserve(&self->node_pool, MAX_NODE_POOL_SIZE); + +  self->subtree_pool = subtree_pool; +  self->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &self->node_pool); +  ts_stack_clear(self); + +  return self; +} + +void ts_stack_delete(Stack *self) { +  if (self->slices.contents) +    array_delete(&self->slices); +  if (self->iterators.contents) +    array_delete(&self->iterators); +  stack_node_release(self->base_node, &self->node_pool, self->subtree_pool); +  for (uint32_t i = 0; i < self->heads.size; i++) { +    stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); +  } +  array_clear(&self->heads); +  if (self->node_pool.contents) { +    for (uint32_t i = 0; i < self->node_pool.size; i++) +      ts_free(self->node_pool.contents[i]); +    array_delete(&self->node_pool); +  } +  array_delete(&self->heads); +  ts_free(self); +} + +uint32_t ts_stack_version_count(const Stack *self) { +  return 
self->heads.size; +} + +TSStateId ts_stack_state(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->node->state; +} + +Length ts_stack_position(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->node->position; +} + +Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->last_external_token; +} + +void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) { +  StackHead *head = array_get(&self->heads, version); +  if (token.ptr) ts_subtree_retain(token); +  if (head->last_external_token.ptr) ts_subtree_release(self->subtree_pool, head->last_external_token); +  head->last_external_token = token; +} + +unsigned ts_stack_error_cost(const Stack *self, StackVersion version) { +  StackHead *head = array_get(&self->heads, version); +  unsigned result = head->node->error_cost; +  if ( +    head->status == StackStatusPaused || +    (head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr)) { +    result += ERROR_COST_PER_RECOVERY; +  } +  return result; +} + +unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) { +  StackHead *head = array_get(&self->heads, version); +  if (head->node->node_count < head->node_count_at_last_error) { +    head->node_count_at_last_error = head->node->node_count; +  } +  return head->node->node_count - head->node_count_at_last_error; +} + +void ts_stack_push(Stack *self, StackVersion version, Subtree subtree, +                   bool pending, TSStateId state) { +  StackHead *head = array_get(&self->heads, version); +  StackNode *new_node = stack_node_new(head->node, subtree, pending, state, &self->node_pool); +  if (!subtree.ptr) head->node_count_at_last_error = new_node->node_count; +  head->node = new_node; +} + +inline StackAction iterate_callback(void *payload, const StackIterator *iterator) { +  StackIterateSession *session = 
payload; +  session->callback( +    session->payload, +    iterator->node->state, +    iterator->subtree_count +  ); +  return StackActionNone; +} + +void ts_stack_iterate(Stack *self, StackVersion version, +                      StackIterateCallback callback, void *payload) { +  StackIterateSession session = {payload, callback}; +  stack__iter(self, version, iterate_callback, &session, -1); +} + +inline StackAction pop_count_callback(void *payload, const StackIterator *iterator) { +  unsigned *goal_subtree_count = payload; +  if (iterator->subtree_count == *goal_subtree_count) { +    return StackActionPop | StackActionStop; +  } else { +    return StackActionNone; +  } +} + +StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) { +  return stack__iter(self, version, pop_count_callback, &count, count); +} + +inline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { +  if (iterator->subtree_count >= 1) { +    if (iterator->is_pending) { +      return StackActionPop | StackActionStop; +    } else { +      return StackActionStop; +    } +  } else { +    return StackActionNone; +  } +} + +StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { +  StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0); +  if (pop.size > 0) { +    ts_stack_renumber_version(self, pop.contents[0].version, version); +    pop.contents[0].version = version; +  } +  return pop; +} + +inline StackAction pop_error_callback(void *payload, const StackIterator *iterator) { +  if (iterator->subtrees.size > 0) { +    bool *found_error = payload; +    if (!*found_error && ts_subtree_is_error(iterator->subtrees.contents[0])) { +      *found_error = true; +      return StackActionPop | StackActionStop; +    } else { +      return StackActionStop; +    } +  } else { +    return StackActionNone; +  } +} + +SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) { +  StackNode *node = 
array_get(&self->heads, version)->node; +  for (unsigned i = 0; i < node->link_count; i++) { +    if (node->links[i].subtree.ptr && ts_subtree_is_error(node->links[i].subtree)) { +      bool found_error = false; +      StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1); +      if (pop.size > 0) { +        assert(pop.size == 1); +        ts_stack_renumber_version(self, pop.contents[0].version, version); +        return pop.contents[0].subtrees; +      } +      break; +    } +  } +  return (SubtreeArray){.size = 0}; +} + +inline StackAction pop_all_callback(void *payload, const StackIterator *iterator) { +  return iterator->node->link_count == 0 ? StackActionPop : StackActionNone; +} + +StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) { +  return stack__iter(self, version, pop_all_callback, NULL, 0); +} + +typedef struct { +  StackSummary *summary; +  unsigned max_depth; +} SummarizeStackSession; + +inline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) { +  SummarizeStackSession *session = payload; +  TSStateId state = iterator->node->state; +  unsigned depth = iterator->subtree_count; +  if (depth > session->max_depth) return StackActionStop; +  for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { +    StackSummaryEntry entry = session->summary->contents[i]; +    if (entry.depth < depth) break; +    if (entry.depth == depth && entry.state == state) return StackActionNone; +  } +  array_push(session->summary, ((StackSummaryEntry){ +    .position = iterator->node->position, +    .depth = depth, +    .state = state, +  })); +  return StackActionNone; +} + +void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) { +  SummarizeStackSession session = { +    .summary = ts_malloc(sizeof(StackSummary)), +    .max_depth = max_depth +  }; +  array_init(session.summary); +  stack__iter(self, version, summarize_stack_callback, &session, -1); +  
self->heads.contents[version].summary = session.summary; +} + +StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->summary; +} + +int ts_stack_dynamic_precedence(Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->node->dynamic_precedence; +} + +bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) { +  const StackHead *head = array_get(&self->heads, version); +  const StackNode *node = head->node; +  if (node->error_cost == 0) return true; +  while (node) { +    if (node->link_count > 0) { +      Subtree subtree = node->links[0].subtree; +      if (subtree.ptr) { +        if (ts_subtree_total_bytes(subtree) > 0) { +          return true; +        } else if ( +          node->node_count > head->node_count_at_last_error && +          ts_subtree_error_cost(subtree) == 0 +        ) { +          node = node->links[0].node; +          continue; +        } +      } +    } +    break; +  } +  return false; +} + +void ts_stack_remove_version(Stack *self, StackVersion version) { +  stack_head_delete(array_get(&self->heads, version), &self->node_pool, self->subtree_pool); +  array_erase(&self->heads, version); +} + +void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { +  if (v1 == v2) return; +  assert(v2 < v1); +  assert((uint32_t)v1 < self->heads.size); +  StackHead *source_head = &self->heads.contents[v1]; +  StackHead *target_head = &self->heads.contents[v2]; +  if (target_head->summary && !source_head->summary) { +    source_head->summary = target_head->summary; +    target_head->summary = NULL; +  } +  stack_head_delete(target_head, &self->node_pool, self->subtree_pool); +  *target_head = *source_head; +  array_erase(&self->heads, v1); +} + +void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) { +  StackHead temporary_head = self->heads.contents[v1]; +  self->heads.contents[v1] = 
self->heads.contents[v2]; +  self->heads.contents[v2] = temporary_head; +} + +StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { +  assert(version < self->heads.size); +  array_push(&self->heads, self->heads.contents[version]); +  StackHead *head = array_back(&self->heads); +  stack_node_retain(head->node); +  if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token); +  head->summary = NULL; +  return self->heads.size - 1; +} + +bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { +  if (!ts_stack_can_merge(self, version1, version2)) return false; +  StackHead *head1 = &self->heads.contents[version1]; +  StackHead *head2 = &self->heads.contents[version2]; +  for (uint32_t i = 0; i < head2->node->link_count; i++) { +    stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool); +  } +  if (head1->node->state == ERROR_STATE) { +    head1->node_count_at_last_error = head1->node->node_count; +  } +  ts_stack_remove_version(self, version2); +  return true; +} + +bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) { +  StackHead *head1 = &self->heads.contents[version1]; +  StackHead *head2 = &self->heads.contents[version2]; +  return +    head1->status == StackStatusActive && +    head2->status == StackStatusActive && +    head1->node->state == head2->node->state && +    head1->node->position.bytes == head2->node->position.bytes && +    head1->node->error_cost == head2->node->error_cost && +    ts_subtree_external_scanner_state_eq(head1->last_external_token, head2->last_external_token); +} + +void ts_stack_halt(Stack *self, StackVersion version) { +  array_get(&self->heads, version)->status = StackStatusHalted; +} + +void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) { +  StackHead *head = array_get(&self->heads, version); +  head->status = StackStatusPaused; +  head->lookahead_when_paused = lookahead; +  
head->node_count_at_last_error = head->node->node_count; +} + +bool ts_stack_is_active(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->status == StackStatusActive; +} + +bool ts_stack_is_halted(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->status == StackStatusHalted; +} + +bool ts_stack_is_paused(const Stack *self, StackVersion version) { +  return array_get(&self->heads, version)->status == StackStatusPaused; +} + +TSSymbol ts_stack_resume(Stack *self, StackVersion version) { +  StackHead *head = array_get(&self->heads, version); +  assert(head->status == StackStatusPaused); +  TSSymbol result = head->lookahead_when_paused; +  head->status = StackStatusActive; +  head->lookahead_when_paused = 0; +  return result; +} + +void ts_stack_clear(Stack *self) { +  stack_node_retain(self->base_node); +  for (uint32_t i = 0; i < self->heads.size; i++) { +    stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); +  } +  array_clear(&self->heads); +  array_push(&self->heads, ((StackHead){ +    .node = self->base_node, +    .last_external_token = NULL_SUBTREE, +    .status = StackStatusActive, +    .lookahead_when_paused = 0, +  })); +} + +bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) { +  array_reserve(&self->iterators, 32); +  bool was_recording_allocations = ts_toggle_allocation_recording(false); +  if (!f) f = stderr; + +  fprintf(f, "digraph stack {\n"); +  fprintf(f, "rankdir=\"RL\";\n"); +  fprintf(f, "edge [arrowhead=none]\n"); + +  Array(StackNode *) visited_nodes = array_new(); + +  array_clear(&self->iterators); +  for (uint32_t i = 0; i < self->heads.size; i++) { +    StackHead *head = &self->heads.contents[i]; +    if (head->status == StackStatusHalted) continue; + +    fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i); +    fprintf(f, "node_head_%u -> node_%p [", i, head->node); + +    if (head->status == 
StackStatusPaused) { +      fprintf(f, "color=red "); +    } +    fprintf(f, +      "label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u", +      i, +      ts_stack_node_count_since_error(self, i), +      ts_stack_error_cost(self, i) +    ); + +    if (head->last_external_token.ptr) { +      const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state; +      const char *data = ts_external_scanner_state_data(state); +      fprintf(f, "\nexternal_scanner_state:"); +      for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]); +    } + +    fprintf(f, "\"]\n"); +    array_push(&self->iterators, ((StackIterator){.node = head->node })); +  } + +  bool all_iterators_done = false; +  while (!all_iterators_done) { +    all_iterators_done = true; + +    for (uint32_t i = 0; i < self->iterators.size; i++) { +      StackIterator iterator = self->iterators.contents[i]; +      StackNode *node = iterator.node; + +      for (uint32_t j = 0; j < visited_nodes.size; j++) { +        if (visited_nodes.contents[j] == node) { +          node = NULL; +          break; +        } +      } + +      if (!node) continue; +      all_iterators_done = false; + +      fprintf(f, "node_%p [", node); +      if (node->state == ERROR_STATE) { +        fprintf(f, "label=\"?\""); +      } else if ( +        node->link_count == 1 && +        node->links[0].subtree.ptr && +        ts_subtree_extra(node->links[0].subtree) +      ) { +        fprintf(f, "shape=point margin=0 label=\"\""); +      } else { +        fprintf(f, "label=\"%d\"", node->state); +      } + +      fprintf( +        f, +        " tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n", +        node->position.extent.row + 1, +        node->position.extent.column, +        node->node_count, +        node->error_cost, +        node->dynamic_precedence +      ); + +      for (int j = 0; j < node->link_count; j++) { +      
  StackLink link = node->links[j]; +        fprintf(f, "node_%p -> node_%p [", node, link.node); +        if (link.is_pending) fprintf(f, "style=dashed "); +        if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray "); + +        if (!link.subtree.ptr) { +          fprintf(f, "color=red"); +        } else { +          fprintf(f, "label=\""); +          bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree); +          if (quoted) fprintf(f, "'"); +          const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree)); +          for (const char *c = name; *c; c++) { +            if (*c == '\"' || *c == '\\') fprintf(f, "\\"); +            fprintf(f, "%c", *c); +          } +          if (quoted) fprintf(f, "'"); +          fprintf(f, "\""); +          fprintf( +            f, +            "labeltooltip=\"error_cost: %u\ndynamic_precedence: %u\"", +            ts_subtree_error_cost(link.subtree), +            ts_subtree_dynamic_precedence(link.subtree) +          ); +        } + +        fprintf(f, "];\n"); + +        StackIterator *next_iterator; +        if (j == 0) { +          next_iterator = &self->iterators.contents[i]; +        } else { +          array_push(&self->iterators, iterator); +          next_iterator = array_back(&self->iterators); +        } +        next_iterator->node = link.node; +      } + +      array_push(&visited_nodes, node); +    } +  } + +  fprintf(f, "}\n"); + +  array_delete(&visited_nodes); +  ts_toggle_allocation_recording(was_recording_allocations); +  return true; +} + +#undef inline diff --git a/src/tree_sitter/stack.h b/src/tree_sitter/stack.h new file mode 100644 index 0000000000..ec7a69d2b4 --- /dev/null +++ b/src/tree_sitter/stack.h @@ -0,0 +1,135 @@ +#ifndef TREE_SITTER_PARSE_STACK_H_ +#define TREE_SITTER_PARSE_STACK_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./array.h" +#include "./subtree.h" +#include "./error_costs.h" 
+#include <stdio.h> + +typedef struct Stack Stack; + +typedef unsigned StackVersion; +#define STACK_VERSION_NONE ((StackVersion)-1) + +typedef struct { +  SubtreeArray subtrees; +  StackVersion version; +} StackSlice; +typedef Array(StackSlice) StackSliceArray; + +typedef struct { +  Length position; +  unsigned depth; +  TSStateId state; +} StackSummaryEntry; +typedef Array(StackSummaryEntry) StackSummary; + +// Create a stack. +Stack *ts_stack_new(SubtreePool *); + +// Release the memory reserved for a given stack. +void ts_stack_delete(Stack *); + +// Get the stack's current number of versions. +uint32_t ts_stack_version_count(const Stack *); + +// Get the state at the top of the given version of the stack. If the stack is +// empty, this returns the initial state, 0. +TSStateId ts_stack_state(const Stack *, StackVersion); + +// Get the last external token associated with a given version of the stack. +Subtree ts_stack_last_external_token(const Stack *, StackVersion); + +// Set the last external token associated with a given version of the stack. +void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree ); + +// Get the position of the given version of the stack within the document. +Length ts_stack_position(const Stack *, StackVersion); + +// Push a tree and state onto the given version of the stack. +// +// This transfers ownership of the tree to the Stack. Callers that +// need to retain ownership of the tree for their own purposes should +// first retain the tree. +void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId); + +// Pop the given number of entries from the given version of the stack. This +// operation can increase the number of stack versions by revealing multiple +// versions which had previously been merged. It returns an array that +// specifies the index of each revealed version and the trees that were +// removed from that version. 
+StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count); + +// Remove an error at the top of the given version of the stack. +SubtreeArray ts_stack_pop_error(Stack *, StackVersion); + +// Remove any pending trees from the top of the given version of the stack. +StackSliceArray ts_stack_pop_pending(Stack *, StackVersion); + +// Remove all trees from the given version of the stack. +StackSliceArray ts_stack_pop_all(Stack *, StackVersion); + +// Get the maximum number of tree nodes reachable from this version of the stack +// since the last error was detected. +unsigned ts_stack_node_count_since_error(const Stack *, StackVersion); + +int ts_stack_dynamic_precedence(Stack *, StackVersion); + +bool ts_stack_has_advanced_since_error(const Stack *, StackVersion); + +// Compute a summary of all the parse states near the top of the given +// version of the stack and store the summary for later retrieval. +void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); + +// Retrieve a summary of all the parse states near the top of the +// given version of the stack. +StackSummary *ts_stack_get_summary(Stack *, StackVersion); + +// Get the total cost of all errors on the given version of the stack. +unsigned ts_stack_error_cost(const Stack *, StackVersion version); + +// Merge the given two stack versions if possible, returning true +// if they were successfully merged and false otherwise. +bool ts_stack_merge(Stack *, StackVersion, StackVersion); + +// Determine whether the given two stack versions can be merged. 
+bool ts_stack_can_merge(Stack *, StackVersion, StackVersion); + +TSSymbol ts_stack_resume(Stack *, StackVersion); + +void ts_stack_pause(Stack *, StackVersion, TSSymbol); + +void ts_stack_halt(Stack *, StackVersion); + +bool ts_stack_is_active(const Stack *, StackVersion); + +bool ts_stack_is_paused(const Stack *, StackVersion); + +bool ts_stack_is_halted(const Stack *, StackVersion); + +void ts_stack_renumber_version(Stack *, StackVersion, StackVersion); + +void ts_stack_swap_versions(Stack *, StackVersion, StackVersion); + +StackVersion ts_stack_copy_version(Stack *, StackVersion); + +// Remove the given version from the stack. +void ts_stack_remove_version(Stack *, StackVersion); + +void ts_stack_clear(Stack *); + +bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *); + +typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t); + +void ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *); + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_PARSE_STACK_H_ diff --git a/src/tree_sitter/subtree.c b/src/tree_sitter/subtree.c new file mode 100644 index 0000000000..e95733eb46 --- /dev/null +++ b/src/tree_sitter/subtree.c @@ -0,0 +1,996 @@ +#include <assert.h> +#include <ctype.h> +#include <limits.h> +#include <stdbool.h> +#include <string.h> +#include <stdio.h> +#include "./alloc.h" +#include "./atomic.h" +#include "./subtree.h" +#include "./length.h" +#include "./language.h" +#include "./error_costs.h" +#include <stddef.h> + +typedef struct { +  Length start; +  Length old_end; +  Length new_end; +} Edit; + +#ifdef TREE_SITTER_TEST + +#define TS_MAX_INLINE_TREE_LENGTH 2 +#define TS_MAX_TREE_POOL_SIZE 0 + +#else + +#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX +#define TS_MAX_TREE_POOL_SIZE 32 + +#endif + +static const ExternalScannerState empty_state = {.length = 0, .short_data = {0}}; + +// ExternalScannerState + +void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) { +  
self->length = length; +  if (length > sizeof(self->short_data)) { +    self->long_data = ts_malloc(length); +    memcpy(self->long_data, data, length); +  } else { +    memcpy(self->short_data, data, length); +  } +} + +ExternalScannerState ts_external_scanner_state_copy(const ExternalScannerState *self) { +  ExternalScannerState result = *self; +  if (self->length > sizeof(self->short_data)) { +    result.long_data = ts_malloc(self->length); +    memcpy(result.long_data, self->long_data, self->length); +  } +  return result; +} + +void ts_external_scanner_state_delete(ExternalScannerState *self) { +  if (self->length > sizeof(self->short_data)) { +    ts_free(self->long_data); +  } +} + +const char *ts_external_scanner_state_data(const ExternalScannerState *self) { +  if (self->length > sizeof(self->short_data)) { +    return self->long_data; +  } else { +    return self->short_data; +  } +} + +bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) { +  return a == b || ( +    a->length == b->length && +    !memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length) +  ); +} + +// SubtreeArray + +void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) { +  dest->size = self.size; +  dest->capacity = self.capacity; +  dest->contents = self.contents; +  if (self.capacity > 0) { +    dest->contents = ts_calloc(self.capacity, sizeof(Subtree)); +    memcpy(dest->contents, self.contents, self.size * sizeof(Subtree)); +    for (uint32_t i = 0; i < self.size; i++) { +      ts_subtree_retain(dest->contents[i]); +    } +  } +} + +void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) { +  for (uint32_t i = 0; i < self->size; i++) { +    ts_subtree_release(pool, self->contents[i]); +  } +  array_delete(self); +} + +SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *self) { +  SubtreeArray result = array_new(); + +  uint32_t i = self->size - 1; +  for (; i + 1 > 0; 
i--) { +    Subtree child = self->contents[i]; +    if (!ts_subtree_extra(child)) break; +    array_push(&result, child); +  } + +  self->size = i + 1; +  ts_subtree_array_reverse(&result); +  return result; +} + +void ts_subtree_array_reverse(SubtreeArray *self) { +  for (uint32_t i = 0, limit = self->size / 2; i < limit; i++) { +    size_t reverse_index = self->size - 1 - i; +    Subtree swap = self->contents[i]; +    self->contents[i] = self->contents[reverse_index]; +    self->contents[reverse_index] = swap; +  } +} + +// SubtreePool + +SubtreePool ts_subtree_pool_new(uint32_t capacity) { +  SubtreePool self = {array_new(), array_new()}; +  array_reserve(&self.free_trees, capacity); +  return self; +} + +void ts_subtree_pool_delete(SubtreePool *self) { +  if (self->free_trees.contents) { +    for (unsigned i = 0; i < self->free_trees.size; i++) { +      ts_free(self->free_trees.contents[i].ptr); +    } +    array_delete(&self->free_trees); +  } +  if (self->tree_stack.contents) array_delete(&self->tree_stack); +} + +static SubtreeHeapData *ts_subtree_pool_allocate(SubtreePool *self) { +  if (self->free_trees.size > 0) { +    return array_pop(&self->free_trees).ptr; +  } else { +    return ts_malloc(sizeof(SubtreeHeapData)); +  } +} + +static void ts_subtree_pool_free(SubtreePool *self, SubtreeHeapData *tree) { +  if (self->free_trees.capacity > 0 && self->free_trees.size + 1 <= TS_MAX_TREE_POOL_SIZE) { +    array_push(&self->free_trees, (MutableSubtree) {.ptr = tree}); +  } else { +    ts_free(tree); +  } +} + +// Subtree + +static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t lookahead_bytes) { +  return +    padding.bytes < TS_MAX_INLINE_TREE_LENGTH && +    padding.extent.row < 16 && +    padding.extent.column < TS_MAX_INLINE_TREE_LENGTH && +    size.extent.row == 0 && +    size.extent.column < TS_MAX_INLINE_TREE_LENGTH && +    lookahead_bytes < 16; +} + +Subtree ts_subtree_new_leaf( +  SubtreePool *pool, TSSymbol symbol, Length 
padding, Length size, +  uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens, +  bool is_keyword, const TSLanguage *language +) { +  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); +  bool extra = symbol == ts_builtin_sym_end; + +  bool is_inline = ( +    symbol <= UINT8_MAX && +    !has_external_tokens && +    ts_subtree_can_inline(padding, size, lookahead_bytes) +  ); + +  if (is_inline) { +    return (Subtree) {{ +      .parse_state = parse_state, +      .symbol = symbol, +      .padding_bytes = padding.bytes, +      .padding_rows = padding.extent.row, +      .padding_columns = padding.extent.column, +      .size_bytes = size.bytes, +      .lookahead_bytes = lookahead_bytes, +      .visible = metadata.visible, +      .named = metadata.named, +      .extra = extra, +      .has_changes = false, +      .is_missing = false, +      .is_keyword = is_keyword, +      .is_inline = true, +    }}; +  } else { +    SubtreeHeapData *data = ts_subtree_pool_allocate(pool); +    *data = (SubtreeHeapData) { +      .ref_count = 1, +      .padding = padding, +      .size = size, +      .lookahead_bytes = lookahead_bytes, +      .error_cost = 0, +      .child_count = 0, +      .symbol = symbol, +      .parse_state = parse_state, +      .visible = metadata.visible, +      .named = metadata.named, +      .extra = extra, +      .fragile_left = false, +      .fragile_right = false, +      .has_changes = false, +      .has_external_tokens = has_external_tokens, +      .is_missing = false, +      .is_keyword = is_keyword, +      .first_leaf = {.symbol = 0, .parse_state = 0}, +    }; +    return (Subtree) {.ptr = data}; +  } +} + +void ts_subtree_set_symbol( +  MutableSubtree *self, +  TSSymbol symbol, +  const TSLanguage *language +) { +  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); +  if (self->data.is_inline) { +    assert(symbol < UINT8_MAX); +    self->data.symbol = symbol; +    self->data.named = 
metadata.named; +    self->data.visible = metadata.visible; +  } else { +    self->ptr->symbol = symbol; +    self->ptr->named = metadata.named; +    self->ptr->visible = metadata.visible; +  } +} + +Subtree ts_subtree_new_error( +  SubtreePool *pool, int32_t lookahead_char, Length padding, Length size, +  uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language +) { +  Subtree result = ts_subtree_new_leaf( +    pool, ts_builtin_sym_error, padding, size, bytes_scanned, +    parse_state, false, false, language +  ); +  SubtreeHeapData *data = (SubtreeHeapData *)result.ptr; +  data->fragile_left = true; +  data->fragile_right = true; +  data->lookahead_char = lookahead_char; +  return result; +} + +MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) { +  if (self.data.is_inline) return (MutableSubtree) {self.data}; +  if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self); + +  SubtreeHeapData *result = ts_subtree_pool_allocate(pool); +  memcpy(result, self.ptr, sizeof(SubtreeHeapData)); +  if (result->child_count > 0) { +    result->children = ts_calloc(self.ptr->child_count, sizeof(Subtree)); +    memcpy(result->children, self.ptr->children, result->child_count * sizeof(Subtree)); +    for (uint32_t i = 0; i < result->child_count; i++) { +      ts_subtree_retain(result->children[i]); +    } +  } else if (result->has_external_tokens) { +    result->external_scanner_state = ts_external_scanner_state_copy(&self.ptr->external_scanner_state); +  } +  result->ref_count = 1; +  ts_subtree_release(pool, self); +  return (MutableSubtree) {.ptr = result}; +} + +static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLanguage *language, +                                 MutableSubtreeArray *stack) { +  unsigned initial_stack_size = stack->size; + +  MutableSubtree tree = self; +  TSSymbol symbol = tree.ptr->symbol; +  for (unsigned i = 0; i < count; i++) { +    if (tree.ptr->ref_count > 1 || 
tree.ptr->child_count < 2) break; + +    MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); +    if ( +      child.data.is_inline || +      child.ptr->child_count < 2 || +      child.ptr->ref_count > 1 || +      child.ptr->symbol != symbol +    ) break; + +    MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[0]); +    if ( +      grandchild.data.is_inline || +      grandchild.ptr->child_count < 2 || +      grandchild.ptr->ref_count > 1 || +      grandchild.ptr->symbol != symbol +    ) break; + +    tree.ptr->children[0] = ts_subtree_from_mut(grandchild); +    child.ptr->children[0] = grandchild.ptr->children[grandchild.ptr->child_count - 1]; +    grandchild.ptr->children[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child); +    array_push(stack, tree); +    tree = grandchild; +  } + +  while (stack->size > initial_stack_size) { +    tree = array_pop(stack); +    MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); +    MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[child.ptr->child_count - 1]); +    ts_subtree_set_children(grandchild, grandchild.ptr->children, grandchild.ptr->child_count, language); +    ts_subtree_set_children(child, child.ptr->children, child.ptr->child_count, language); +    ts_subtree_set_children(tree, tree.ptr->children, tree.ptr->child_count, language); +  } +} + +void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *language) { +  array_clear(&pool->tree_stack); + +  if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) { +    array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); +  } + +  while (pool->tree_stack.size > 0) { +    MutableSubtree tree = array_pop(&pool->tree_stack); + +    if (tree.ptr->repeat_depth > 0) { +      Subtree child1 = tree.ptr->children[0]; +      Subtree child2 = tree.ptr->children[tree.ptr->child_count - 1]; +      if ( +        ts_subtree_child_count(child1) > 0 && +      
  ts_subtree_child_count(child2) > 0 && +        child1.ptr->repeat_depth > child2.ptr->repeat_depth +      ) { +        unsigned n = child1.ptr->repeat_depth - child2.ptr->repeat_depth; +        for (unsigned i = n / 2; i > 0; i /= 2) { +          ts_subtree__compress(tree, i, language, &pool->tree_stack); +          n -= i; +        } +      } +    } + +    for (uint32_t i = 0; i < tree.ptr->child_count; i++) { +      Subtree child = tree.ptr->children[i]; +      if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) { +        array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); +      } +    } +  } +} + +static inline uint32_t ts_subtree_repeat_depth(Subtree self) { +  return ts_subtree_child_count(self) ? self.ptr->repeat_depth : 0; +} + +void ts_subtree_set_children( +  MutableSubtree self, Subtree *children, uint32_t child_count, const TSLanguage *language +) { +  assert(!self.data.is_inline); + +  if (self.ptr->child_count > 0 && children != self.ptr->children) { +    ts_free(self.ptr->children); +  } + +  self.ptr->child_count = child_count; +  self.ptr->children = children; +  self.ptr->named_child_count = 0; +  self.ptr->visible_child_count = 0; +  self.ptr->error_cost = 0; +  self.ptr->repeat_depth = 0; +  self.ptr->node_count = 1; +  self.ptr->has_external_tokens = false; +  self.ptr->dynamic_precedence = 0; + +  uint32_t non_extra_index = 0; +  const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id); +  uint32_t lookahead_end_byte = 0; + +  for (uint32_t i = 0; i < self.ptr->child_count; i++) { +    Subtree child = self.ptr->children[i]; + +    if (i == 0) { +      self.ptr->padding = ts_subtree_padding(child); +      self.ptr->size = ts_subtree_size(child); +    } else { +      self.ptr->size = length_add(self.ptr->size, ts_subtree_total_size(child)); +    } + +    uint32_t child_lookahead_end_byte = +      self.ptr->padding.bytes + +      self.ptr->size.bytes + +      
ts_subtree_lookahead_bytes(child); +    if (child_lookahead_end_byte > lookahead_end_byte) lookahead_end_byte = child_lookahead_end_byte; + +    if (ts_subtree_symbol(child) != ts_builtin_sym_error_repeat) { +      self.ptr->error_cost += ts_subtree_error_cost(child); +    } + +    self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child); +    self.ptr->node_count += ts_subtree_node_count(child); + +    if (alias_sequence && alias_sequence[non_extra_index] != 0 && !ts_subtree_extra(child)) { +      self.ptr->visible_child_count++; +      if (ts_language_symbol_metadata(language, alias_sequence[non_extra_index]).named) { +        self.ptr->named_child_count++; +      } +    } else if (ts_subtree_visible(child)) { +      self.ptr->visible_child_count++; +      if (ts_subtree_named(child)) self.ptr->named_child_count++; +    } else if (ts_subtree_child_count(child) > 0) { +      self.ptr->visible_child_count += child.ptr->visible_child_count; +      self.ptr->named_child_count += child.ptr->named_child_count; +    } + +    if (ts_subtree_has_external_tokens(child)) self.ptr->has_external_tokens = true; + +    if (ts_subtree_is_error(child)) { +      self.ptr->fragile_left = self.ptr->fragile_right = true; +      self.ptr->parse_state = TS_TREE_STATE_NONE; +    } + +    if (!ts_subtree_extra(child)) non_extra_index++; +  } + +  self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes; + +  if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) { +    self.ptr->error_cost += +      ERROR_COST_PER_RECOVERY + +      ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes + +      ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row; +    for (uint32_t i = 0; i < self.ptr->child_count; i++) { +      Subtree child = self.ptr->children[i]; +      uint32_t grandchild_count = ts_subtree_child_count(child); +      if (ts_subtree_extra(child)) continue; +      if 
(ts_subtree_is_error(child) && grandchild_count == 0) continue; +      if (ts_subtree_visible(child)) { +        self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE; +      } else if (grandchild_count > 0) { +        self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count; +      } +    } +  } + +  if (self.ptr->child_count > 0) { +    Subtree first_child = self.ptr->children[0]; +    Subtree last_child = self.ptr->children[self.ptr->child_count - 1]; + +    self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child); +    self.ptr->first_leaf.parse_state = ts_subtree_leaf_parse_state(first_child); + +    if (ts_subtree_fragile_left(first_child)) self.ptr->fragile_left = true; +    if (ts_subtree_fragile_right(last_child)) self.ptr->fragile_right = true; + +    if ( +      self.ptr->child_count >= 2 && +      !self.ptr->visible && +      !self.ptr->named && +      ts_subtree_symbol(first_child) == self.ptr->symbol +    ) { +      if (ts_subtree_repeat_depth(first_child) > ts_subtree_repeat_depth(last_child)) { +        self.ptr->repeat_depth = ts_subtree_repeat_depth(first_child) + 1; +      } else { +        self.ptr->repeat_depth = ts_subtree_repeat_depth(last_child) + 1; +      } +    } +  } +} + +MutableSubtree ts_subtree_new_node(SubtreePool *pool, TSSymbol symbol, +                                   SubtreeArray *children, unsigned production_id, +                                   const TSLanguage *language) { +  TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); +  bool fragile = symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat; +  SubtreeHeapData *data = ts_subtree_pool_allocate(pool); +  *data = (SubtreeHeapData) { +    .ref_count = 1, +    .symbol = symbol, +    .production_id = production_id, +    .visible = metadata.visible, +    .named = metadata.named, +    .has_changes = false, +    .fragile_left = fragile, +    .fragile_right = fragile, +    .is_keyword = false, +    
.node_count = 0, +    .first_leaf = {.symbol = 0, .parse_state = 0}, +  }; +  MutableSubtree result = {.ptr = data}; +  ts_subtree_set_children(result, children->contents, children->size, language); +  return result; +} + +Subtree ts_subtree_new_error_node(SubtreePool *pool, SubtreeArray *children, +                                  bool extra, const TSLanguage *language) { +  MutableSubtree result = ts_subtree_new_node( +    pool, ts_builtin_sym_error, children, 0, language +  ); +  result.ptr->extra = extra; +  return ts_subtree_from_mut(result); +} + +Subtree ts_subtree_new_missing_leaf(SubtreePool *pool, TSSymbol symbol, Length padding, +                                    const TSLanguage *language) { +  Subtree result = ts_subtree_new_leaf( +    pool, symbol, padding, length_zero(), 0, +    0, false, false, language +  ); + +  if (result.data.is_inline) { +    result.data.is_missing = true; +  } else { +    ((SubtreeHeapData *)result.ptr)->is_missing = true; +  } + +  return result; +} + +void ts_subtree_retain(Subtree self) { +  if (self.data.is_inline) return; +  assert(self.ptr->ref_count > 0); +  atomic_inc((volatile uint32_t *)&self.ptr->ref_count); +  assert(self.ptr->ref_count != 0); +} + +void ts_subtree_release(SubtreePool *pool, Subtree self) { +  if (self.data.is_inline) return; +  array_clear(&pool->tree_stack); + +  assert(self.ptr->ref_count > 0); +  if (atomic_dec((volatile uint32_t *)&self.ptr->ref_count) == 0) { +    array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); +  } + +  while (pool->tree_stack.size > 0) { +    MutableSubtree tree = array_pop(&pool->tree_stack); +    if (tree.ptr->child_count > 0) { +      for (uint32_t i = 0; i < tree.ptr->child_count; i++) { +        Subtree child = tree.ptr->children[i]; +        if (child.data.is_inline) continue; +        assert(child.ptr->ref_count > 0); +        if (atomic_dec((volatile uint32_t *)&child.ptr->ref_count) == 0) { +          array_push(&pool->tree_stack, 
ts_subtree_to_mut_unsafe(child)); +        } +      } +      ts_free(tree.ptr->children); +    } else if (tree.ptr->has_external_tokens) { +      ts_external_scanner_state_delete(&tree.ptr->external_scanner_state); +    } +    ts_subtree_pool_free(pool, tree.ptr); +  } +} + +bool ts_subtree_eq(Subtree self, Subtree other) { +  if (self.data.is_inline || other.data.is_inline) { +    return memcmp(&self, &other, sizeof(SubtreeInlineData)) == 0; +  } + +  if (self.ptr) { +    if (!other.ptr) return false; +  } else { +    return !other.ptr; +  } + +  if (self.ptr->symbol != other.ptr->symbol) return false; +  if (self.ptr->visible != other.ptr->visible) return false; +  if (self.ptr->named != other.ptr->named) return false; +  if (self.ptr->padding.bytes != other.ptr->padding.bytes) return false; +  if (self.ptr->size.bytes != other.ptr->size.bytes) return false; +  if (self.ptr->symbol == ts_builtin_sym_error) return self.ptr->lookahead_char == other.ptr->lookahead_char; +  if (self.ptr->child_count != other.ptr->child_count) return false; +  if (self.ptr->child_count > 0) { +    if (self.ptr->visible_child_count != other.ptr->visible_child_count) return false; +    if (self.ptr->named_child_count != other.ptr->named_child_count) return false; + +    for (uint32_t i = 0; i < self.ptr->child_count; i++) { +      if (!ts_subtree_eq(self.ptr->children[i], other.ptr->children[i])) { +        return false; +      } +    } +  } +  return true; +} + +int ts_subtree_compare(Subtree left, Subtree right) { +  if (ts_subtree_symbol(left) < ts_subtree_symbol(right)) return -1; +  if (ts_subtree_symbol(right) < ts_subtree_symbol(left)) return 1; +  if (ts_subtree_child_count(left) < ts_subtree_child_count(right)) return -1; +  if (ts_subtree_child_count(right) < ts_subtree_child_count(left)) return 1; +  for (uint32_t i = 0, n = ts_subtree_child_count(left); i < n; i++) { +    Subtree left_child = left.ptr->children[i]; +    Subtree right_child = right.ptr->children[i]; +    
switch (ts_subtree_compare(left_child, right_child)) { +      case -1: return -1; +      case 1: return 1; +      default: break; +    } +  } +  return 0; +} + +static inline void ts_subtree_set_has_changes(MutableSubtree *self) { +  if (self->data.is_inline) { +    self->data.has_changes = true; +  } else { +    self->ptr->has_changes = true; +  } +} + +Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool) { +  typedef struct { +    Subtree *tree; +    Edit edit; +  } StackEntry; + +  Array(StackEntry) stack = array_new(); +  array_push(&stack, ((StackEntry) { +    .tree = &self, +    .edit = (Edit) { +      .start = {edit->start_byte, edit->start_point}, +      .old_end = {edit->old_end_byte, edit->old_end_point}, +      .new_end = {edit->new_end_byte, edit->new_end_point}, +    }, +  })); + +  while (stack.size) { +    StackEntry entry = array_pop(&stack); +    Edit edit = entry.edit; +    bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes; +    bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes; + +    Length size = ts_subtree_size(*entry.tree); +    Length padding = ts_subtree_padding(*entry.tree); +    uint32_t lookahead_bytes = ts_subtree_lookahead_bytes(*entry.tree); +    uint32_t end_byte = padding.bytes + size.bytes + lookahead_bytes; +    if (edit.start.bytes > end_byte || (is_noop && edit.start.bytes == end_byte)) continue; + +    // If the edit is entirely within the space before this subtree, then shift this +    // subtree over according to the edit without changing its size. +    if (edit.old_end.bytes <= padding.bytes) { +      padding = length_add(edit.new_end, length_sub(padding, edit.old_end)); +    } + +    // If the edit starts in the space before this subtree and extends into this subtree, +    // shrink the subtree's content to compensate for the change in the space before it. 
+    else if (edit.start.bytes < padding.bytes) { +      size = length_sub(size, length_sub(edit.old_end, padding)); +      padding = edit.new_end; +    } + +    // If the edit is a pure insertion right at the start of the subtree, +    // shift the subtree over according to the insertion. +    else if (edit.start.bytes == padding.bytes && is_pure_insertion) { +      padding = edit.new_end; +    } + +    // If the edit is within this subtree, resize the subtree to reflect the edit. +    else { +      uint32_t total_bytes = padding.bytes + size.bytes; +      if (edit.start.bytes < total_bytes || +         (edit.start.bytes == total_bytes && is_pure_insertion)) { +        size = length_add( +          length_sub(edit.new_end, padding), +          length_sub(size, length_sub(edit.old_end, padding)) +        ); +      } +    } + +    MutableSubtree result = ts_subtree_make_mut(pool, *entry.tree); + +    if (result.data.is_inline) { +      if (ts_subtree_can_inline(padding, size, lookahead_bytes)) { +        result.data.padding_bytes = padding.bytes; +        result.data.padding_rows = padding.extent.row; +        result.data.padding_columns = padding.extent.column; +        result.data.size_bytes = size.bytes; +      } else { +        SubtreeHeapData *data = ts_subtree_pool_allocate(pool); +        data->ref_count = 1; +        data->padding = padding; +        data->size = size; +        data->lookahead_bytes = lookahead_bytes; +        data->error_cost = 0; +        data->child_count = 0; +        data->symbol = result.data.symbol; +        data->parse_state = result.data.parse_state; +        data->visible = result.data.visible; +        data->named = result.data.named; +        data->extra = result.data.extra; +        data->fragile_left = false; +        data->fragile_right = false; +        data->has_changes = false; +        data->has_external_tokens = false; +        data->is_missing = result.data.is_missing; +        data->is_keyword = result.data.is_keyword; 
+        result.ptr = data; +      } +    } else { +      result.ptr->padding = padding; +      result.ptr->size = size; +    } + +    ts_subtree_set_has_changes(&result); +    *entry.tree = ts_subtree_from_mut(result); + +    Length child_left, child_right = length_zero(); +    for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; i++) { +      Subtree *child = &result.ptr->children[i]; +      Length child_size = ts_subtree_total_size(*child); +      child_left = child_right; +      child_right = length_add(child_left, child_size); + +      // If this child ends before the edit, it is not affected. +      if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue; + +      // If this child starts after the edit, then we're done processing children. +      if (child_left.bytes > edit.old_end.bytes || +          (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)) break; + +      // Transform edit into the child's coordinate space. +      Edit child_edit = { +        .start = length_sub(edit.start, child_left), +        .old_end = length_sub(edit.old_end, child_left), +        .new_end = length_sub(edit.new_end, child_left), +      }; + +      // Clamp child_edit to the child's bounds. +      if (edit.start.bytes < child_left.bytes) child_edit.start = length_zero(); +      if (edit.old_end.bytes < child_left.bytes) child_edit.old_end = length_zero(); +      if (edit.new_end.bytes < child_left.bytes) child_edit.new_end = length_zero(); +      if (edit.old_end.bytes > child_right.bytes) child_edit.old_end = child_size; + +      // Interpret all inserted text as applying to the *first* child that touches the edit. +      // Subsequent children never have any text inserted into them; they are only +      // shrunk to compensate for the edit. 
+      if (child_right.bytes > edit.start.bytes || +          (child_right.bytes == edit.start.bytes && is_pure_insertion)) { +        edit.new_end = edit.start; +      } + +      // Children that occur before the edit are not reshaped by the edit. +      else { +        child_edit.old_end = child_edit.start; +        child_edit.new_end = child_edit.start; +      } + +      // Queue processing of this child's subtree. +      array_push(&stack, ((StackEntry) { +        .tree = child, +        .edit = child_edit, +      })); +    } +  } + +  array_delete(&stack); +  return self; +} + +Subtree ts_subtree_last_external_token(Subtree tree) { +  if (!ts_subtree_has_external_tokens(tree)) return NULL_SUBTREE; +  while (tree.ptr->child_count > 0) { +    for (uint32_t i = tree.ptr->child_count - 1; i + 1 > 0; i--) { +      Subtree child = tree.ptr->children[i]; +      if (ts_subtree_has_external_tokens(child)) { +        tree = child; +        break; +      } +    } +  } +  return tree; +} + +static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) { +  if (c == 0) +    return snprintf(s, n, "EOF"); +  if (c == -1) +    return snprintf(s, n, "INVALID"); +  else if (c == '\n') +    return snprintf(s, n, "'\\n'"); +  else if (c == '\t') +    return snprintf(s, n, "'\\t'"); +  else if (c == '\r') +    return snprintf(s, n, "'\\r'"); +  else if (0 < c && c < 128 && isprint(c)) +    return snprintf(s, n, "'%c'", c); +  else +    return snprintf(s, n, "%d", c); +} + +static void ts_subtree__write_dot_string(FILE *f, const char *string) { +  for (const char *c = string; *c; c++) { +    if (*c == '"') { +      fputs("\\\"", f); +    } else if (*c == '\n') { +      fputs("\\n", f); +    } else { +      fputc(*c, f); +    } +  } +} + +static const char *ROOT_FIELD = "__ROOT__"; + +static size_t ts_subtree__write_to_string( +  Subtree self, char *string, size_t limit, +  const TSLanguage *language, bool include_all, +  TSSymbol alias_symbol, bool alias_is_named, 
const char *field_name +) { +  if (!self.ptr) return snprintf(string, limit, "(NULL)"); + +  char *cursor = string; +  char **writer = (limit > 0) ? &cursor : &string; +  bool is_root = field_name == ROOT_FIELD; +  bool is_visible = +    include_all || +    ts_subtree_missing(self) || +    ( +      alias_symbol +        ? alias_is_named +        : ts_subtree_visible(self) && ts_subtree_named(self) +    ); + +  if (is_visible) { +    if (!is_root) { +      cursor += snprintf(*writer, limit, " "); +      if (field_name) { +        cursor += snprintf(*writer, limit, "%s: ", field_name); +      } +    } + +    if (ts_subtree_is_error(self) && ts_subtree_child_count(self) == 0 && self.ptr->size.bytes > 0) { +      cursor += snprintf(*writer, limit, "(UNEXPECTED "); +      cursor += ts_subtree__write_char_to_string(*writer, limit, self.ptr->lookahead_char); +    } else { +      TSSymbol symbol = alias_symbol ? alias_symbol : ts_subtree_symbol(self); +      const char *symbol_name = ts_language_symbol_name(language, symbol); +      if (ts_subtree_missing(self)) { +        cursor += snprintf(*writer, limit, "(MISSING "); +        if (alias_is_named || ts_subtree_named(self)) { +          cursor += snprintf(*writer, limit, "%s", symbol_name); +        } else { +          cursor += snprintf(*writer, limit, "\"%s\"", symbol_name); +        } +      } else { +        cursor += snprintf(*writer, limit, "(%s", symbol_name); +      } +    } +  } else if (is_root) { +    TSSymbol symbol = ts_subtree_symbol(self); +    const char *symbol_name = ts_language_symbol_name(language, symbol); +    cursor += snprintf(*writer, limit, "(\"%s\")", symbol_name); +  } + +  if (ts_subtree_child_count(self)) { +    const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id); +    const TSFieldMapEntry *field_map, *field_map_end; +    ts_language_field_map( +      language, +      self.ptr->production_id, +      &field_map, +      &field_map_end +    ); + +    
uint32_t structural_child_index = 0; +    for (uint32_t i = 0; i < self.ptr->child_count; i++) { +      Subtree child = self.ptr->children[i]; +      if (ts_subtree_extra(child)) { +        cursor += ts_subtree__write_to_string( +          child, *writer, limit, +          language, include_all, +          0, false, NULL +        ); +      } else { +        TSSymbol alias_symbol = alias_sequence +          ? alias_sequence[structural_child_index] +          : 0; +        bool alias_is_named = alias_symbol +          ? ts_language_symbol_metadata(language, alias_symbol).named +          : false; + +        const char *child_field_name = is_visible ? NULL : field_name; +        for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { +          if (!i->inherited && i->child_index == structural_child_index) { +            child_field_name = language->field_names[i->field_id]; +            break; +          } +        } + +        cursor += ts_subtree__write_to_string( +          child, *writer, limit, +          language, include_all, +          alias_symbol, alias_is_named, child_field_name +        ); +        structural_child_index++; +      } +    } +  } + +  if (is_visible) cursor += snprintf(*writer, limit, ")"); + +  return cursor - string; +} + +char *ts_subtree_string( +  Subtree self, +  const TSLanguage *language, +  bool include_all +) { +  char scratch_string[1]; +  size_t size = ts_subtree__write_to_string( +    self, scratch_string, 0, +    language, include_all, +    0, false, ROOT_FIELD +  ) + 1; +  char *result = malloc(size * sizeof(char)); +  ts_subtree__write_to_string( +    self, result, size, +    language, include_all, +    0, false, ROOT_FIELD +  ); +  return result; +} + +void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, +                                 const TSLanguage *language, TSSymbol alias_symbol, +                                 FILE *f) { +  TSSymbol subtree_symbol = ts_subtree_symbol(*self); + 
 TSSymbol symbol = alias_symbol ? alias_symbol : subtree_symbol; +  uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self); +  fprintf(f, "tree_%p [label=\"", self); +  ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol)); +  fprintf(f, "\""); + +  if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext"); +  if (ts_subtree_extra(*self)) fprintf(f, ", fontcolor=gray"); + +  fprintf(f, ", tooltip=\"" +    "range: %u - %u\n" +    "state: %d\n" +    "error-cost: %u\n" +    "has-changes: %u\n" +    "repeat-depth: %u\n" +    "lookahead-bytes: %u", +    start_offset, end_offset, +    ts_subtree_parse_state(*self), +    ts_subtree_error_cost(*self), +    ts_subtree_has_changes(*self), +    ts_subtree_repeat_depth(*self), +    ts_subtree_lookahead_bytes(*self) +  ); + +  if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0) { +    fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char); +  } + +  fprintf(f, "\"]\n"); + +  uint32_t child_start_offset = start_offset; +  uint32_t child_info_offset = +    language->max_alias_sequence_length * +    ts_subtree_production_id(*self); +  for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) { +    const Subtree *child = &self->ptr->children[i]; +    TSSymbol alias_symbol = 0; +    if (!ts_subtree_extra(*child) && child_info_offset) { +      alias_symbol = language->alias_sequences[child_info_offset]; +      child_info_offset++; +    } +    ts_subtree__print_dot_graph(child, child_start_offset, language, alias_symbol, f); +    fprintf(f, "tree_%p -> tree_%p [tooltip=%u]\n", self, child, i); +    child_start_offset += ts_subtree_total_bytes(*child); +  } +} + +void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f) { +  fprintf(f, "digraph tree {\n"); +  fprintf(f, "edge [arrowhead=none]\n"); +  ts_subtree__print_dot_graph(&self, 0, language, 0, f); +  fprintf(f, "}\n"); +} + +bool ts_subtree_external_scanner_state_eq(Subtree 
self, Subtree other) { +  const ExternalScannerState *state1 = &empty_state; +  const ExternalScannerState *state2 = &empty_state; +  if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) { +    state1 = &self.ptr->external_scanner_state; +  } +  if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) { +    state2 = &other.ptr->external_scanner_state; +  } +  return ts_external_scanner_state_eq(state1, state2); +} diff --git a/src/tree_sitter/subtree.h b/src/tree_sitter/subtree.h new file mode 100644 index 0000000000..79ccd92390 --- /dev/null +++ b/src/tree_sitter/subtree.h @@ -0,0 +1,281 @@ +#ifndef TREE_SITTER_SUBTREE_H_ +#define TREE_SITTER_SUBTREE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <limits.h> +#include <stdbool.h> +#include <stdio.h> +#include "./length.h" +#include "./array.h" +#include "./error_costs.h" +#include "tree_sitter/api.h" +#include "tree_sitter/parser.h" + +static const TSStateId TS_TREE_STATE_NONE = USHRT_MAX; +#define NULL_SUBTREE ((Subtree) {.ptr = NULL}) + +typedef union Subtree Subtree; +typedef union MutableSubtree MutableSubtree; + +typedef struct { +  union { +    char *long_data; +    char short_data[24]; +  }; +  uint32_t length; +} ExternalScannerState; + +typedef struct { +  bool is_inline : 1; +  bool visible : 1; +  bool named : 1; +  bool extra : 1; +  bool has_changes : 1; +  bool is_missing : 1; +  bool is_keyword : 1; +  uint8_t symbol; +  uint8_t padding_bytes; +  uint8_t size_bytes; +  uint8_t padding_columns; +  uint8_t padding_rows : 4; +  uint8_t lookahead_bytes : 4; +  uint16_t parse_state; +} SubtreeInlineData; + +typedef struct { +  volatile uint32_t ref_count; +  Length padding; +  Length size; +  uint32_t lookahead_bytes; +  uint32_t error_cost; +  uint32_t child_count; +  TSSymbol symbol; +  TSStateId parse_state; + +  bool visible : 1; +  bool named : 1; +  bool extra : 1; +  bool fragile_left : 1; +  bool fragile_right : 1; +  bool 
has_changes : 1; +  bool has_external_tokens : 1; +  bool is_missing : 1; +  bool is_keyword : 1; + +  union { +    // Non-terminal subtrees (`child_count > 0`) +    struct { +      Subtree *children; +      uint32_t visible_child_count; +      uint32_t named_child_count; +      uint32_t node_count; +      uint32_t repeat_depth; +      int32_t dynamic_precedence; +      uint16_t production_id; +      struct { +        TSSymbol symbol; +        TSStateId parse_state; +      } first_leaf; +    }; + +    // External terminal subtrees (`child_count == 0 && has_external_tokens`) +    ExternalScannerState external_scanner_state; + +    // Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`) +    int32_t lookahead_char; +  }; +} SubtreeHeapData; + +union Subtree { +  SubtreeInlineData data; +  const SubtreeHeapData *ptr; +}; + +union MutableSubtree { +  SubtreeInlineData data; +  SubtreeHeapData *ptr; +}; + +typedef Array(Subtree) SubtreeArray; +typedef Array(MutableSubtree) MutableSubtreeArray; + +typedef struct { +  MutableSubtreeArray free_trees; +  MutableSubtreeArray tree_stack; +} SubtreePool; + +void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned); +const char *ts_external_scanner_state_data(const ExternalScannerState *); + +void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); +void ts_subtree_array_delete(SubtreePool *, SubtreeArray *); +SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *); +void ts_subtree_array_reverse(SubtreeArray *); + +SubtreePool ts_subtree_pool_new(uint32_t capacity); +void ts_subtree_pool_delete(SubtreePool *); + +Subtree ts_subtree_new_leaf( +  SubtreePool *, TSSymbol, Length, Length, uint32_t, +  TSStateId, bool, bool, const TSLanguage * +); +Subtree ts_subtree_new_error( +  SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage * +); +MutableSubtree ts_subtree_new_node(SubtreePool *, TSSymbol, SubtreeArray *, unsigned, const TSLanguage 
*); +Subtree ts_subtree_new_error_node(SubtreePool *, SubtreeArray *, bool, const TSLanguage *); +Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, const TSLanguage *); +MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree); +void ts_subtree_retain(Subtree); +void ts_subtree_release(SubtreePool *, Subtree); +bool ts_subtree_eq(Subtree, Subtree); +int ts_subtree_compare(Subtree, Subtree); +void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); +void ts_subtree_set_children(MutableSubtree, Subtree *, uint32_t, const TSLanguage *); +void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); +Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); +char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all); +void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *); +Subtree ts_subtree_last_external_token(Subtree); +bool ts_subtree_external_scanner_state_eq(Subtree, Subtree); + +#define SUBTREE_GET(self, name) (self.data.is_inline ? 
self.data.name : self.ptr->name) + +static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); } +static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); } +static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); } +static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); } +static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); } +static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); } +static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); } +static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); } +static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); } + +#undef SUBTREE_GET + +static inline void ts_subtree_set_extra(MutableSubtree *self) { +  if (self->data.is_inline) { +    self->data.extra = true; +  } else { +    self->ptr->extra = true; +  } +} + +static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) { +  if (self.data.is_inline) return self.data.symbol; +  if (self.ptr->child_count == 0) return self.ptr->symbol; +  return self.ptr->first_leaf.symbol; +} + +static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) { +  if (self.data.is_inline) return self.data.parse_state; +  if (self.ptr->child_count == 0) return self.ptr->parse_state; +  return self.ptr->first_leaf.parse_state; +} + +static inline Length ts_subtree_padding(Subtree self) { +  if (self.data.is_inline) { +    Length result = {self.data.padding_bytes, {self.data.padding_rows, self.data.padding_columns}}; +    return result; +  } else { +    return self.ptr->padding; +  } +} + +static inline Length ts_subtree_size(Subtree self) { +  if (self.data.is_inline) { +    Length result = {self.data.size_bytes, {0, self.data.size_bytes}}; +    return 
result; +  } else { +    return self.ptr->size; +  } +} + +static inline Length ts_subtree_total_size(Subtree self) { +  return length_add(ts_subtree_padding(self), ts_subtree_size(self)); +} + +static inline uint32_t ts_subtree_total_bytes(Subtree self) { +  return ts_subtree_total_size(self).bytes; +} + +static inline uint32_t ts_subtree_child_count(Subtree self) { +  return self.data.is_inline ? 0 : self.ptr->child_count; +} + +static inline uint32_t ts_subtree_node_count(Subtree self) { +  return (self.data.is_inline || self.ptr->child_count == 0) ? 1 : self.ptr->node_count; +} + +static inline uint32_t ts_subtree_visible_child_count(Subtree self) { +  if (ts_subtree_child_count(self) > 0) { +    return self.ptr->visible_child_count; +  } else { +    return 0; +  } +} + +static inline uint32_t ts_subtree_error_cost(Subtree self) { +  if (ts_subtree_missing(self)) { +    return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY; +  } else { +    return self.data.is_inline ? 0 : self.ptr->error_cost; +  } +} + +static inline int32_t ts_subtree_dynamic_precedence(Subtree self) { +  return (self.data.is_inline || self.ptr->child_count == 0) ? 0 : self.ptr->dynamic_precedence; +} + +static inline uint16_t ts_subtree_production_id(Subtree self) { +  if (ts_subtree_child_count(self) > 0) { +    return self.ptr->production_id; +  } else { +    return 0; +  } +} + +static inline bool ts_subtree_fragile_left(Subtree self) { +  return self.data.is_inline ? false : self.ptr->fragile_left; +} + +static inline bool ts_subtree_fragile_right(Subtree self) { +  return self.data.is_inline ? false : self.ptr->fragile_right; +} + +static inline bool ts_subtree_has_external_tokens(Subtree self) { +  return self.data.is_inline ? false : self.ptr->has_external_tokens; +} + +static inline bool ts_subtree_is_fragile(Subtree self) { +  return self.data.is_inline ? 
false : (self.ptr->fragile_left || self.ptr->fragile_right); +} + +static inline bool ts_subtree_is_error(Subtree self) { +  return ts_subtree_symbol(self) == ts_builtin_sym_error; +} + +static inline bool ts_subtree_is_eof(Subtree self) { +  return ts_subtree_symbol(self) == ts_builtin_sym_end; +} + +static inline Subtree ts_subtree_from_mut(MutableSubtree self) { +  Subtree result; +  result.data = self.data; +  return result; +} + +static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) { +  MutableSubtree result; +  result.data = self.data; +  return result; +} + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_SUBTREE_H_ diff --git a/src/tree_sitter/tree.c b/src/tree_sitter/tree.c new file mode 100644 index 0000000000..04cb1d242f --- /dev/null +++ b/src/tree_sitter/tree.c @@ -0,0 +1,149 @@ +#include "tree_sitter/api.h" +#include "./array.h" +#include "./get_changed_ranges.h" +#include "./subtree.h" +#include "./tree_cursor.h" +#include "./tree.h" + +static const unsigned PARENT_CACHE_CAPACITY = 32; + +TSTree *ts_tree_new( +  Subtree root, const TSLanguage *language, +  const TSRange *included_ranges, unsigned included_range_count +) { +  TSTree *result = ts_malloc(sizeof(TSTree)); +  result->root = root; +  result->language = language; +  result->parent_cache = NULL; +  result->parent_cache_start = 0; +  result->parent_cache_size = 0; +  result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange)); +  memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange)); +  result->included_range_count = included_range_count; +  return result; +} + +TSTree *ts_tree_copy(const TSTree *self) { +  ts_subtree_retain(self->root); +  return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count); +} + +void ts_tree_delete(TSTree *self) { +  if (!self) return; + +  SubtreePool pool = ts_subtree_pool_new(0); +  ts_subtree_release(&pool, self->root); +  
ts_subtree_pool_delete(&pool); +  ts_free(self->included_ranges); +  if (self->parent_cache) ts_free(self->parent_cache); +  ts_free(self); +} + +TSNode ts_tree_root_node(const TSTree *self) { +  return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0); +} + +const TSLanguage *ts_tree_language(const TSTree *self) { +  return self->language; +} + +void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { +  for (unsigned i = 0; i < self->included_range_count; i++) { +    TSRange *range = &self->included_ranges[i]; +    if (range->end_byte >= edit->old_end_byte) { +      if (range->end_byte != UINT32_MAX) { +        range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte); +        range->end_point = point_add( +          edit->new_end_point, +          point_sub(range->end_point, edit->old_end_point) +        ); +        if (range->end_byte < edit->new_end_byte) { +          range->end_byte = UINT32_MAX; +          range->end_point = POINT_MAX; +        } +      } +      if (range->start_byte >= edit->old_end_byte) { +        range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); +        range->start_point = point_add( +          edit->new_end_point, +          point_sub(range->start_point, edit->old_end_point) +        ); +        if (range->start_byte < edit->new_end_byte) { +          range->start_byte = UINT32_MAX; +          range->start_point = POINT_MAX; +        } +      } +    } +  } + +  SubtreePool pool = ts_subtree_pool_new(0); +  self->root = ts_subtree_edit(self->root, edit, &pool); +  self->parent_cache_start = 0; +  self->parent_cache_size = 0; +  ts_subtree_pool_delete(&pool); +} + +TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) { +  TSRange *result; +  TreeCursor cursor1 = {NULL, array_new()}; +  TreeCursor cursor2 = {NULL, array_new()}; +  TSNode root = ts_tree_root_node(self); +  ts_tree_cursor_init(&cursor1, root); +  
ts_tree_cursor_init(&cursor2, root); + +  TSRangeArray included_range_differences = array_new(); +  ts_range_array_get_changed_ranges( +    self->included_ranges, self->included_range_count, +    other->included_ranges, other->included_range_count, +    &included_range_differences +  ); + +  *count = ts_subtree_get_changed_ranges( +    &self->root, &other->root, &cursor1, &cursor2, +    self->language, &included_range_differences, &result +  ); + +  array_delete(&included_range_differences); +  array_delete(&cursor1.stack); +  array_delete(&cursor2.stack); +  return result; +} + +void ts_tree_print_dot_graph(const TSTree *self, FILE *file) { +  ts_subtree_print_dot_graph(self->root, self->language, file); +} + +TSNode ts_tree_get_cached_parent(const TSTree *self, const TSNode *node) { +  for (uint32_t i = 0; i < self->parent_cache_size; i++) { +    uint32_t index = (self->parent_cache_start + i) % PARENT_CACHE_CAPACITY; +    ParentCacheEntry *entry = &self->parent_cache[index]; +    if (entry->child == node->id) { +      return ts_node_new(self, entry->parent, entry->position, entry->alias_symbol); +    } +  } +  return ts_node_new(NULL, NULL, length_zero(), 0); +} + +void ts_tree_set_cached_parent(const TSTree *_self, const TSNode *node, const TSNode *parent) { +  TSTree *self = (TSTree *)_self; +  if (!self->parent_cache) { +    self->parent_cache = ts_calloc(PARENT_CACHE_CAPACITY, sizeof(ParentCacheEntry)); +  } + +  uint32_t index = (self->parent_cache_start + self->parent_cache_size) % PARENT_CACHE_CAPACITY; +  self->parent_cache[index] = (ParentCacheEntry) { +    .child = node->id, +    .parent = (const Subtree *)parent->id, +    .position = { +      parent->context[0], +      {parent->context[1], parent->context[2]} +    }, +    .alias_symbol = parent->context[3], +  }; + +  if (self->parent_cache_size == PARENT_CACHE_CAPACITY) { +    self->parent_cache_start++; +  } else { +    self->parent_cache_size++; +  } +} diff --git a/src/tree_sitter/tree.h 
b/src/tree_sitter/tree.h new file mode 100644 index 0000000000..92a7e64179 --- /dev/null +++ b/src/tree_sitter/tree.h @@ -0,0 +1,34 @@ +#ifndef TREE_SITTER_TREE_H_ +#define TREE_SITTER_TREE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { +  const Subtree *child; +  const Subtree *parent; +  Length position; +  TSSymbol alias_symbol; +} ParentCacheEntry; + +struct TSTree { +  Subtree root; +  const TSLanguage *language; +  ParentCacheEntry *parent_cache; +  uint32_t parent_cache_start; +  uint32_t parent_cache_size; +  TSRange *included_ranges; +  unsigned included_range_count; +}; + +TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned); +TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol); +TSNode ts_tree_get_cached_parent(const TSTree *, const TSNode *); +void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *); + +#ifdef __cplusplus +} +#endif + +#endif  // TREE_SITTER_TREE_H_ diff --git a/src/tree_sitter/tree_cursor.c b/src/tree_sitter/tree_cursor.c new file mode 100644 index 0000000000..7103fc411d --- /dev/null +++ b/src/tree_sitter/tree_cursor.c @@ -0,0 +1,302 @@ +#include "tree_sitter/api.h" +#include "./alloc.h" +#include "./tree_cursor.h" +#include "./language.h" +#include "./tree.h" + +typedef struct { +  Subtree parent; +  const TSTree *tree; +  Length position; +  uint32_t child_index; +  uint32_t structural_child_index; +  const TSSymbol *alias_sequence; +} CursorChildIterator; + +// CursorChildIterator + +static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) { +  TreeCursorEntry *last_entry = array_back(&self->stack); +  if (ts_subtree_child_count(*last_entry->subtree) == 0) { +    return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, NULL}; +  } +  const TSSymbol *alias_sequence = ts_language_alias_sequence( +    self->tree->language, +    last_entry->subtree->ptr->production_id +  ); +  return 
(CursorChildIterator) { +    .tree = self->tree, +    .parent = *last_entry->subtree, +    .position = last_entry->position, +    .child_index = 0, +    .structural_child_index = 0, +    .alias_sequence = alias_sequence, +  }; +} + +static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self, +                                                      TreeCursorEntry *result, +                                                      bool *visible) { +  if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false; +  const Subtree *child = &self->parent.ptr->children[self->child_index]; +  *result = (TreeCursorEntry) { +    .subtree = child, +    .position = self->position, +    .child_index = self->child_index, +    .structural_child_index = self->structural_child_index, +  }; +  *visible = ts_subtree_visible(*child); +  bool extra = ts_subtree_extra(*child); +  if (!extra && self->alias_sequence) { +    *visible |= self->alias_sequence[self->structural_child_index]; +    self->structural_child_index++; +  } + +  self->position = length_add(self->position, ts_subtree_size(*child)); +  self->child_index++; + +  if (self->child_index < self->parent.ptr->child_count) { +    Subtree next_child = self->parent.ptr->children[self->child_index]; +    self->position = length_add(self->position, ts_subtree_padding(next_child)); +  } + +  return true; +} + +// TSTreeCursor - lifecycle + +TSTreeCursor ts_tree_cursor_new(TSNode node) { +  TSTreeCursor self = {NULL, NULL, {0, 0}}; +  ts_tree_cursor_init((TreeCursor *)&self, node); +  return self; +} + +void ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) { +  ts_tree_cursor_init((TreeCursor *)_self, node); +} + +void ts_tree_cursor_init(TreeCursor *self, TSNode node) { +  self->tree = node.tree; +  array_clear(&self->stack); +  array_push(&self->stack, ((TreeCursorEntry) { +    .subtree = (const Subtree *)node.id, +    .position = { +      ts_node_start_byte(node), +      
ts_node_start_point(node) +    }, +    .child_index = 0, +    .structural_child_index = 0, +  })); +} + +void ts_tree_cursor_delete(TSTreeCursor *_self) { +  TreeCursor *self = (TreeCursor *)_self; +  array_delete(&self->stack); +} + +// TSTreeCursor - walking the tree + +bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) { +  TreeCursor *self = (TreeCursor *)_self; + +  bool did_descend; +  do { +    did_descend = false; + +    bool visible; +    TreeCursorEntry entry; +    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); +    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { +      if (visible) { +        array_push(&self->stack, entry); +        return true; +      } + +      if (ts_subtree_visible_child_count(*entry.subtree) > 0) { +        array_push(&self->stack, entry); +        did_descend = true; +        break; +      } +    } +  } while (did_descend); + +  return false; +} + +int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) { +  TreeCursor *self = (TreeCursor *)_self; +  uint32_t initial_size = self->stack.size; +  uint32_t visible_child_index = 0; + +  bool did_descend; +  do { +    did_descend = false; + +    bool visible; +    TreeCursorEntry entry; +    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); +    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { +      uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes; +      bool at_goal = end_byte > goal_byte; +      uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); + +      if (at_goal) { +        if (visible) { +          array_push(&self->stack, entry); +          return visible_child_index; +        } + +        if (visible_child_count > 0) { +          array_push(&self->stack, entry); +          did_descend = true; +          break; +        } +      } else if (visible) { +        visible_child_index++; +  
    } else { +        visible_child_index += visible_child_count; +      } +    } +  } while (did_descend); + +  if (self->stack.size > initial_size && +      ts_tree_cursor_goto_next_sibling((TSTreeCursor *)self)) { +    return visible_child_index; +  } + +  self->stack.size = initial_size; +  return -1; +} + +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { +  TreeCursor *self = (TreeCursor *)_self; +  uint32_t initial_size = self->stack.size; + +  while (self->stack.size > 1) { +    TreeCursorEntry entry = array_pop(&self->stack); +    CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); +    iterator.child_index = entry.child_index; +    iterator.structural_child_index = entry.structural_child_index; +    iterator.position = entry.position; + +    bool visible = false; +    ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible); +    if (visible && self->stack.size + 1 < initial_size) break; + +    while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { +      if (visible) { +        array_push(&self->stack, entry); +        return true; +      } + +      if (ts_subtree_visible_child_count(*entry.subtree)) { +        array_push(&self->stack, entry); +        ts_tree_cursor_goto_first_child(_self); +        return true; +      } +    } +  } + +  self->stack.size = initial_size; +  return false; +} + +bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { +  TreeCursor *self = (TreeCursor *)_self; +  for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) { +    TreeCursorEntry *entry = &self->stack.contents[i]; +    bool is_aliased = false; +    if (i > 0) { +      TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; +      const TSSymbol *alias_sequence = ts_language_alias_sequence( +        self->tree->language, +        parent_entry->subtree->ptr->production_id +      ); +      is_aliased = alias_sequence && alias_sequence[entry->structural_child_index]; +    } +    if 
(ts_subtree_visible(*entry->subtree) || is_aliased) { +      self->stack.size = i + 1; +      return true; +    } +  } +  return false; +} + +TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { +  const TreeCursor *self = (const TreeCursor *)_self; +  TreeCursorEntry *last_entry = array_back(&self->stack); +  TSSymbol alias_symbol = 0; +  if (self->stack.size > 1) { +    TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2]; +    const TSSymbol *alias_sequence = ts_language_alias_sequence( +      self->tree->language, +      parent_entry->subtree->ptr->production_id +    ); +    if (alias_sequence && !ts_subtree_extra(*last_entry->subtree)) { +      alias_symbol = alias_sequence[last_entry->structural_child_index]; +    } +  } +  return ts_node_new( +    self->tree, +    last_entry->subtree, +    last_entry->position, +    alias_symbol +  ); +} + +TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { +  const TreeCursor *self = (const TreeCursor *)_self; + +  // Walk up the tree, visiting the current node and its invisible ancestors. +  for (unsigned i = self->stack.size - 1; i > 0; i--) { +    TreeCursorEntry *entry = &self->stack.contents[i]; +    TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + +    // Stop walking up when another visible node is found. 
+    if (i != self->stack.size - 1) { +      if (ts_subtree_visible(*entry->subtree)) break; +      const TSSymbol *alias_sequence = ts_language_alias_sequence( +        self->tree->language, +        parent_entry->subtree->ptr->production_id +      ); +      if (alias_sequence && alias_sequence[entry->structural_child_index]) { +        break; +      } +    } + +    const TSFieldMapEntry *field_map, *field_map_end; +    ts_language_field_map( +      self->tree->language, +      parent_entry->subtree->ptr->production_id, +      &field_map, &field_map_end +    ); + +    while (field_map < field_map_end) { +      if ( +        !field_map->inherited && +        field_map->child_index == entry->structural_child_index +      ) return field_map->field_id; +      field_map++; +    } +  } +  return 0; +} + +const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) { +  TSFieldId id = ts_tree_cursor_current_field_id(_self); +  if (id) { +    const TreeCursor *self = (const TreeCursor *)_self; +    return self->tree->language->field_names[id]; +  } else { +    return NULL; +  } +} + +TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) { +  const TreeCursor *cursor = (const TreeCursor *)_cursor; +  TSTreeCursor res = {NULL, NULL, {0, 0}}; +  TreeCursor *copy = (TreeCursor *)&res; +  copy->tree = cursor->tree; +  array_push_all(&copy->stack, &cursor->stack);  // append every entry of the source cursor's stack to the new cursor +  return res; +} diff --git a/src/tree_sitter/tree_cursor.h b/src/tree_sitter/tree_cursor.h new file mode 100644 index 0000000000..55bdad86da --- /dev/null +++ b/src/tree_sitter/tree_cursor.h @@ -0,0 +1,20 @@ +#ifndef TREE_SITTER_TREE_CURSOR_H_ +#define TREE_SITTER_TREE_CURSOR_H_ + +#include "./subtree.h" + +typedef struct { +  const Subtree *subtree; +  Length position; +  uint32_t child_index; +  uint32_t structural_child_index; +} TreeCursorEntry; + +typedef struct { +  const TSTree *tree; +  Array(TreeCursorEntry) stack; +} TreeCursor; + +void ts_tree_cursor_init(TreeCursor *, TSNode); + +#endif  
// TREE_SITTER_TREE_CURSOR_H_ diff --git a/src/tree_sitter/utf16.c b/src/tree_sitter/utf16.c new file mode 100644 index 0000000000..3956c01cb9 --- /dev/null +++ b/src/tree_sitter/utf16.c @@ -0,0 +1,33 @@ +#include "./utf16.h" + +utf8proc_ssize_t utf16_iterate( +  const utf8proc_uint8_t *string, +  utf8proc_ssize_t length, +  utf8proc_int32_t *code_point +) { +  if (length < 2) { +    *code_point = -1; +    return 0; +  } + +  uint16_t *units = (uint16_t *)string; +  uint16_t unit = units[0]; + +  if (unit < 0xd800 || unit >= 0xe000) { +    *code_point = unit; +    return 2; +  } + +  if (unit < 0xdc00) { +    if (length >= 4) { +      uint16_t next_unit = units[1]; +      if (next_unit >= 0xdc00 && next_unit < 0xe000) { +        *code_point = 0x10000 + ((unit - 0xd800) << 10) + (next_unit - 0xdc00); +        return 4; +      } +    } +  } + +  *code_point = -1; +  return 2; +} diff --git a/src/tree_sitter/utf16.h b/src/tree_sitter/utf16.h new file mode 100644 index 0000000000..32fd05e6db --- /dev/null +++ b/src/tree_sitter/utf16.h @@ -0,0 +1,21 @@ +#ifndef TREE_SITTER_UTF16_H_ +#define TREE_SITTER_UTF16_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stdlib.h> +#include "utf8proc.h" + +// Analogous to utf8proc's utf8proc_iterate function. Reads one code point from +// the given UTF16 string and stores it in the location pointed to by `code_point`. +// Returns the number of bytes in `string` that were read. 
+utf8proc_ssize_t utf16_iterate(const utf8proc_uint8_t *, utf8proc_ssize_t, utf8proc_int32_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TREE_SITTER_UTF16_H_
diff --git a/test/functional/lua/treesitter_spec.lua b/test/functional/lua/treesitter_spec.lua
new file mode 100644
index 0000000000..700e4599f2
--- /dev/null
+++ b/test/functional/lua/treesitter_spec.lua
@@ -0,0 +1,149 @@
+-- Test suite for testing interactions with API bindings
+local helpers = require('test.functional.helpers')(after_each)
+
+local clear = helpers.clear
+local eq = helpers.eq
+local insert = helpers.insert
+local exec_lua = helpers.exec_lua
+local iswin = helpers.iswin
+local feed = helpers.feed
+local pcall_err = helpers.pcall_err
+local matches = helpers.matches
+
+before_each(clear)
+
+describe('treesitter API', function()
+  -- error tests not requiring a parser library
+  it('handles missing language', function()
+      local path_pat = 'Error executing lua: '..(iswin() and '.+\\vim\\' or '.+/vim/')
+
+    matches(path_pat..'treesitter.lua:39: no such language: borklang',
+       pcall_err(exec_lua, "parser = vim.treesitter.create_parser(0, 'borklang')"))
+
+    -- actual message depends on platform
+    matches('Error executing lua: Failed to load parser: uv_dlopen: .+',
+       pcall_err(exec_lua, "parser = vim.treesitter.add_language('borkbork.so', 'borklang')"))
+
+    eq('Error executing lua: [string "<nvim>"]:1: no such language: borklang',
+       pcall_err(exec_lua, "parser = vim.treesitter.inspect_language('borklang')"))
+  end)
+
+  local ts_path = os.getenv("TREE_SITTER_DIR")
+
+  describe('with C parser', function()
+    if ts_path == nil then
+      it("works", function() pending("TREE_SITTER_DIR not set, skipping treesitter parser tests") end)
+      return
+    end
+
+    before_each(function()
+      local path = ts_path .. '/bin/c'..(iswin() and '.dll' or '.so')
+      exec_lua([[
+        local path = ...
+        vim.treesitter.add_language(path,'c')
+      ]], path)
+    end)
+
+    it('parses buffer', function()
+      insert([[
+        int main() {
+          int x = 3;
+        }]])
+
+      exec_lua([[
+        parser = vim.treesitter.get_parser(0, "c")
+        tree = parser:parse()
+        root = tree:root()
+        lang = vim.treesitter.inspect_language('c')
+      ]])
+
+      eq("<tree>", exec_lua("return tostring(tree)"))
+      eq("<node translation_unit>", exec_lua("return tostring(root)"))
+      eq({0,0,3,0}, exec_lua("return {root:range()}"))
+
+      eq(1, exec_lua("return root:child_count()"))
+      exec_lua("child = root:child(0)")
+      eq("<node function_definition>", exec_lua("return tostring(child)"))
+      eq({0,0,2,1}, exec_lua("return {child:range()}"))
+
+      eq("function_definition", exec_lua("return child:type()"))
+      eq(true, exec_lua("return child:named()"))
+      eq("number", type(exec_lua("return child:symbol()")))
+      eq({'function_definition', true}, exec_lua("return lang.symbols[child:symbol()]"))
+
+      exec_lua("anon = root:descendant_for_range(0,8,0,9)")
+      eq("(", exec_lua("return anon:type()"))
+      eq(false, exec_lua("return anon:named()"))
+      eq("number", type(exec_lua("return anon:symbol()")))
+      eq({'(', false}, exec_lua("return lang.symbols[anon:symbol()]"))
+
+      exec_lua("descendant = root:descendant_for_range(1,2,1,12)")
+      eq("<node declaration>", exec_lua("return tostring(descendant)"))
+      eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
+      eq("(declaration type: (primitive_type) declarator: (init_declarator declarator: (identifier) value: (number_literal)))", exec_lua("return descendant:sexpr()"))
+
+      eq(true, exec_lua("return child == child"))
+      -- separate lua object, but represents same node
+      eq(true, exec_lua("return child == root:child(0)"))
+      eq(false, exec_lua("return child == descendant"))
+      eq(false, exec_lua("return child == nil"))
+      eq(false, exec_lua("return child == tree"))
+
+      feed("2G7|ay")
+      exec_lua([[
+        tree2 = parser:parse()
+        root2 = tree2:root()
+        descendant2 = root2:descendant_for_range(1,2,1,13)
+      ]])
+      eq(false, exec_lua("return tree2 == tree"))
+      eq(false, exec_lua("return root2 == root"))
+      eq("<node declaration>", exec_lua("return tostring(descendant2)"))
+      eq({1,2,1,13}, exec_lua("return {descendant2:range()}"))
+
+      -- original tree did not change
+      eq({1,2,1,12}, exec_lua("return {descendant:range()}"))
+
+      -- unchanged buffer: return the same tree
+      eq(true, exec_lua("return parser:parse() == tree2"))
+    end)
+
+    it('inspects language', function()
+        local keys, fields, symbols = unpack(exec_lua([[
+          local lang = vim.treesitter.inspect_language('c')
+          local keys, symbols = {}, {}
+          for k,_ in pairs(lang) do
+            keys[k] = true
+          end
+
+          -- symbols array can have "holes" and is thus not a valid msgpack array
+          -- but we don't care about the numbers here (checked in the parser test)
+          for _, v in pairs(lang.symbols) do
+            table.insert(symbols, v)
+          end
+          return {keys, lang.fields, symbols}
+        ]]))
+
+        eq({fields=true, symbols=true}, keys)
+
+        local fset = {}
+        for _,f in pairs(fields) do
+          eq("string", type(f))
+          fset[f] = true
+        end
+        eq(true, fset["directive"])
+        eq(true, fset["initializer"])
+
+        local has_named, has_anonymous
+        for _,s in pairs(symbols) do
+          eq("string", type(s[1]))
+          eq("boolean", type(s[2]))
+          if s[1] == "for_statement" and s[2] == true then
+            has_named = true
+          elseif s[1] == "|=" and s[2] == false then
+            has_anonymous = true
+          end
+        end
+        eq({true,true}, {has_named,has_anonymous})
+    end)
+  end)
+end)
diff --git a/third-party/CMakeLists.txt b/third-party/CMakeLists.txt
index c555151c35..83692ff587 100644
--- a/third-party/CMakeLists.txt
+++ b/third-party/CMakeLists.txt
@@ -35,6 +35,7 @@ option(USE_BUNDLED_LIBTERMKEY "Use the bundled libtermkey." ${USE_BUNDLED})
 option(USE_BUNDLED_LIBVTERM "Use the bundled libvterm." ${USE_BUNDLED})
 option(USE_BUNDLED_LIBUV "Use the bundled libuv." ${USE_BUNDLED})
 option(USE_BUNDLED_MSGPACK "Use the bundled msgpack." ${USE_BUNDLED})
+option(USE_BUNDLED_UTF8PROC "Use the bundled utf8proc." ${USE_BUNDLED})
 option(USE_BUNDLED_LUAJIT "Use the bundled version of luajit." ${USE_BUNDLED})
 option(USE_BUNDLED_LUAROCKS "Use the bundled version of luarocks." ${USE_BUNDLED})
 option(USE_BUNDLED_LUV "Use the bundled version of luv." ${USE_BUNDLED})
@@ -195,6 +196,9 @@ set(GETTEXT_SHA256 ff942af0e438ced4a8b0ea4b0b6e0d6d657157c5e2364de57baa279c1c125
 set(LIBICONV_URL https://ftp.gnu.org/pub/gnu/libiconv/libiconv-1.15.tar.gz)
 set(LIBICONV_SHA256 ccf536620a45458d26ba83887a983b96827001e92a13847b45e4925cc8913178)
 
+set(UTF8PROC_URL https://github.com/JuliaStrings/utf8proc/archive/v2.2.0.tar.gz)
+set(UTF8PROC_SHA256 3f8fd1dbdb057ee5ba584a539d5cd1b3952141c0338557cb0bdf8cb9cfed5dbf)
+
 if(USE_BUNDLED_UNIBILIUM)
   include(BuildUnibilium)
 endif()
@@ -246,6 +250,10 @@ if(USE_BUNDLED_LIBICONV)
   include(BuildLibiconv)
 endif()
 
+if(USE_BUNDLED_UTF8PROC)
+  include(BuildUtf8proc)
+endif()
+
 if(WIN32)
   include(GetBinaryDeps)
 
diff --git a/third-party/cmake/BuildUtf8proc.cmake b/third-party/cmake/BuildUtf8proc.cmake
new file mode 100644
index 0000000000..7297913f87
--- /dev/null
+++ b/third-party/cmake/BuildUtf8proc.cmake
@@ -0,0 +1,68 @@
+include(CMakeParseArguments)
+
+# BuildUtf8proc(CONFIGURE_COMMAND ... BUILD_COMMAND ... INSTALL_COMMAND ...)
+# Reusable function to build utf8proc, wraps ExternalProject_Add.
+# Omitting a command argument means no command is run for that step; omitting all three is a fatal error.
+function(BuildUtf8proc)
+  cmake_parse_arguments(_utf8proc
+    ""
+    ""
+    "CONFIGURE_COMMAND;BUILD_COMMAND;INSTALL_COMMAND"
+    ${ARGN})
+
+  if(NOT _utf8proc_CONFIGURE_COMMAND AND NOT _utf8proc_BUILD_COMMAND
+       AND NOT _utf8proc_INSTALL_COMMAND)
+    message(FATAL_ERROR "Must pass at least one of CONFIGURE_COMMAND, BUILD_COMMAND, INSTALL_COMMAND")
+  endif()
+
+  ExternalProject_Add(utf8proc
+    PREFIX ${DEPS_BUILD_DIR}
+    URL ${UTF8PROC_URL}
+    DOWNLOAD_DIR ${DEPS_DOWNLOAD_DIR}/utf8proc
+    DOWNLOAD_COMMAND ${CMAKE_COMMAND}
+      -DPREFIX=${DEPS_BUILD_DIR}
+      -DDOWNLOAD_DIR=${DEPS_DOWNLOAD_DIR}/utf8proc
+      -DURL=${UTF8PROC_URL}
+      -DEXPECTED_SHA256=${UTF8PROC_SHA256}
+      -DTARGET=utf8proc
+      -DUSE_EXISTING_SRC_DIR=${USE_EXISTING_SRC_DIR}
+      -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/DownloadAndExtractFile.cmake
+    CONFIGURE_COMMAND "${_utf8proc_CONFIGURE_COMMAND}"
+    BUILD_COMMAND "${_utf8proc_BUILD_COMMAND}"
+    INSTALL_COMMAND "${_utf8proc_INSTALL_COMMAND}")
+endfunction()
+
+set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
+  -DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
+  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+  "-DCMAKE_C_FLAGS:STRING=${CMAKE_C_COMPILER_ARG1} -fPIC"
+  -DCMAKE_GENERATOR=${CMAKE_GENERATOR})
+
+set(UTF8PROC_BUILD_COMMAND ${CMAKE_COMMAND} --build . --config ${CMAKE_BUILD_TYPE})
+set(UTF8PROC_INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install --config ${CMAKE_BUILD_TYPE})
+
+if(MINGW AND CMAKE_CROSSCOMPILING)
+  get_filename_component(TOOLCHAIN ${CMAKE_TOOLCHAIN_FILE} REALPATH)
+  set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
+    -DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
+    # Pass toolchain
+    -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN}
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    # Hack to avoid -rdynamic in Mingw
+    -DCMAKE_SHARED_LIBRARY_LINK_C_FLAGS="")
+elseif(MSVC)
+  # Same as the default configure command above, but without -fPIC
+  set(UTF8PROC_CONFIGURE_COMMAND ${CMAKE_COMMAND} ${DEPS_BUILD_DIR}/src/utf8proc
+    -DCMAKE_INSTALL_PREFIX=${DEPS_INSTALL_DIR}
+    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    "-DCMAKE_C_FLAGS:STRING=${CMAKE_C_COMPILER_ARG1}"
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    # Make sure we use the same generator, otherwise we may
+    # accidentally end up using different MSVC runtimes
+    -DCMAKE_GENERATOR=${CMAKE_GENERATOR})
+endif()
+
+BuildUtf8proc(CONFIGURE_COMMAND ${UTF8PROC_CONFIGURE_COMMAND}
+  BUILD_COMMAND ${UTF8PROC_BUILD_COMMAND}
+  INSTALL_COMMAND ${UTF8PROC_INSTALL_COMMAND})
|
