diff options
author | Thomas Vigouroux <tomvig38@gmail.com> | 2020-09-17 11:25:22 +0200 |
---|---|---|
committer | Thomas Vigouroux <tomvig38@gmail.com> | 2020-11-03 10:39:35 +0100 |
commit | 3eb241d8310f453ce5e5485f12796a0ae43a0a18 (patch) | |
tree | fa214d8724a4c3d3646b1c1c5e12137757464d6f | |
parent | a061d53e18168130aad537a9e8012390834ff8c2 (diff) | |
download | rneovim-3eb241d8310f453ce5e5485f12796a0ae43a0a18.tar.gz rneovim-3eb241d8310f453ce5e5485f12796a0ae43a0a18.tar.bz2 rneovim-3eb241d8310f453ce5e5485f12796a0ae43a0a18.zip |
bundle: move tree-sitter as a bundled dep
fixup! bundle: move tree-sitter as a bundled dep
fixup! bundle: move tree-sitter as a bundled dep
50 files changed, 48 insertions, 13281 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 87dff54d06..e290500175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -374,6 +374,9 @@ include_directories(SYSTEM ${MSGPACK_INCLUDE_DIRS}) find_package(LibLUV 1.30.0 REQUIRED) include_directories(SYSTEM ${LIBLUV_INCLUDE_DIRS}) +find_package(Treesitter REQUIRED) +include_directories(SYSTEM ${TREESITTER_INCLUDE_DIRS}) + # Note: The test lib requires LuaJIT; it will be skipped if LuaJIT is missing. option(PREFER_LUA "Prefer Lua over LuaJIT in the nvim executable." OFF) diff --git a/cmake/FindTreesitter.cmake b/cmake/FindTreesitter.cmake new file mode 100644 index 0000000000..8ba3b72d28 --- /dev/null +++ b/cmake/FindTreesitter.cmake @@ -0,0 +1,11 @@ +# - Try to find tree-sitter +# Once done, this will define +# +# TREESITTER_FOUND - system has tree-sitter +# TREESITTER_INCLUDE_DIRS - the tree-sitter include directories +# TREESITTER_LIBRARIES - link these to use tree-sitter + +include(LibFindMacros) + +libfind_pkg_detect(TREESITTER tree-sitter FIND_PATH tree_sitter/api.h FIND_LIBRARY tree-sitter) +libfind_process(TREESITTER) diff --git a/codecov.yml b/codecov.yml index 0f867db668..a83fd916ee 100644 --- a/codecov.yml +++ b/codecov.yml @@ -25,6 +25,3 @@ coverage: changes: no comment: off - -ignore: - - "src/tree_sitter" diff --git a/scripts/update-ts-runtime.sh b/scripts/update-ts-runtime.sh deleted file mode 100755 index 1a947e0ac9..0000000000 --- a/scripts/update-ts-runtime.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/sh -# -# This script will update the treesitter runtime to the provided commit. -# Usage : -# $0 <tree-sitter commit sha> -set -e - -ts_source_dir="/tmp/tree-sitter" -ts_url="https://github.com/tree-sitter/tree-sitter.git" - -base_dir="$(cd "$(dirname $(dirname $0))" && pwd)" -ts_dest_dir="$base_dir/src/tree_sitter/" -ts_current_commit="$ts_dest_dir/treesitter_commit_hash.txt" - -echo "Updating treesitter runtime from $(cat "$ts_current_commit") to $1..." - -if [ ! 
-d "$ts_source_dir" ]; then - echo "Cloning treesitter..." - git clone "$ts_url" "$ts_source_dir" -else - echo "Found a non-empty $ts_source_dir directory..." - git -C "$ts_source_dir" fetch -fi - -echo "Checking out $1..." -git -C "$ts_source_dir" checkout $1 - -echo "Removing old files..." -find "$ts_dest_dir" -not -name "LICENSE" -not -name "README.md" -not -type d -delete - -echo "Copying files..." -cp -t "$ts_dest_dir" -r "$ts_source_dir/lib/src"/* -cp -t "$ts_dest_dir" "$ts_source_dir/lib/include/tree_sitter"/* - -echo "$1" > "$ts_current_commit" - -make -TEST_FILE="$base_dir/test/functional/lua/treesitter_spec.lua" make test - diff --git a/src/nvim/CMakeLists.txt b/src/nvim/CMakeLists.txt index 2d98f1a659..46f70f850c 100644 --- a/src/nvim/CMakeLists.txt +++ b/src/nvim/CMakeLists.txt @@ -87,10 +87,6 @@ file(GLOB NVIM_HEADERS *.h) file(GLOB XDIFF_SOURCES xdiff/*.c) file(GLOB XDIFF_HEADERS xdiff/*.h) -file(GLOB TREESITTER_SOURCES ../tree_sitter/*.c) -file(GLOB TS_SOURCE_AMALGAM ../tree_sitter/lib.c) -list(REMOVE_ITEM TREESITTER_SOURCES ${TS_SOURCE_AMALGAM}) - foreach(subdir os api @@ -187,13 +183,6 @@ if(NOT MSVC) set_source_files_properties( eval/funcs.c PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -Wno-conversion") endif() - - # tree-sitter: inlined external project, we don't maintain it. 
#10124 - set(TS_FLAGS "-Wno-conversion -Wno-pedantic -Wno-shadow -Wno-missing-prototypes -Wno-unused-variable") - if(HAVE_WIMPLICIT_FALLTHROUGH_FLAG) - set(TS_FLAGS "${TS_FLAGS} -Wno-implicit-fallthrough") - endif() - set_source_files_properties(${TREESITTER_SOURCES} PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} ${TS_FLAGS}") endif() if(NOT "${MIN_LOG_LEVEL}" MATCHES "^$") @@ -453,6 +442,7 @@ list(APPEND NVIM_LINK_LIBRARIES ${LIBTERMKEY_LIBRARIES} ${UNIBILIUM_LIBRARIES} ${UTF8PROC_LIBRARIES} + ${TREESITTER_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ) @@ -472,7 +462,7 @@ endif() add_executable(nvim ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS} ${NVIM_GENERATED_SOURCES} ${NVIM_SOURCES} ${NVIM_HEADERS} - ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES}) + ${XDIFF_SOURCES} ${XDIFF_HEADERS}) target_link_libraries(nvim ${NVIM_EXEC_LINK_LIBRARIES}) install_helper(TARGETS nvim) @@ -570,7 +560,7 @@ add_library( EXCLUDE_FROM_ALL ${NVIM_SOURCES} ${NVIM_GENERATED_SOURCES} ${NVIM_HEADERS} ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS} - ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES} + ${XDIFF_SOURCES} ${XDIFF_HEADERS} ) set_property(TARGET libnvim APPEND PROPERTY INCLUDE_DIRECTORIES ${LUA_PREFERRED_INCLUDE_DIRS}) @@ -600,7 +590,7 @@ else() EXCLUDE_FROM_ALL ${NVIM_SOURCES} ${NVIM_GENERATED_SOURCES} ${NVIM_HEADERS} ${NVIM_GENERATED_FOR_SOURCES} ${NVIM_GENERATED_FOR_HEADERS} - ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${TREESITTER_SOURCES} + ${XDIFF_SOURCES} ${XDIFF_HEADERS} ${UNIT_TEST_FIXTURES} ) target_link_libraries(nvim-test ${NVIM_TEST_LINK_LIBRARIES}) diff --git a/src/tree_sitter/LICENSE b/src/tree_sitter/LICENSE deleted file mode 100644 index 971b81f9a8..0000000000 --- a/src/tree_sitter/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2018 Max Brunsfeld - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the 
Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/src/tree_sitter/README.md b/src/tree_sitter/README.md deleted file mode 100644 index 20cb35e7c3..0000000000 --- a/src/tree_sitter/README.md +++ /dev/null @@ -1,16 +0,0 @@ -Tree-sitter vendor runtime -========================== - -This is the vendor runtime code for treesitter. - -The original code can be found [here](https://github.com/tree-sitter/tree-sitter). - -As this code is not ours, if you find any bugs, feel free to open an issue, so that we can -investigate and determine if this should go upstream. 
- -# Updating - -To update the treesitter runtime, use the `update-ts-runtime.sh` script in the `scripts` directory: -```sh -./scripts/update-ts-runtime.sh <commit you want to update to> -``` diff --git a/src/tree_sitter/alloc.h b/src/tree_sitter/alloc.h deleted file mode 100644 index 32c90f23c8..0000000000 --- a/src/tree_sitter/alloc.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef TREE_SITTER_ALLOC_H_ -#define TREE_SITTER_ALLOC_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdlib.h> -#include <stdbool.h> -#include <stdio.h> - -#include "nvim/memory.h" - -#if 1 - -static inline bool ts_toggle_allocation_recording(bool value) { - return false; -} - -#define ts_malloc xmalloc -#define ts_calloc xcalloc -#define ts_realloc xrealloc -#define ts_free xfree - -#elif defined(TREE_SITTER_TEST) - -void *ts_record_malloc(size_t); -void *ts_record_calloc(size_t, size_t); -void *ts_record_realloc(void *, size_t); -void ts_record_free(void *); -bool ts_toggle_allocation_recording(bool); - -static inline void *ts_malloc(size_t size) { - return ts_record_malloc(size); -} - -static inline void *ts_calloc(size_t count, size_t size) { - return ts_record_calloc(count, size); -} - -static inline void *ts_realloc(void *buffer, size_t size) { - return ts_record_realloc(buffer, size); -} - -static inline void ts_free(void *buffer) { - ts_record_free(buffer); -} - -#else - -#include <stdlib.h> - -static inline bool ts_toggle_allocation_recording(bool value) { - (void)value; - return false; -} - -static inline void *ts_malloc(size_t size) { - void *result = malloc(size); - if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %zu bytes", size); - exit(1); - } - return result; -} - -static inline void *ts_calloc(size_t count, size_t size) { - void *result = calloc(count, size); - if (count > 0 && !result) { - fprintf(stderr, "tree-sitter failed to allocate %zu bytes", count * size); - exit(1); - } - return result; -} - -static inline void *ts_realloc(void 
*buffer, size_t size) { - void *result = realloc(buffer, size); - if (size > 0 && !result) { - fprintf(stderr, "tree-sitter failed to reallocate %zu bytes", size); - exit(1); - } - return result; -} - -static inline void ts_free(void *buffer) { - free(buffer); -} - -#endif - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_ALLOC_H_ diff --git a/src/tree_sitter/api.h b/src/tree_sitter/api.h deleted file mode 100644 index 9d832e6ec4..0000000000 --- a/src/tree_sitter/api.h +++ /dev/null @@ -1,876 +0,0 @@ -#ifndef TREE_SITTER_API_H_ -#define TREE_SITTER_API_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <stdbool.h> - -/****************************/ -/* Section - ABI Versioning */ -/****************************/ - -/** - * The latest ABI version that is supported by the current version of the - * library. When Languages are generated by the Tree-sitter CLI, they are - * assigned an ABI version number that corresponds to the current CLI version. - * The Tree-sitter library is generally backwards-compatible with languages - * generated using older CLI versions, but is not forwards-compatible. - */ -#define TREE_SITTER_LANGUAGE_VERSION 11 - -/** - * The earliest ABI version that is supported by the current version of the - * library. 
- */ -#define TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION 9 - -/*******************/ -/* Section - Types */ -/*******************/ - -typedef uint16_t TSSymbol; -typedef uint16_t TSFieldId; -typedef struct TSLanguage TSLanguage; -typedef struct TSParser TSParser; -typedef struct TSTree TSTree; -typedef struct TSQuery TSQuery; -typedef struct TSQueryCursor TSQueryCursor; - -typedef enum { - TSInputEncodingUTF8, - TSInputEncodingUTF16, -} TSInputEncoding; - -typedef enum { - TSSymbolTypeRegular, - TSSymbolTypeAnonymous, - TSSymbolTypeAuxiliary, -} TSSymbolType; - -typedef struct { - uint32_t row; - uint32_t column; -} TSPoint; - -typedef struct { - TSPoint start_point; - TSPoint end_point; - uint32_t start_byte; - uint32_t end_byte; -} TSRange; - -typedef struct { - void *payload; - const char *(*read)(void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read); - TSInputEncoding encoding; -} TSInput; - -typedef enum { - TSLogTypeParse, - TSLogTypeLex, -} TSLogType; - -typedef struct { - void *payload; - void (*log)(void *payload, TSLogType, const char *); -} TSLogger; - -typedef struct { - uint32_t start_byte; - uint32_t old_end_byte; - uint32_t new_end_byte; - TSPoint start_point; - TSPoint old_end_point; - TSPoint new_end_point; -} TSInputEdit; - -typedef struct { - uint32_t context[4]; - const void *id; - const TSTree *tree; -} TSNode; - -typedef struct { - const void *tree; - const void *id; - uint32_t context[2]; -} TSTreeCursor; - -typedef struct { - TSNode node; - uint32_t index; -} TSQueryCapture; - -typedef struct { - uint32_t id; - uint16_t pattern_index; - uint16_t capture_count; - const TSQueryCapture *captures; -} TSQueryMatch; - -typedef enum { - TSQueryPredicateStepTypeDone, - TSQueryPredicateStepTypeCapture, - TSQueryPredicateStepTypeString, -} TSQueryPredicateStepType; - -typedef struct { - TSQueryPredicateStepType type; - uint32_t value_id; -} TSQueryPredicateStep; - -typedef enum { - TSQueryErrorNone = 0, - TSQueryErrorSyntax, - 
TSQueryErrorNodeType, - TSQueryErrorField, - TSQueryErrorCapture, -} TSQueryError; - -/********************/ -/* Section - Parser */ -/********************/ - -/** - * Create a new parser. - */ -TSParser *ts_parser_new(void); - -/** - * Delete the parser, freeing all of the memory that it used. - */ -void ts_parser_delete(TSParser *parser); - -/** - * Set the language that the parser should use for parsing. - * - * Returns a boolean indicating whether or not the language was successfully - * assigned. True means assignment succeeded. False means there was a version - * mismatch: the language was generated with an incompatible version of the - * Tree-sitter CLI. Check the language's version using `ts_language_version` - * and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION` and - * `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants. - */ -bool ts_parser_set_language(TSParser *self, const TSLanguage *language); - -/** - * Get the parser's current language. - */ -const TSLanguage *ts_parser_language(const TSParser *self); - -/** - * Set the ranges of text that the parser should include when parsing. - * - * By default, the parser will always include entire documents. This function - * allows you to parse only a *portion* of a document but still return a syntax - * tree whose ranges match up with the document as a whole. You can also pass - * multiple disjoint ranges. - * - * The second and third parameters specify the location and length of an array - * of ranges. The parser does *not* take ownership of these ranges; it copies - * the data, so it doesn't matter how these ranges are allocated. - * - * If `length` is zero, then the entire document will be parsed. Otherwise, - * the given ranges must be ordered from earliest to latest in the document, - * and they must not overlap. 
That is, the following must hold for all - * `i` < `length - 1`: - * - * ranges[i].end_byte <= ranges[i + 1].start_byte - * - * If this requirement is not satisfied, the operation will fail, the ranges - * will not be assigned, and this function will return `false`. On success, - * this function returns `true` - */ -bool ts_parser_set_included_ranges( - TSParser *self, - const TSRange *ranges, - uint32_t length -); - -/** - * Get the ranges of text that the parser will include when parsing. - * - * The returned pointer is owned by the parser. The caller should not free it - * or write to it. The length of the array will be written to the given - * `length` pointer. - */ -const TSRange *ts_parser_included_ranges( - const TSParser *self, - uint32_t *length -); - -/** - * Use the parser to parse some source code and create a syntax tree. - * - * If you are parsing this document for the first time, pass `NULL` for the - * `old_tree` parameter. Otherwise, if you have already parsed an earlier - * version of this document and the document has since been edited, pass the - * previous syntax tree so that the unchanged parts of it can be reused. - * This will save time and memory. For this to work correctly, you must have - * already edited the old syntax tree using the `ts_tree_edit` function in a - * way that exactly matches the source code changes. - * - * The `TSInput` parameter lets you specify how to read the text. It has the - * following three fields: - * 1. `read`: A function to retrieve a chunk of text at a given byte offset - * and (row, column) position. The function should return a pointer to the - * text and write its length to the the `bytes_read` pointer. The parser - * does not take ownership of this buffer; it just borrows it until it has - * finished reading it. The function should write a zero value to the - * `bytes_read` pointer to indicate the end of the document. - * 2. 
`payload`: An arbitrary pointer that will be passed to each invocation - * of the `read` function. - * 3. `encoding`: An indication of how the text is encoded. Either - * `TSInputEncodingUTF8` or `TSInputEncodingUTF16`. - * - * This function returns a syntax tree on success, and `NULL` on failure. There - * are three possible reasons for failure: - * 1. The parser does not have a language assigned. Check for this using the - `ts_parser_language` function. - * 2. Parsing was cancelled due to a timeout that was set by an earlier call to - * the `ts_parser_set_timeout_micros` function. You can resume parsing from - * where the parser left out by calling `ts_parser_parse` again with the - * same arguments. Or you can start parsing from scratch by first calling - * `ts_parser_reset`. - * 3. Parsing was cancelled using a cancellation flag that was set by an - * earlier call to `ts_parser_set_cancellation_flag`. You can resume parsing - * from where the parser left out by calling `ts_parser_parse` again with - * the same arguments. - */ -TSTree *ts_parser_parse( - TSParser *self, - const TSTree *old_tree, - TSInput input -); - -/** - * Use the parser to parse some source code stored in one contiguous buffer. - * The first two parameters are the same as in the `ts_parser_parse` function - * above. The second two parameters indicate the location of the buffer and its - * length in bytes. - */ -TSTree *ts_parser_parse_string( - TSParser *self, - const TSTree *old_tree, - const char *string, - uint32_t length -); - -/** - * Use the parser to parse some source code stored in one contiguous buffer with - * a given encoding. The first four parameters work the same as in the - * `ts_parser_parse_string` method above. The final parameter indicates whether - * the text is encoded as UTF8 or UTF16. 
- */ -TSTree *ts_parser_parse_string_encoding( - TSParser *self, - const TSTree *old_tree, - const char *string, - uint32_t length, - TSInputEncoding encoding -); - -/** - * Instruct the parser to start the next parse from the beginning. - * - * If the parser previously failed because of a timeout or a cancellation, then - * by default, it will resume where it left off on the next call to - * `ts_parser_parse` or other parsing functions. If you don't want to resume, - * and instead intend to use this parser to parse some other document, you must - * call `ts_parser_reset` first. - */ -void ts_parser_reset(TSParser *self); - -/** - * Set the maximum duration in microseconds that parsing should be allowed to - * take before halting. - * - * If parsing takes longer than this, it will halt early, returning NULL. - * See `ts_parser_parse` for more information. - */ -void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout); - -/** - * Get the duration in microseconds that parsing is allowed to take. - */ -uint64_t ts_parser_timeout_micros(const TSParser *self); - -/** - * Set the parser's current cancellation flag pointer. - * - * If a non-null pointer is assigned, then the parser will periodically read - * from this pointer during parsing. If it reads a non-zero value, it will - * halt early, returning NULL. See `ts_parser_parse` for more information. - */ -void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag); - -/** - * Get the parser's current cancellation flag pointer. - */ -const size_t *ts_parser_cancellation_flag(const TSParser *self); - -/** - * Set the logger that a parser should use during parsing. - * - * The parser does not take ownership over the logger payload. If a logger was - * previously assigned, the caller is responsible for releasing any memory - * owned by the previous logger. - */ -void ts_parser_set_logger(TSParser *self, TSLogger logger); - -/** - * Get the parser's current logger. 
- */ -TSLogger ts_parser_logger(const TSParser *self); - -/** - * Set the file descriptor to which the parser should write debugging graphs - * during parsing. The graphs are formatted in the DOT language. You may want - * to pipe these graphs directly to a `dot(1)` process in order to generate - * SVG output. You can turn off this logging by passing a negative number. - */ -void ts_parser_print_dot_graphs(TSParser *self, int file); - -/******************/ -/* Section - Tree */ -/******************/ - -/** - * Create a shallow copy of the syntax tree. This is very fast. - * - * You need to copy a syntax tree in order to use it on more than one thread at - * a time, as syntax trees are not thread safe. - */ -TSTree *ts_tree_copy(const TSTree *self); - -/** - * Delete the syntax tree, freeing all of the memory that it used. - */ -void ts_tree_delete(TSTree *self); - -/** - * Get the root node of the syntax tree. - */ -TSNode ts_tree_root_node(const TSTree *self); - -/** - * Get the language that was used to parse the syntax tree. - */ -const TSLanguage *ts_tree_language(const TSTree *); - -/** - * Edit the syntax tree to keep it in sync with source code that has been - * edited. - * - * You must describe the edit both in terms of byte offsets and in terms of - * (row, column) coordinates. - */ -void ts_tree_edit(TSTree *self, const TSInputEdit *edit); - -/** - * Compare an old edited syntax tree to a new syntax tree representing the same - * document, returning an array of ranges whose syntactic structure has changed. - * - * For this to work correctly, the old syntax tree must have been edited such - * that its ranges match up to the new tree. Generally, you'll want to call - * this function right after calling one of the `ts_parser_parse` functions. - * You need to pass the old tree that was passed to parse, as well as the new - * tree that was returned from that function. 
- * - * The returned array is allocated using `malloc` and the caller is responsible - * for freeing it using `free`. The length of the array will be written to the - * given `length` pointer. - */ -TSRange *ts_tree_get_changed_ranges( - const TSTree *old_tree, - const TSTree *new_tree, - uint32_t *length -); - -/** - * Write a DOT graph describing the syntax tree to the given file. - */ -void ts_tree_print_dot_graph(const TSTree *, FILE *); - -/******************/ -/* Section - Node */ -/******************/ - -/** - * Get the node's type as a null-terminated string. - */ -const char *ts_node_type(TSNode); - -/** - * Get the node's type as a numerical id. - */ -TSSymbol ts_node_symbol(TSNode); - -/** - * Get the node's start byte. - */ -uint32_t ts_node_start_byte(TSNode); - -/** - * Get the node's start position in terms of rows and columns. - */ -TSPoint ts_node_start_point(TSNode); - -/** - * Get the node's end byte. - */ -uint32_t ts_node_end_byte(TSNode); - -/** - * Get the node's end position in terms of rows and columns. - */ -TSPoint ts_node_end_point(TSNode); - -/** - * Get an S-expression representing the node as a string. - * - * This string is allocated with `malloc` and the caller is responsible for - * freeing it using `free`. - */ -char *ts_node_string(TSNode); - -/** - * Check if the node is null. Functions like `ts_node_child` and - * `ts_node_next_sibling` will return a null node to indicate that no such node - * was found. - */ -bool ts_node_is_null(TSNode); - -/** - * Check if the node is *named*. Named nodes correspond to named rules in the - * grammar, whereas *anonymous* nodes correspond to string literals in the - * grammar. - */ -bool ts_node_is_named(TSNode); - -/** - * Check if the node is *missing*. Missing nodes are inserted by the parser in - * order to recover from certain kinds of syntax errors. - */ -bool ts_node_is_missing(TSNode); - -/** - * Check if the node is *extra*. 
Extra nodes represent things like comments, - * which are not required the grammar, but can appear anywhere. - */ -bool ts_node_is_extra(TSNode); - -/** - * Check if a syntax node has been edited. - */ -bool ts_node_has_changes(TSNode); - -/** - * Check if the node is a syntax error or contains any syntax errors. - */ -bool ts_node_has_error(TSNode); - -/** - * Get the node's immediate parent. - */ -TSNode ts_node_parent(TSNode); - -/** - * Get the node's child at the given index, where zero represents the first - * child. - */ -TSNode ts_node_child(TSNode, uint32_t); - -/** - * Get the node's number of children. - */ -uint32_t ts_node_child_count(TSNode); - -/** - * Get the node's *named* child at the given index. - * - * See also `ts_node_is_named`. - */ -TSNode ts_node_named_child(TSNode, uint32_t); - -/** - * Get the node's number of *named* children. - * - * See also `ts_node_is_named`. - */ -uint32_t ts_node_named_child_count(TSNode); - -/** - * Get the node's child with the given field name. - */ -TSNode ts_node_child_by_field_name( - TSNode self, - const char *field_name, - uint32_t field_name_length -); - -/** - * Get the node's child with the given numerical field id. - * - * You can convert a field name to an id using the - * `ts_language_field_id_for_name` function. - */ -TSNode ts_node_child_by_field_id(TSNode, TSFieldId); - -/** - * Get the node's next / previous sibling. - */ -TSNode ts_node_next_sibling(TSNode); -TSNode ts_node_prev_sibling(TSNode); - -/** - * Get the node's next / previous *named* sibling. - */ -TSNode ts_node_next_named_sibling(TSNode); -TSNode ts_node_prev_named_sibling(TSNode); - -/** - * Get the node's first child that extends beyond the given byte offset. - */ -TSNode ts_node_first_child_for_byte(TSNode, uint32_t); - -/** - * Get the node's first named child that extends beyond the given byte offset. 
- */ -TSNode ts_node_first_named_child_for_byte(TSNode, uint32_t); - -/** - * Get the smallest node within this node that spans the given range of bytes - * or (row, column) positions. - */ -TSNode ts_node_descendant_for_byte_range(TSNode, uint32_t, uint32_t); -TSNode ts_node_descendant_for_point_range(TSNode, TSPoint, TSPoint); - -/** - * Get the smallest named node within this node that spans the given range of - * bytes or (row, column) positions. - */ -TSNode ts_node_named_descendant_for_byte_range(TSNode, uint32_t, uint32_t); -TSNode ts_node_named_descendant_for_point_range(TSNode, TSPoint, TSPoint); - -/** - * Edit the node to keep it in-sync with source code that has been edited. - * - * This function is only rarely needed. When you edit a syntax tree with the - * `ts_tree_edit` function, all of the nodes that you retrieve from the tree - * afterward will already reflect the edit. You only need to use `ts_node_edit` - * when you have a `TSNode` instance that you want to keep and continue to use - * after an edit. - */ -void ts_node_edit(TSNode *, const TSInputEdit *); - -/** - * Check if two nodes are identical. - */ -bool ts_node_eq(TSNode, TSNode); - -/************************/ -/* Section - TreeCursor */ -/************************/ - -/** - * Create a new tree cursor starting from the given node. - * - * A tree cursor allows you to walk a syntax tree more efficiently than is - * possible using the `TSNode` functions. It is a mutable object that is always - * on a certain syntax node, and can be moved imperatively to different nodes. - */ -TSTreeCursor ts_tree_cursor_new(TSNode); - -/** - * Delete a tree cursor, freeing all of the memory that it used. - */ -void ts_tree_cursor_delete(TSTreeCursor *); - -/** - * Re-initialize a tree cursor to start at a different node. - */ -void ts_tree_cursor_reset(TSTreeCursor *, TSNode); - -/** - * Get the tree cursor's current node. 
- */ -TSNode ts_tree_cursor_current_node(const TSTreeCursor *); - -/** - * Get the field name of the tree cursor's current node. - * - * This returns `NULL` if the current node doesn't have a field. - * See also `ts_node_child_by_field_name`. - */ -const char *ts_tree_cursor_current_field_name(const TSTreeCursor *); - -/** - * Get the field name of the tree cursor's current node. - * - * This returns zero if the current node doesn't have a field. - * See also `ts_node_child_by_field_id`, `ts_language_field_id_for_name`. - */ -TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *); - -/** - * Move the cursor to the parent of its current node. - * - * This returns `true` if the cursor successfully moved, and returns `false` - * if there was no parent node (the cursor was already on the root node). - */ -bool ts_tree_cursor_goto_parent(TSTreeCursor *); - -/** - * Move the cursor to the next sibling of its current node. - * - * This returns `true` if the cursor successfully moved, and returns `false` - * if there was no next sibling node. - */ -bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *); - -/** - * Move the cursor to the first child of its current node. - * - * This returns `true` if the cursor successfully moved, and returns `false` - * if there were no children. - */ -bool ts_tree_cursor_goto_first_child(TSTreeCursor *); - -/** - * Move the cursor to the first child of its current node that extends beyond - * the given byte offset. - * - * This returns the index of the child node if one was found, and returns -1 - * if no such child was found. - */ -int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *, uint32_t); - -TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *); - -/*******************/ -/* Section - Query */ -/*******************/ - -/** - * Create a new query from a string containing one or more S-expression - * patterns. 
The query is associated with a particular language, and can - * only be run on syntax nodes parsed with that language. - * - * If all of the given patterns are valid, this returns a `TSQuery`. - * If a pattern is invalid, this returns `NULL`, and provides two pieces - * of information about the problem: - * 1. The byte offset of the error is written to the `error_offset` parameter. - * 2. The type of error is written to the `error_type` parameter. - */ -TSQuery *ts_query_new( - const TSLanguage *language, - const char *source, - uint32_t source_len, - uint32_t *error_offset, - TSQueryError *error_type -); - -/** - * Delete a query, freeing all of the memory that it used. - */ -void ts_query_delete(TSQuery *); - -/** - * Get the number of patterns, captures, or string literals in the query. - */ -uint32_t ts_query_pattern_count(const TSQuery *); -uint32_t ts_query_capture_count(const TSQuery *); -uint32_t ts_query_string_count(const TSQuery *); - -/** - * Get the byte offset where the given pattern starts in the query's source. - * - * This can be useful when combining queries by concatenating their source - * code strings. - */ -uint32_t ts_query_start_byte_for_pattern(const TSQuery *, uint32_t); - -/** - * Get all of the predicates for the given pattern in the query. - * - * The predicates are represented as a single array of steps. There are three - * types of steps in this array, which correspond to the three legal values for - * the `type` field: - * - `TSQueryPredicateStepTypeCapture` - Steps with this type represent names - * of captures. Their `value_id` can be used with the - * `ts_query_capture_name_for_id` function to obtain the name of the capture. - * - `TSQueryPredicateStepTypeString` - Steps with this type represent literal - * strings. Their `value_id` can be used with the - * `ts_query_string_value_for_id` function to obtain their string value. 
- * - `TSQueryPredicateStepTypeDone` - Steps with this type are *sentinels* - * that represent the end of an individual predicate. If a pattern has two - * predicates, then there will be two steps with this `type` in the array. - */ -const TSQueryPredicateStep *ts_query_predicates_for_pattern( - const TSQuery *self, - uint32_t pattern_index, - uint32_t *length -); - -/** - * Get the name and length of one of the query's captures, or one of the - * query's string literals. Each capture and string is associated with a - * numeric id based on the order that it appeared in the query's source. - */ -const char *ts_query_capture_name_for_id( - const TSQuery *, - uint32_t id, - uint32_t *length -); -const char *ts_query_string_value_for_id( - const TSQuery *, - uint32_t id, - uint32_t *length -); - -/** - * Disable a certain capture within a query. - * - * This prevents the capture from being returned in matches, and also avoids - * any resource usage associated with recording the capture. Currently, there - * is no way to undo this. - */ -void ts_query_disable_capture(TSQuery *, const char *, uint32_t); - -/** - * Disable a certain pattern within a query. - * - * This prevents the pattern from matching and removes most of the overhead - * associated with the pattern. Currently, there is no way to undo this. - */ -void ts_query_disable_pattern(TSQuery *, uint32_t); - -/** - * Create a new cursor for executing a given query. - * - * The cursor stores the state that is needed to iteratively search - * for matches. To use the query cursor, first call `ts_query_cursor_exec` - * to start running a given query on a given syntax node. Then, there are - * two options for consuming the results of the query: - * 1. Repeatedly call `ts_query_cursor_next_match` to iterate over all of the - * the *matches* in the order that they were found. Each match contains the - * index of the pattern that matched, and an array of captures. 
Because - * multiple patterns can match the same set of nodes, one match may contain - * captures that appear *before* some of the captures from a previous match. - * 2. Repeatedly call `ts_query_cursor_next_capture` to iterate over all of the - * individual *captures* in the order that they appear. This is useful if - * don't care about which pattern matched, and just want a single ordered - * sequence of captures. - * - * If you don't care about consuming all of the results, you can stop calling - * `ts_query_cursor_next_match` or `ts_query_cursor_next_capture` at any point. - * You can then start executing another query on another node by calling - * `ts_query_cursor_exec` again. - */ -TSQueryCursor *ts_query_cursor_new(void); - -/** - * Delete a query cursor, freeing all of the memory that it used. - */ -void ts_query_cursor_delete(TSQueryCursor *); - -/** - * Start running a given query on a given node. - */ -void ts_query_cursor_exec(TSQueryCursor *, const TSQuery *, TSNode); - -/** - * Set the range of bytes or (row, column) positions in which the query - * will be executed. - */ -void ts_query_cursor_set_byte_range(TSQueryCursor *, uint32_t, uint32_t); -void ts_query_cursor_set_point_range(TSQueryCursor *, TSPoint, TSPoint); - -/** - * Advance to the next match of the currently running query. - * - * If there is a match, write it to `*match` and return `true`. - * Otherwise, return `false`. - */ -bool ts_query_cursor_next_match(TSQueryCursor *, TSQueryMatch *match); -void ts_query_cursor_remove_match(TSQueryCursor *, uint32_t id); - -/** - * Advance to the next capture of the currently running query. - * - * If there is a capture, write its match to `*match` and its index within - * the matche's capture list to `*capture_index`. Otherwise, return `false`. 
- */ -bool ts_query_cursor_next_capture( - TSQueryCursor *, - TSQueryMatch *match, - uint32_t *capture_index -); - -/**********************/ -/* Section - Language */ -/**********************/ - -/** - * Get the number of distinct node types in the language. - */ -uint32_t ts_language_symbol_count(const TSLanguage *); - -/** - * Get a node type string for the given numerical id. - */ -const char *ts_language_symbol_name(const TSLanguage *, TSSymbol); - -/** - * Get the numerical id for the given node type string. - */ -TSSymbol ts_language_symbol_for_name( - const TSLanguage *self, - const char *string, - uint32_t length, - bool is_named -); - -/** - * Get the number of distinct field names in the language. - */ -uint32_t ts_language_field_count(const TSLanguage *); - -/** - * Get the field name string for the given numerical id. - */ -const char *ts_language_field_name_for_id(const TSLanguage *, TSFieldId); - -/** - * Get the numerical id for the given field name string. - */ -TSFieldId ts_language_field_id_for_name(const TSLanguage *, const char *, uint32_t); - -/** - * Check whether the given node type id belongs to named nodes, anonymous nodes, - * or a hidden nodes. - * - * See also `ts_node_is_named`. Hidden nodes are never returned from the API. - */ -TSSymbolType ts_language_symbol_type(const TSLanguage *, TSSymbol); - -/** - * Get the ABI version number for this language. This version number is used - * to ensure that languages were generated by a compatible version of - * Tree-sitter. - * - * See also `ts_parser_set_language`. 
- */ -uint32_t ts_language_version(const TSLanguage *); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_API_H_ diff --git a/src/tree_sitter/array.h b/src/tree_sitter/array.h deleted file mode 100644 index 26cb8448f1..0000000000 --- a/src/tree_sitter/array.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef TREE_SITTER_ARRAY_H_ -#define TREE_SITTER_ARRAY_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <string.h> -#include <stdlib.h> -#include <stdint.h> -#include <assert.h> -#include <stdbool.h> -#include "./alloc.h" - -#define Array(T) \ - struct { \ - T *contents; \ - uint32_t size; \ - uint32_t capacity; \ - } - -#define array_init(self) \ - ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) - -#define array_new() \ - { NULL, 0, 0 } - -#define array_get(self, index) \ - (assert((uint32_t)index < (self)->size), &(self)->contents[index]) - -#define array_front(self) array_get(self, 0) - -#define array_back(self) array_get(self, (self)->size - 1) - -#define array_clear(self) ((self)->size = 0) - -#define array_reserve(self, new_capacity) \ - array__reserve((VoidArray *)(self), array__elem_size(self), new_capacity) - -#define array_erase(self, index) \ - array__erase((VoidArray *)(self), array__elem_size(self), index) - -#define array_delete(self) array__delete((VoidArray *)self) - -#define array_push(self, element) \ - (array__grow((VoidArray *)(self), 1, array__elem_size(self)), \ - (self)->contents[(self)->size++] = (element)) - -#define array_grow_by(self, count) \ - (array__grow((VoidArray *)(self), count, array__elem_size(self)), \ - memset((self)->contents + (self)->size, 0, (count) * array__elem_size(self)), \ - (self)->size += (count)) - -#define array_push_all(self, other) \ - array_splice((self), (self)->size, 0, (other)->size, (other)->contents) - -#define array_splice(self, index, old_count, new_count, new_contents) \ - array__splice((VoidArray *)(self), array__elem_size(self), index, old_count, \ - new_count, new_contents) - 
-#define array_insert(self, index, element) \ - array__splice((VoidArray *)(self), array__elem_size(self), index, 0, 1, &element) - -#define array_pop(self) ((self)->contents[--(self)->size]) - -#define array_assign(self, other) \ - array__assign((VoidArray *)(self), (const VoidArray *)(other), array__elem_size(self)) - -// Private - -typedef Array(void) VoidArray; - -#define array__elem_size(self) sizeof(*(self)->contents) - -static inline void array__delete(VoidArray *self) { - ts_free(self->contents); - self->contents = NULL; - self->size = 0; - self->capacity = 0; -} - -static inline void array__erase(VoidArray *self, size_t element_size, - uint32_t index) { - assert(index < self->size); - char *contents = (char *)self->contents; - memmove(contents + index * element_size, contents + (index + 1) * element_size, - (self->size - index - 1) * element_size); - self->size--; -} - -static inline void array__reserve(VoidArray *self, size_t element_size, uint32_t new_capacity) { - if (new_capacity > self->capacity) { - if (self->contents) { - self->contents = ts_realloc(self->contents, new_capacity * element_size); - } else { - self->contents = ts_calloc(new_capacity, element_size); - } - self->capacity = new_capacity; - } -} - -static inline void array__assign(VoidArray *self, const VoidArray *other, size_t element_size) { - array__reserve(self, element_size, other->size); - self->size = other->size; - memcpy(self->contents, other->contents, self->size * element_size); -} - -static inline void array__grow(VoidArray *self, size_t count, size_t element_size) { - size_t new_size = self->size + count; - if (new_size > self->capacity) { - size_t new_capacity = self->capacity * 2; - if (new_capacity < 8) new_capacity = 8; - if (new_capacity < new_size) new_capacity = new_size; - array__reserve(self, element_size, new_capacity); - } -} - -static inline void array__splice(VoidArray *self, size_t element_size, - uint32_t index, uint32_t old_count, - uint32_t new_count, const 
void *elements) { - uint32_t new_size = self->size + new_count - old_count; - uint32_t old_end = index + old_count; - uint32_t new_end = index + new_count; - assert(old_end <= self->size); - - array__reserve(self, element_size, new_size); - - char *contents = (char *)self->contents; - if (self->size > old_end) { - memmove( - contents + new_end * element_size, - contents + old_end * element_size, - (self->size - old_end) * element_size - ); - } - if (new_count > 0) { - if (elements) { - memcpy( - (contents + index * element_size), - elements, - new_count * element_size - ); - } else { - memset( - (contents + index * element_size), - 0, - new_count * element_size - ); - } - } - self->size += new_count - old_count; -} - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_ARRAY_H_ diff --git a/src/tree_sitter/atomic.h b/src/tree_sitter/atomic.h deleted file mode 100644 index 7bd0e850a9..0000000000 --- a/src/tree_sitter/atomic.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef TREE_SITTER_ATOMIC_H_ -#define TREE_SITTER_ATOMIC_H_ - -#include <stdint.h> - -#ifdef _WIN32 - -#include <windows.h> - -static inline size_t atomic_load(const volatile size_t *p) { - return *p; -} - -static inline uint32_t atomic_inc(volatile uint32_t *p) { - return InterlockedIncrement((long volatile *)p); -} - -static inline uint32_t atomic_dec(volatile uint32_t *p) { - return InterlockedDecrement((long volatile *)p); -} - -#else - -static inline size_t atomic_load(const volatile size_t *p) { -#ifdef __ATOMIC_RELAXED - return __atomic_load_n(p, __ATOMIC_RELAXED); -#else - return __sync_fetch_and_add((volatile size_t *)p, 0); -#endif -} - -static inline uint32_t atomic_inc(volatile uint32_t *p) { - return __sync_add_and_fetch(p, 1u); -} - -static inline uint32_t atomic_dec(volatile uint32_t *p) { - return __sync_sub_and_fetch(p, 1u); -} - -#endif - -#endif // TREE_SITTER_ATOMIC_H_ diff --git a/src/tree_sitter/bits.h b/src/tree_sitter/bits.h deleted file mode 100644 index ce7a715567..0000000000 --- 
a/src/tree_sitter/bits.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TREE_SITTER_BITS_H_ -#define TREE_SITTER_BITS_H_ - -#include <stdint.h> - -static inline uint32_t bitmask_for_index(uint16_t id) { - return (1u << (31 - id)); -} - -#if defined _WIN32 && !defined __GNUC__ - -#include <intrin.h> - -static inline uint32_t count_leading_zeros(uint32_t x) { - if (x == 0) return 32; - uint32_t result; - _BitScanReverse(&result, x); - return 31 - result; -} - -#else - -static inline uint32_t count_leading_zeros(uint32_t x) { - if (x == 0) return 32; - return __builtin_clz(x); -} - -#endif -#endif // TREE_SITTER_BITS_H_ diff --git a/src/tree_sitter/clock.h b/src/tree_sitter/clock.h deleted file mode 100644 index 94545f3566..0000000000 --- a/src/tree_sitter/clock.h +++ /dev/null @@ -1,141 +0,0 @@ -#ifndef TREE_SITTER_CLOCK_H_ -#define TREE_SITTER_CLOCK_H_ - -#include <stdint.h> - -typedef uint64_t TSDuration; - -#ifdef _WIN32 - -// Windows: -// * Represent a time as a performance counter value. -// * Represent a duration as a number of performance counter ticks. 
- -#include <windows.h> -typedef uint64_t TSClock; - -static inline TSDuration duration_from_micros(uint64_t micros) { - LARGE_INTEGER frequency; - QueryPerformanceFrequency(&frequency); - return micros * (uint64_t)frequency.QuadPart / 1000000; -} - -static inline uint64_t duration_to_micros(TSDuration self) { - LARGE_INTEGER frequency; - QueryPerformanceFrequency(&frequency); - return self * 1000000 / (uint64_t)frequency.QuadPart; -} - -static inline TSClock clock_null(void) { - return 0; -} - -static inline TSClock clock_now(void) { - LARGE_INTEGER result; - QueryPerformanceCounter(&result); - return (uint64_t)result.QuadPart; -} - -static inline TSClock clock_after(TSClock base, TSDuration duration) { - return base + duration; -} - -static inline bool clock_is_null(TSClock self) { - return !self; -} - -static inline bool clock_is_gt(TSClock self, TSClock other) { - return self > other; -} - -#elif defined(CLOCK_MONOTONIC) && !defined(__APPLE__) - -// POSIX with monotonic clock support (Linux) -// * Represent a time as a monotonic (seconds, nanoseconds) pair. -// * Represent a duration as a number of microseconds. -// -// On these platforms, parse timeouts will correspond accurately to -// real time, regardless of what other processes are running. 
- -#include <time.h> -typedef struct timespec TSClock; - -static inline TSDuration duration_from_micros(uint64_t micros) { - return micros; -} - -static inline uint64_t duration_to_micros(TSDuration self) { - return self; -} - -static inline TSClock clock_now(void) { - TSClock result; - clock_gettime(CLOCK_MONOTONIC, &result); - return result; -} - -static inline TSClock clock_null(void) { - return (TSClock) {0, 0}; -} - -static inline TSClock clock_after(TSClock base, TSDuration duration) { - TSClock result = base; - result.tv_sec += duration / 1000000; - result.tv_nsec += (duration % 1000000) * 1000; - return result; -} - -static inline bool clock_is_null(TSClock self) { - return !self.tv_sec; -} - -static inline bool clock_is_gt(TSClock self, TSClock other) { - if (self.tv_sec > other.tv_sec) return true; - if (self.tv_sec < other.tv_sec) return false; - return self.tv_nsec > other.tv_nsec; -} - -#else - -// macOS or POSIX without monotonic clock support -// * Represent a time as a process clock value. -// * Represent a duration as a number of process clock ticks. -// -// On these platforms, parse timeouts may be affected by other processes, -// which is not ideal, but is better than using a non-monotonic time API -// like `gettimeofday`. 
- -#include <time.h> -typedef uint64_t TSClock; - -static inline TSDuration duration_from_micros(uint64_t micros) { - return micros * (uint64_t)CLOCKS_PER_SEC / 1000000; -} - -static inline uint64_t duration_to_micros(TSDuration self) { - return self * 1000000 / (uint64_t)CLOCKS_PER_SEC; -} - -static inline TSClock clock_null(void) { - return 0; -} - -static inline TSClock clock_now(void) { - return (uint64_t)clock(); -} - -static inline TSClock clock_after(TSClock base, TSDuration duration) { - return base + duration; -} - -static inline bool clock_is_null(TSClock self) { - return !self; -} - -static inline bool clock_is_gt(TSClock self, TSClock other) { - return self > other; -} - -#endif - -#endif // TREE_SITTER_CLOCK_H_ diff --git a/src/tree_sitter/error_costs.h b/src/tree_sitter/error_costs.h deleted file mode 100644 index 32d3666a66..0000000000 --- a/src/tree_sitter/error_costs.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef TREE_SITTER_ERROR_COSTS_H_ -#define TREE_SITTER_ERROR_COSTS_H_ - -#define ERROR_STATE 0 -#define ERROR_COST_PER_RECOVERY 500 -#define ERROR_COST_PER_MISSING_TREE 110 -#define ERROR_COST_PER_SKIPPED_TREE 100 -#define ERROR_COST_PER_SKIPPED_LINE 30 -#define ERROR_COST_PER_SKIPPED_CHAR 1 - -#endif diff --git a/src/tree_sitter/get_changed_ranges.c b/src/tree_sitter/get_changed_ranges.c deleted file mode 100644 index 5bd1d814bd..0000000000 --- a/src/tree_sitter/get_changed_ranges.c +++ /dev/null @@ -1,482 +0,0 @@ -#include "./get_changed_ranges.h" -#include "./subtree.h" -#include "./language.h" -#include "./error_costs.h" -#include "./tree_cursor.h" -#include <assert.h> - -// #define DEBUG_GET_CHANGED_RANGES - -static void ts_range_array_add(TSRangeArray *self, Length start, Length end) { - if (self->size > 0) { - TSRange *last_range = array_back(self); - if (start.bytes <= last_range->end_byte) { - last_range->end_byte = end.bytes; - last_range->end_point = end.extent; - return; - } - } - - if (start.bytes < end.bytes) { - TSRange range = { 
start.extent, end.extent, start.bytes, end.bytes }; - array_push(self, range); - } -} - -bool ts_range_array_intersects(const TSRangeArray *self, unsigned start_index, - uint32_t start_byte, uint32_t end_byte) { - for (unsigned i = start_index; i < self->size; i++) { - TSRange *range = &self->contents[i]; - if (range->end_byte > start_byte) { - if (range->start_byte >= end_byte) break; - return true; - } - } - return false; -} - -void ts_range_array_get_changed_ranges( - const TSRange *old_ranges, unsigned old_range_count, - const TSRange *new_ranges, unsigned new_range_count, - TSRangeArray *differences -) { - unsigned new_index = 0; - unsigned old_index = 0; - Length current_position = length_zero(); - bool in_old_range = false; - bool in_new_range = false; - - while (old_index < old_range_count || new_index < new_range_count) { - const TSRange *old_range = &old_ranges[old_index]; - const TSRange *new_range = &new_ranges[new_index]; - - Length next_old_position; - if (in_old_range) { - next_old_position = (Length) {old_range->end_byte, old_range->end_point}; - } else if (old_index < old_range_count) { - next_old_position = (Length) {old_range->start_byte, old_range->start_point}; - } else { - next_old_position = LENGTH_MAX; - } - - Length next_new_position; - if (in_new_range) { - next_new_position = (Length) {new_range->end_byte, new_range->end_point}; - } else if (new_index < new_range_count) { - next_new_position = (Length) {new_range->start_byte, new_range->start_point}; - } else { - next_new_position = LENGTH_MAX; - } - - if (next_old_position.bytes < next_new_position.bytes) { - if (in_old_range != in_new_range) { - ts_range_array_add(differences, current_position, next_old_position); - } - if (in_old_range) old_index++; - current_position = next_old_position; - in_old_range = !in_old_range; - } else if (next_new_position.bytes < next_old_position.bytes) { - if (in_old_range != in_new_range) { - ts_range_array_add(differences, current_position, 
next_new_position); - } - if (in_new_range) new_index++; - current_position = next_new_position; - in_new_range = !in_new_range; - } else { - if (in_old_range != in_new_range) { - ts_range_array_add(differences, current_position, next_new_position); - } - if (in_old_range) old_index++; - if (in_new_range) new_index++; - in_old_range = !in_old_range; - in_new_range = !in_new_range; - current_position = next_new_position; - } - } -} - -typedef struct { - TreeCursor cursor; - const TSLanguage *language; - unsigned visible_depth; - bool in_padding; -} Iterator; - -static Iterator iterator_new(TreeCursor *cursor, const Subtree *tree, const TSLanguage *language) { - array_clear(&cursor->stack); - array_push(&cursor->stack, ((TreeCursorEntry){ - .subtree = tree, - .position = length_zero(), - .child_index = 0, - .structural_child_index = 0, - })); - return (Iterator) { - .cursor = *cursor, - .language = language, - .visible_depth = 1, - .in_padding = false, - }; -} - -static bool iterator_done(Iterator *self) { - return self->cursor.stack.size == 0; -} - -static Length iterator_start_position(Iterator *self) { - TreeCursorEntry entry = *array_back(&self->cursor.stack); - if (self->in_padding) { - return entry.position; - } else { - return length_add(entry.position, ts_subtree_padding(*entry.subtree)); - } -} - -static Length iterator_end_position(Iterator *self) { - TreeCursorEntry entry = *array_back(&self->cursor.stack); - Length result = length_add(entry.position, ts_subtree_padding(*entry.subtree)); - if (self->in_padding) { - return result; - } else { - return length_add(result, ts_subtree_size(*entry.subtree)); - } -} - -static bool iterator_tree_is_visible(const Iterator *self) { - TreeCursorEntry entry = *array_back(&self->cursor.stack); - if (ts_subtree_visible(*entry.subtree)) return true; - if (self->cursor.stack.size > 1) { - Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree; - const TSSymbol *alias_sequence = 
ts_language_alias_sequence( - self->language, - parent.ptr->production_id - ); - return alias_sequence && alias_sequence[entry.structural_child_index] != 0; - } - return false; -} - -static void iterator_get_visible_state(const Iterator *self, Subtree *tree, - TSSymbol *alias_symbol, uint32_t *start_byte) { - uint32_t i = self->cursor.stack.size - 1; - - if (self->in_padding) { - if (i == 0) return; - i--; - } - - for (; i + 1 > 0; i--) { - TreeCursorEntry entry = self->cursor.stack.contents[i]; - - if (i > 0) { - const Subtree *parent = self->cursor.stack.contents[i - 1].subtree; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->language, - parent->ptr->production_id - ); - if (alias_sequence) { - *alias_symbol = alias_sequence[entry.structural_child_index]; - } - } - - if (ts_subtree_visible(*entry.subtree) || *alias_symbol) { - *tree = *entry.subtree; - *start_byte = entry.position.bytes; - break; - } - } -} - -static void iterator_ascend(Iterator *self) { - if (iterator_done(self)) return; - if (iterator_tree_is_visible(self) && !self->in_padding) self->visible_depth--; - if (array_back(&self->cursor.stack)->child_index > 0) self->in_padding = false; - self->cursor.stack.size--; -} - -static bool iterator_descend(Iterator *self, uint32_t goal_position) { - if (self->in_padding) return false; - - bool did_descend; - do { - did_descend = false; - TreeCursorEntry entry = *array_back(&self->cursor.stack); - Length position = entry.position; - uint32_t structural_child_index = 0; - for (uint32_t i = 0, n = ts_subtree_child_count(*entry.subtree); i < n; i++) { - const Subtree *child = &entry.subtree->ptr->children[i]; - Length child_left = length_add(position, ts_subtree_padding(*child)); - Length child_right = length_add(child_left, ts_subtree_size(*child)); - - if (child_right.bytes > goal_position) { - array_push(&self->cursor.stack, ((TreeCursorEntry){ - .subtree = child, - .position = position, - .child_index = i, - .structural_child_index = 
structural_child_index, - })); - - if (iterator_tree_is_visible(self)) { - if (child_left.bytes > goal_position) { - self->in_padding = true; - } else { - self->visible_depth++; - } - return true; - } - - did_descend = true; - break; - } - - position = child_right; - if (!ts_subtree_extra(*child)) structural_child_index++; - } - } while (did_descend); - - return false; -} - -static void iterator_advance(Iterator *self) { - if (self->in_padding) { - self->in_padding = false; - if (iterator_tree_is_visible(self)) { - self->visible_depth++; - } else { - iterator_descend(self, 0); - } - return; - } - - for (;;) { - if (iterator_tree_is_visible(self)) self->visible_depth--; - TreeCursorEntry entry = array_pop(&self->cursor.stack); - if (iterator_done(self)) return; - - const Subtree *parent = array_back(&self->cursor.stack)->subtree; - uint32_t child_index = entry.child_index + 1; - if (ts_subtree_child_count(*parent) > child_index) { - Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); - uint32_t structural_child_index = entry.structural_child_index; - if (!ts_subtree_extra(*entry.subtree)) structural_child_index++; - const Subtree *next_child = &parent->ptr->children[child_index]; - - array_push(&self->cursor.stack, ((TreeCursorEntry){ - .subtree = next_child, - .position = position, - .child_index = child_index, - .structural_child_index = structural_child_index, - })); - - if (iterator_tree_is_visible(self)) { - if (ts_subtree_padding(*next_child).bytes > 0) { - self->in_padding = true; - } else { - self->visible_depth++; - } - } else { - iterator_descend(self, 0); - } - break; - } - } -} - -typedef enum { - IteratorDiffers, - IteratorMayDiffer, - IteratorMatches, -} IteratorComparison; - -static IteratorComparison iterator_compare(const Iterator *old_iter, const Iterator *new_iter) { - Subtree old_tree = NULL_SUBTREE; - Subtree new_tree = NULL_SUBTREE; - uint32_t old_start = 0; - uint32_t new_start = 0; - TSSymbol old_alias_symbol 
= 0; - TSSymbol new_alias_symbol = 0; - iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); - iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); - - if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; - if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; - - if ( - old_alias_symbol == new_alias_symbol && - ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) - ) { - if (old_start == new_start && - !ts_subtree_has_changes(old_tree) && - ts_subtree_symbol(old_tree) != ts_builtin_sym_error && - ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && - ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && - ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && - (ts_subtree_parse_state(old_tree) == ERROR_STATE) == - (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { - return IteratorMatches; - } else { - return IteratorMayDiffer; - } - } - - return IteratorDiffers; -} - -#ifdef DEBUG_GET_CHANGED_RANGES -static inline void iterator_print_state(Iterator *self) { - TreeCursorEntry entry = *array_back(&self->cursor.stack); - TSPoint start = iterator_start_position(self).extent; - TSPoint end = iterator_end_position(self).extent; - const char *name = ts_language_symbol_name(self->language, ts_subtree_symbol(*entry.subtree)); - printf( - "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", - name, self->in_padding ? 
"(p)" : " ", - self->visible_depth, - start.row + 1, start.column, - end.row + 1, end.column - ); -} -#endif - -unsigned ts_subtree_get_changed_ranges(const Subtree *old_tree, const Subtree *new_tree, - TreeCursor *cursor1, TreeCursor *cursor2, - const TSLanguage *language, - const TSRangeArray *included_range_differences, - TSRange **ranges) { - TSRangeArray results = array_new(); - - Iterator old_iter = iterator_new(cursor1, old_tree, language); - Iterator new_iter = iterator_new(cursor2, new_tree, language); - - unsigned included_range_difference_index = 0; - - Length position = iterator_start_position(&old_iter); - Length next_position = iterator_start_position(&new_iter); - if (position.bytes < next_position.bytes) { - ts_range_array_add(&results, position, next_position); - position = next_position; - } else if (position.bytes > next_position.bytes) { - ts_range_array_add(&results, next_position, position); - next_position = position; - } - - do { - #ifdef DEBUG_GET_CHANGED_RANGES - printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); - iterator_print_state(&old_iter); - printf("\tvs\t"); - iterator_print_state(&new_iter); - puts(""); - #endif - - // Compare the old and new subtrees. - IteratorComparison comparison = iterator_compare(&old_iter, &new_iter); - - // Even if the two subtrees appear to be identical, they could differ - // internally if they contain a range of text that was previously - // excluded from the parse, and is now included, or vice-versa. - if (comparison == IteratorMatches && ts_range_array_intersects( - included_range_differences, - included_range_difference_index, - position.bytes, - iterator_end_position(&old_iter).bytes - )) { - comparison = IteratorMayDiffer; - } - - bool is_changed = false; - switch (comparison) { - // If the subtrees are definitely identical, move to the end - // of both subtrees. 
- case IteratorMatches: - next_position = iterator_end_position(&old_iter); - break; - - // If the subtrees might differ internally, descend into both - // subtrees, finding the first child that spans the current position. - case IteratorMayDiffer: - if (iterator_descend(&old_iter, position.bytes)) { - if (!iterator_descend(&new_iter, position.bytes)) { - is_changed = true; - next_position = iterator_end_position(&old_iter); - } - } else if (iterator_descend(&new_iter, position.bytes)) { - is_changed = true; - next_position = iterator_end_position(&new_iter); - } else { - next_position = length_min( - iterator_end_position(&old_iter), - iterator_end_position(&new_iter) - ); - } - break; - - // If the subtrees are different, record a change and then move - // to the end of both subtrees. - case IteratorDiffers: - is_changed = true; - next_position = length_min( - iterator_end_position(&old_iter), - iterator_end_position(&new_iter) - ); - break; - } - - // Ensure that both iterators are caught up to the current position. - while ( - !iterator_done(&old_iter) && - iterator_end_position(&old_iter).bytes <= next_position.bytes - ) iterator_advance(&old_iter); - while ( - !iterator_done(&new_iter) && - iterator_end_position(&new_iter).bytes <= next_position.bytes - ) iterator_advance(&new_iter); - - // Ensure that both iterators are at the same depth in the tree. 
- while (old_iter.visible_depth > new_iter.visible_depth) { - iterator_ascend(&old_iter); - } - while (new_iter.visible_depth > old_iter.visible_depth) { - iterator_ascend(&new_iter); - } - - if (is_changed) { - #ifdef DEBUG_GET_CHANGED_RANGES - printf( - " change: [[%u, %u] - [%u, %u]]\n", - position.extent.row + 1, position.extent.column, - next_position.extent.row + 1, next_position.extent.column - ); - #endif - - ts_range_array_add(&results, position, next_position); - } - - position = next_position; - - // Keep track of the current position in the included range differences - // array in order to avoid scanning the entire array on each iteration. - while (included_range_difference_index < included_range_differences->size) { - const TSRange *range = &included_range_differences->contents[ - included_range_difference_index - ]; - if (range->end_byte <= position.bytes) { - included_range_difference_index++; - } else { - break; - } - } - } while (!iterator_done(&old_iter) && !iterator_done(&new_iter)); - - Length old_size = ts_subtree_total_size(*old_tree); - Length new_size = ts_subtree_total_size(*new_tree); - if (old_size.bytes < new_size.bytes) { - ts_range_array_add(&results, old_size, new_size); - } else if (new_size.bytes < old_size.bytes) { - ts_range_array_add(&results, new_size, old_size); - } - - *cursor1 = old_iter.cursor; - *cursor2 = new_iter.cursor; - *ranges = results.contents; - return results.size; -} diff --git a/src/tree_sitter/get_changed_ranges.h b/src/tree_sitter/get_changed_ranges.h deleted file mode 100644 index a1f1dbb430..0000000000 --- a/src/tree_sitter/get_changed_ranges.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef TREE_SITTER_GET_CHANGED_RANGES_H_ -#define TREE_SITTER_GET_CHANGED_RANGES_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "./tree_cursor.h" -#include "./subtree.h" - -typedef Array(TSRange) TSRangeArray; - -void ts_range_array_get_changed_ranges( - const TSRange *old_ranges, unsigned old_range_count, - const TSRange 
*new_ranges, unsigned new_range_count, - TSRangeArray *differences -); - -bool ts_range_array_intersects( - const TSRangeArray *self, unsigned start_index, - uint32_t start_byte, uint32_t end_byte -); - -unsigned ts_subtree_get_changed_ranges( - const Subtree *old_tree, const Subtree *new_tree, - TreeCursor *cursor1, TreeCursor *cursor2, - const TSLanguage *language, - const TSRangeArray *included_range_differences, - TSRange **ranges -); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_GET_CHANGED_RANGES_H_ diff --git a/src/tree_sitter/language.c b/src/tree_sitter/language.c deleted file mode 100644 index c00c49e3c0..0000000000 --- a/src/tree_sitter/language.c +++ /dev/null @@ -1,149 +0,0 @@ -#include "./language.h" -#include "./subtree.h" -#include "./error_costs.h" -#include <string.h> - -uint32_t ts_language_symbol_count(const TSLanguage *self) { - return self->symbol_count + self->alias_count; -} - -uint32_t ts_language_version(const TSLanguage *self) { - return self->version; -} - -uint32_t ts_language_field_count(const TSLanguage *self) { - if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS) { - return self->field_count; - } else { - return 0; - } -} - -void ts_language_table_entry( - const TSLanguage *self, - TSStateId state, - TSSymbol symbol, - TableEntry *result -) { - if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { - result->action_count = 0; - result->is_reusable = false; - result->actions = NULL; - } else { - assert(symbol < self->token_count); - uint32_t action_index = ts_language_lookup(self, state, symbol); - const TSParseActionEntry *entry = &self->parse_actions[action_index]; - result->action_count = entry->entry.count; - result->is_reusable = entry->entry.reusable; - result->actions = (const TSParseAction *)(entry + 1); - } -} - -TSSymbolMetadata ts_language_symbol_metadata( - const TSLanguage *self, - TSSymbol symbol -) { - if (symbol == ts_builtin_sym_error) { - return 
(TSSymbolMetadata){.visible = true, .named = true}; - } else if (symbol == ts_builtin_sym_error_repeat) { - return (TSSymbolMetadata){.visible = false, .named = false}; - } else { - return self->symbol_metadata[symbol]; - } -} - -TSSymbol ts_language_public_symbol( - const TSLanguage *self, - TSSymbol symbol -) { - if (symbol == ts_builtin_sym_error) return symbol; - if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { - return self->public_symbol_map[symbol]; - } else { - return symbol; - } -} - -const char *ts_language_symbol_name( - const TSLanguage *self, - TSSymbol symbol -) { - if (symbol == ts_builtin_sym_error) { - return "ERROR"; - } else if (symbol == ts_builtin_sym_error_repeat) { - return "_ERROR"; - } else if (symbol < ts_language_symbol_count(self)) { - return self->symbol_names[symbol]; - } else { - return NULL; - } -} - -TSSymbol ts_language_symbol_for_name( - const TSLanguage *self, - const char *string, - uint32_t length, - bool is_named -) { - if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error; - uint32_t count = ts_language_symbol_count(self); - for (TSSymbol i = 0; i < count; i++) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); - if (!metadata.visible || metadata.named != is_named) continue; - const char *symbol_name = self->symbol_names[i]; - if (!strncmp(symbol_name, string, length) && !symbol_name[length]) { - if (self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { - return self->public_symbol_map[i]; - } else { - return i; - } - } - } - return 0; -} - -TSSymbolType ts_language_symbol_type( - const TSLanguage *self, - TSSymbol symbol -) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol); - if (metadata.named) { - return TSSymbolTypeRegular; - } else if (metadata.visible) { - return TSSymbolTypeAnonymous; - } else { - return TSSymbolTypeAuxiliary; - } -} - -const char *ts_language_field_name_for_id( - const TSLanguage *self, - TSFieldId id -) { 
- uint32_t count = ts_language_field_count(self); - if (count && id <= count) { - return self->field_names[id]; - } else { - return NULL; - } -} - -TSFieldId ts_language_field_id_for_name( - const TSLanguage *self, - const char *name, - uint32_t name_length -) { - uint32_t count = ts_language_field_count(self); - for (TSSymbol i = 1; i < count + 1; i++) { - switch (strncmp(name, self->field_names[i], name_length)) { - case 0: - if (self->field_names[i][name_length] == 0) return i; - break; - case -1: - return 0; - default: - break; - } - } - return 0; -} diff --git a/src/tree_sitter/language.h b/src/tree_sitter/language.h deleted file mode 100644 index 341f0f85af..0000000000 --- a/src/tree_sitter/language.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef TREE_SITTER_LANGUAGE_H_ -#define TREE_SITTER_LANGUAGE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "./subtree.h" -#include "tree_sitter/parser.h" - -#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) -#define TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS 10 -#define TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING 11 -#define TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES 11 - -typedef struct { - const TSParseAction *actions; - uint32_t action_count; - bool is_reusable; -} TableEntry; - -void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); - -TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); - -TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol); - -static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { - return 0 < symbol && symbol < self->external_token_count + 1; -} - -static inline const TSParseAction *ts_language_actions( - const TSLanguage *self, - TSStateId state, - TSSymbol symbol, - uint32_t *count -) { - TableEntry entry; - ts_language_table_entry(self, state, symbol, &entry); - *count = entry.action_count; - return entry.actions; -} - -static inline bool 
ts_language_has_actions(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { - TableEntry entry; - ts_language_table_entry(self, state, symbol, &entry); - return entry.action_count > 0; -} - -static inline bool ts_language_has_reduce_action(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { - TableEntry entry; - ts_language_table_entry(self, state, symbol, &entry); - return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; -} - -static inline uint16_t ts_language_lookup( - const TSLanguage *self, - TSStateId state, - TSSymbol symbol -) { - if ( - self->version >= TREE_SITTER_LANGUAGE_VERSION_WITH_SMALL_STATES && - state >= self->large_state_count - ) { - uint32_t index = self->small_parse_table_map[state - self->large_state_count]; - const uint16_t *data = &self->small_parse_table[index]; - uint16_t section_count = *(data++); - for (unsigned i = 0; i < section_count; i++) { - uint16_t section_value = *(data++); - uint16_t symbol_count = *(data++); - for (unsigned i = 0; i < symbol_count; i++) { - if (*(data++) == symbol) return section_value; - } - } - return 0; - } else { - return self->parse_table[state * self->symbol_count + symbol]; - } -} - -static inline TSStateId ts_language_next_state(const TSLanguage *self, - TSStateId state, - TSSymbol symbol) { - if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { - return 0; - } else if (symbol < self->token_count) { - uint32_t count; - const TSParseAction *actions = ts_language_actions(self, state, symbol, &count); - if (count > 0) { - TSParseAction action = actions[count - 1]; - if (action.type == TSParseActionTypeShift) { - return action.params.shift.extra ? 
state : action.params.shift.state; - } - } - return 0; - } else { - return ts_language_lookup(self, state, symbol); - } -} - -static inline const bool * -ts_language_enabled_external_tokens(const TSLanguage *self, - unsigned external_scanner_state) { - if (external_scanner_state == 0) { - return NULL; - } else { - return self->external_scanner.states + self->external_token_count * external_scanner_state; - } -} - -static inline const TSSymbol * -ts_language_alias_sequence(const TSLanguage *self, uint32_t production_id) { - return production_id > 0 ? - self->alias_sequences + production_id * self->max_alias_sequence_length : - NULL; -} - -static inline void ts_language_field_map( - const TSLanguage *self, - uint32_t production_id, - const TSFieldMapEntry **start, - const TSFieldMapEntry **end -) { - if (self->version < TREE_SITTER_LANGUAGE_VERSION_WITH_FIELDS || self->field_count == 0) { - *start = NULL; - *end = NULL; - return; - } - - TSFieldMapSlice slice = self->field_map_slices[production_id]; - *start = &self->field_map_entries[slice.index]; - *end = &self->field_map_entries[slice.index] + slice.length; -} - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_LANGUAGE_H_ diff --git a/src/tree_sitter/length.h b/src/tree_sitter/length.h deleted file mode 100644 index 61de9fc1d5..0000000000 --- a/src/tree_sitter/length.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef TREE_SITTER_LENGTH_H_ -#define TREE_SITTER_LENGTH_H_ - -#include <stdlib.h> -#include <stdbool.h> -#include "./point.h" -#include "tree_sitter/api.h" - -typedef struct { - uint32_t bytes; - TSPoint extent; -} Length; - -static const Length LENGTH_UNDEFINED = {0, {0, 1}}; -static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}}; - -static inline bool length_is_undefined(Length length) { - return length.bytes == 0 && length.extent.column != 0; -} - -static inline Length length_min(Length len1, Length len2) { - return (len1.bytes < len2.bytes) ? 
len1 : len2; -} - -static inline Length length_add(Length len1, Length len2) { - Length result; - result.bytes = len1.bytes + len2.bytes; - result.extent = point_add(len1.extent, len2.extent); - return result; -} - -static inline Length length_sub(Length len1, Length len2) { - Length result; - result.bytes = len1.bytes - len2.bytes; - result.extent = point_sub(len1.extent, len2.extent); - return result; -} - -static inline Length length_zero(void) { - Length result = {0, {0, 0}}; - return result; -} - -#endif diff --git a/src/tree_sitter/lexer.c b/src/tree_sitter/lexer.c deleted file mode 100644 index a3c29544d3..0000000000 --- a/src/tree_sitter/lexer.c +++ /dev/null @@ -1,391 +0,0 @@ -#include <stdio.h> -#include "./lexer.h" -#include "./subtree.h" -#include "./length.h" -#include "./unicode.h" - -#define LOG(message, character) \ - if (self->logger.log) { \ - snprintf( \ - self->debug_buffer, \ - TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \ - 32 <= character && character < 127 ? \ - message " character:'%c'" : \ - message " character:%d", \ - character \ - ); \ - self->logger.log( \ - self->logger.payload, \ - TSLogTypeLex, \ - self->debug_buffer \ - ); \ - } - -static const int32_t BYTE_ORDER_MARK = 0xFEFF; - -static const TSRange DEFAULT_RANGE = { - .start_point = { - .row = 0, - .column = 0, - }, - .end_point = { - .row = UINT32_MAX, - .column = UINT32_MAX, - }, - .start_byte = 0, - .end_byte = UINT32_MAX -}; - -// Check if the lexer has reached EOF. This state is stored -// by setting the lexer's `current_included_range_index` such that -// it has consumed all of its available ranges. -static bool ts_lexer__eof(const TSLexer *_self) { - Lexer *self = (Lexer *)_self; - return self->current_included_range_index == self->included_range_count; -} - -// Clear the currently stored chunk of source code, because the lexer's -// position has changed. 
-static void ts_lexer__clear_chunk(Lexer *self) { - self->chunk = NULL; - self->chunk_size = 0; - self->chunk_start = 0; -} - -// Call the lexer's input callback to obtain a new chunk of source code -// for the current position. -static void ts_lexer__get_chunk(Lexer *self) { - self->chunk_start = self->current_position.bytes; - self->chunk = self->input.read( - self->input.payload, - self->current_position.bytes, - self->current_position.extent, - &self->chunk_size - ); - if (!self->chunk_size) { - self->current_included_range_index = self->included_range_count; - self->chunk = NULL; - } -} - -// Decode the next unicode character in the current chunk of source code. -// This assumes that the lexer has already retrieved a chunk of source -// code that spans the current position. -static void ts_lexer__get_lookahead(Lexer *self) { - uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start; - uint32_t size = self->chunk_size - position_in_chunk; - - if (size == 0) { - self->lookahead_size = 1; - self->data.lookahead = '\0'; - return; - } - - const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; - UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 - ? ts_decode_utf8 - : ts_decode_utf16; - - self->lookahead_size = decode(chunk, size, &self->data.lookahead); - - // If this chunk ended in the middle of a multi-byte character, - // try again with a fresh chunk. - if (self->data.lookahead == TS_DECODE_ERROR && size < 4) { - ts_lexer__get_chunk(self); - chunk = (const uint8_t *)self->chunk; - size = self->chunk_size; - self->lookahead_size = decode(chunk, size, &self->data.lookahead); - } - - if (self->data.lookahead == TS_DECODE_ERROR) { - self->lookahead_size = 1; - } -} - -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. 
-static void ts_lexer__advance(TSLexer *_self, bool skip) { - Lexer *self = (Lexer *)_self; - if (!self->chunk) return; - - if (skip) { - LOG("skip", self->data.lookahead); - } else { - LOG("consume", self->data.lookahead); - } - - if (self->lookahead_size) { - self->current_position.bytes += self->lookahead_size; - if (self->data.lookahead == '\n') { - self->current_position.extent.row++; - self->current_position.extent.column = 0; - } else { - self->current_position.extent.column += self->lookahead_size; - } - } - - const TSRange *current_range = NULL; - if (self->current_included_range_index < self->included_range_count) { - current_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_position.bytes == current_range->end_byte) { - self->current_included_range_index++; - if (self->current_included_range_index < self->included_range_count) { - current_range++; - self->current_position = (Length) { - current_range->start_byte, - current_range->start_point, - }; - } else { - current_range = NULL; - } - } - } - - if (skip) self->token_start_position = self->current_position; - - if (current_range) { - if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { - ts_lexer__get_chunk(self); - } - ts_lexer__get_lookahead(self); - } else { - ts_lexer__clear_chunk(self); - self->data.lookahead = '\0'; - self->lookahead_size = 1; - } -} - -// Mark that a token match has completed. This can be called multiple -// times if a longer match is found later. -static void ts_lexer__mark_end(TSLexer *_self) { - Lexer *self = (Lexer *)_self; - if (!ts_lexer__eof(&self->data)) { - // If the lexer is right at the beginning of included range, - // then the token should be considered to end at the *end* of the - // previous included range, rather than here. 
- TSRange *current_included_range = &self->included_ranges[ - self->current_included_range_index - ]; - if ( - self->current_included_range_index > 0 && - self->current_position.bytes == current_included_range->start_byte - ) { - TSRange *previous_included_range = current_included_range - 1; - self->token_end_position = (Length) { - previous_included_range->end_byte, - previous_included_range->end_point, - }; - return; - } - } - self->token_end_position = self->current_position; -} - -static uint32_t ts_lexer__get_column(TSLexer *_self) { - Lexer *self = (Lexer *)_self; - uint32_t goal_byte = self->current_position.bytes; - - self->current_position.bytes -= self->current_position.extent.column; - self->current_position.extent.column = 0; - - if (self->current_position.bytes < self->chunk_start) { - ts_lexer__get_chunk(self); - } - - uint32_t result = 0; - while (self->current_position.bytes < goal_byte) { - ts_lexer__advance(&self->data, false); - result++; - } - - return result; -} - -// Is the lexer at a boundary between two disjoint included ranges of -// source code? This is exposed as an API because some languages' external -// scanners need to perform custom actions at these bounaries. -static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { - const Lexer *self = (const Lexer *)_self; - if (self->current_included_range_index < self->included_range_count) { - TSRange *current_range = &self->included_ranges[self->current_included_range_index]; - return self->current_position.bytes == current_range->start_byte; - } else { - return false; - } -} - -void ts_lexer_init(Lexer *self) { - *self = (Lexer) { - .data = { - // The lexer's methods are stored as struct fields so that generated - // parsers can call them without needing to be linked against this - // library. 
- .advance = ts_lexer__advance, - .mark_end = ts_lexer__mark_end, - .get_column = ts_lexer__get_column, - .is_at_included_range_start = ts_lexer__is_at_included_range_start, - .eof = ts_lexer__eof, - .lookahead = 0, - .result_symbol = 0, - }, - .chunk = NULL, - .chunk_size = 0, - .chunk_start = 0, - .current_position = {0, {0, 0}}, - .logger = { - .payload = NULL, - .log = NULL - }, - .included_ranges = NULL, - .included_range_count = 0, - .current_included_range_index = 0, - }; - ts_lexer_set_included_ranges(self, NULL, 0); -} - -void ts_lexer_delete(Lexer *self) { - ts_free(self->included_ranges); -} - -static void ts_lexer_goto(Lexer *self, Length position) { - self->current_position = position; - bool found_included_range = false; - - // Move to the first valid position at or after the given position. - for (unsigned i = 0; i < self->included_range_count; i++) { - TSRange *included_range = &self->included_ranges[i]; - if (included_range->end_byte > position.bytes) { - if (included_range->start_byte > position.bytes) { - self->current_position = (Length) { - .bytes = included_range->start_byte, - .extent = included_range->start_point, - }; - } - - self->current_included_range_index = i; - found_included_range = true; - break; - } - } - - if (found_included_range) { - // If the current position is outside of the current chunk of text, - // then clear out the current chunk of text. - if (self->chunk && ( - position.bytes < self->chunk_start || - position.bytes >= self->chunk_start + self->chunk_size - )) { - ts_lexer__clear_chunk(self); - } - - self->lookahead_size = 0; - self->data.lookahead = '\0'; - } - - // If the given position is beyond any of included ranges, move to the EOF - // state - past the end of the included ranges. 
- else { - self->current_included_range_index = self->included_range_count; - TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1]; - self->current_position = (Length) { - .bytes = last_included_range->end_byte, - .extent = last_included_range->end_point, - }; - ts_lexer__clear_chunk(self); - self->lookahead_size = 1; - self->data.lookahead = '\0'; - } -} - -void ts_lexer_set_input(Lexer *self, TSInput input) { - self->input = input; - ts_lexer__clear_chunk(self); - ts_lexer_goto(self, self->current_position); -} - -// Move the lexer to the given position. This doesn't do any work -// if the parser is already at the given position. -void ts_lexer_reset(Lexer *self, Length position) { - if (position.bytes != self->current_position.bytes) { - ts_lexer_goto(self, position); - } -} - -void ts_lexer_start(Lexer *self) { - self->token_start_position = self->current_position; - self->token_end_position = LENGTH_UNDEFINED; - self->data.result_symbol = 0; - if (!ts_lexer__eof(&self->data)) { - if (!self->chunk_size) ts_lexer__get_chunk(self); - if (!self->lookahead_size) ts_lexer__get_lookahead(self); - if ( - self->current_position.bytes == 0 && - self->data.lookahead == BYTE_ORDER_MARK - ) ts_lexer__advance(&self->data, true); - } -} - -void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { - if (length_is_undefined(self->token_end_position)) { - ts_lexer__mark_end(&self->data); - } - - uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; - - // In order to determine that a byte sequence is invalid UTF8 or UTF16, - // the character decoding algorithm may have looked at the following byte. - // Therefore, the next byte *after* the current (invalid) character - // affects the interpretation of the current character. 
- if (self->data.lookahead == TS_DECODE_ERROR) { - current_lookahead_end_byte++; - } - - if (current_lookahead_end_byte > *lookahead_end_byte) { - *lookahead_end_byte = current_lookahead_end_byte; - } -} - -void ts_lexer_advance_to_end(Lexer *self) { - while (self->chunk) { - ts_lexer__advance(&self->data, false); - } -} - -void ts_lexer_mark_end(Lexer *self) { - ts_lexer__mark_end(&self->data); -} - -bool ts_lexer_set_included_ranges( - Lexer *self, - const TSRange *ranges, - uint32_t count -) { - if (count == 0 || !ranges) { - ranges = &DEFAULT_RANGE; - count = 1; - } else { - uint32_t previous_byte = 0; - for (unsigned i = 0; i < count; i++) { - const TSRange *range = &ranges[i]; - if ( - range->start_byte < previous_byte || - range->end_byte < range->start_byte - ) return false; - previous_byte = range->end_byte; - } - } - - size_t size = count * sizeof(TSRange); - self->included_ranges = ts_realloc(self->included_ranges, size); - memcpy(self->included_ranges, ranges, size); - self->included_range_count = count; - ts_lexer_goto(self, self->current_position); - return true; -} - -TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) { - *count = self->included_range_count; - return self->included_ranges; -} - -#undef LOG diff --git a/src/tree_sitter/lexer.h b/src/tree_sitter/lexer.h deleted file mode 100644 index 5e39294529..0000000000 --- a/src/tree_sitter/lexer.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef TREE_SITTER_LEXER_H_ -#define TREE_SITTER_LEXER_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "./length.h" -#include "./subtree.h" -#include "tree_sitter/api.h" -#include "tree_sitter/parser.h" - -typedef struct { - TSLexer data; - Length current_position; - Length token_start_position; - Length token_end_position; - - TSRange *included_ranges; - size_t included_range_count; - size_t current_included_range_index; - - const char *chunk; - uint32_t chunk_start; - uint32_t chunk_size; - uint32_t lookahead_size; - - TSInput input; - 
TSLogger logger; - char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; -} Lexer; - -void ts_lexer_init(Lexer *); -void ts_lexer_delete(Lexer *); -void ts_lexer_set_input(Lexer *, TSInput); -void ts_lexer_reset(Lexer *, Length); -void ts_lexer_start(Lexer *); -void ts_lexer_finish(Lexer *, uint32_t *); -void ts_lexer_advance_to_end(Lexer *); -void ts_lexer_mark_end(Lexer *); -bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); -TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_LEXER_H_ diff --git a/src/tree_sitter/lib.c b/src/tree_sitter/lib.c deleted file mode 100644 index 289d32f4c5..0000000000 --- a/src/tree_sitter/lib.c +++ /dev/null @@ -1,17 +0,0 @@ -// The Tree-sitter library can be built by compiling this one source file. -// -// The following directories must be added to the include path: -// - include - -#define _POSIX_C_SOURCE 200112L - -#include "./get_changed_ranges.c" -#include "./language.c" -#include "./lexer.c" -#include "./node.c" -#include "./parser.c" -#include "./query.c" -#include "./stack.c" -#include "./subtree.c" -#include "./tree_cursor.c" -#include "./tree.c" diff --git a/src/tree_sitter/node.c b/src/tree_sitter/node.c deleted file mode 100644 index 576f3ef38e..0000000000 --- a/src/tree_sitter/node.c +++ /dev/null @@ -1,677 +0,0 @@ -#include <stdbool.h> -#include "./subtree.h" -#include "./tree.h" -#include "./language.h" - -typedef struct { - Subtree parent; - const TSTree *tree; - Length position; - uint32_t child_index; - uint32_t structural_child_index; - const TSSymbol *alias_sequence; -} NodeChildIterator; - -// TSNode - constructors - -TSNode ts_node_new( - const TSTree *tree, - const Subtree *subtree, - Length position, - TSSymbol alias -) { - return (TSNode) { - {position.bytes, position.extent.row, position.extent.column, alias}, - subtree, - tree, - }; -} - -static inline TSNode ts_node__null(void) { - return 
ts_node_new(NULL, NULL, length_zero(), 0); -} - -// TSNode - accessors - -uint32_t ts_node_start_byte(TSNode self) { - return self.context[0]; -} - -TSPoint ts_node_start_point(TSNode self) { - return (TSPoint) {self.context[1], self.context[2]}; -} - -static inline uint32_t ts_node__alias(const TSNode *self) { - return self->context[3]; -} - -static inline Subtree ts_node__subtree(TSNode self) { - return *(const Subtree *)self.id; -} - -// NodeChildIterator - -static inline NodeChildIterator ts_node_iterate_children(const TSNode *node) { - Subtree subtree = ts_node__subtree(*node); - if (ts_subtree_child_count(subtree) == 0) { - return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL}; - } - const TSSymbol *alias_sequence = ts_language_alias_sequence( - node->tree->language, - subtree.ptr->production_id - ); - return (NodeChildIterator) { - .tree = node->tree, - .parent = subtree, - .position = {ts_node_start_byte(*node), ts_node_start_point(*node)}, - .child_index = 0, - .structural_child_index = 0, - .alias_sequence = alias_sequence, - }; -} - -static inline bool ts_node_child_iterator_done(NodeChildIterator *self) { - return self->child_index == self->parent.ptr->child_count; -} - -static inline bool ts_node_child_iterator_next( - NodeChildIterator *self, - TSNode *result -) { - if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false; - const Subtree *child = &self->parent.ptr->children[self->child_index]; - TSSymbol alias_symbol = 0; - if (!ts_subtree_extra(*child)) { - if (self->alias_sequence) { - alias_symbol = self->alias_sequence[self->structural_child_index]; - } - self->structural_child_index++; - } - if (self->child_index > 0) { - self->position = length_add(self->position, ts_subtree_padding(*child)); - } - *result = ts_node_new( - self->tree, - child, - self->position, - alias_symbol - ); - self->position = length_add(self->position, ts_subtree_size(*child)); - self->child_index++; - return true; -} - -// 
TSNode - private - -static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) { - Subtree tree = ts_node__subtree(self); - if (include_anonymous) { - return ts_subtree_visible(tree) || ts_node__alias(&self); - } else { - TSSymbol alias = ts_node__alias(&self); - if (alias) { - return ts_language_symbol_metadata(self.tree->language, alias).named; - } else { - return ts_subtree_visible(tree) && ts_subtree_named(tree); - } - } -} - -static inline uint32_t ts_node__relevant_child_count( - TSNode self, - bool include_anonymous -) { - Subtree tree = ts_node__subtree(self); - if (ts_subtree_child_count(tree) > 0) { - if (include_anonymous) { - return tree.ptr->visible_child_count; - } else { - return tree.ptr->named_child_count; - } - } else { - return 0; - } -} - -static inline TSNode ts_node__child( - TSNode self, - uint32_t child_index, - bool include_anonymous -) { - TSNode result = self; - bool did_descend = true; - - while (did_descend) { - did_descend = false; - - TSNode child; - uint32_t index = 0; - NodeChildIterator iterator = ts_node_iterate_children(&result); - while (ts_node_child_iterator_next(&iterator, &child)) { - if (ts_node__is_relevant(child, include_anonymous)) { - if (index == child_index) { - if (ts_node__is_relevant(self, true)) { - ts_tree_set_cached_parent(self.tree, &child, &self); - } - return child; - } - index++; - } else { - uint32_t grandchild_index = child_index - index; - uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous); - if (grandchild_index < grandchild_count) { - did_descend = true; - result = child; - child_index = grandchild_index; - break; - } - index += grandchild_count; - } - } - } - - return ts_node__null(); -} - -static bool ts_subtree_has_trailing_empty_descendant( - Subtree self, - Subtree other -) { - for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) { - Subtree child = self.ptr->children[i]; - if (ts_subtree_total_bytes(child) > 0) break; - if 
(child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) { - return true; - } - } - return false; -} - -static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) { - Subtree self_subtree = ts_node__subtree(self); - bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0; - uint32_t target_end_byte = ts_node_end_byte(self); - - TSNode node = ts_node_parent(self); - TSNode earlier_node = ts_node__null(); - bool earlier_node_is_relevant = false; - - while (!ts_node_is_null(node)) { - TSNode earlier_child = ts_node__null(); - bool earlier_child_is_relevant = false; - bool found_child_containing_target = false; - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - if (child.id == self.id) break; - if (iterator.position.bytes > target_end_byte) { - found_child_containing_target = true; - break; - } - - if (iterator.position.bytes == target_end_byte && - (!self_is_empty || - ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) { - found_child_containing_target = true; - break; - } - - if (ts_node__is_relevant(child, include_anonymous)) { - earlier_child = child; - earlier_child_is_relevant = true; - } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { - earlier_child = child; - earlier_child_is_relevant = false; - } - } - - if (found_child_containing_target) { - if (!ts_node_is_null(earlier_child)) { - earlier_node = earlier_child; - earlier_node_is_relevant = earlier_child_is_relevant; - } - node = child; - } else if (earlier_child_is_relevant) { - return earlier_child; - } else if (!ts_node_is_null(earlier_child)) { - node = earlier_child; - } else if (earlier_node_is_relevant) { - return earlier_node; - } else { - node = earlier_node; - } - } - - return ts_node__null(); -} - -static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) { - uint32_t 
target_end_byte = ts_node_end_byte(self); - - TSNode node = ts_node_parent(self); - TSNode later_node = ts_node__null(); - bool later_node_is_relevant = false; - - while (!ts_node_is_null(node)) { - TSNode later_child = ts_node__null(); - bool later_child_is_relevant = false; - TSNode child_containing_target = ts_node__null(); - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - if (iterator.position.bytes < target_end_byte) continue; - if (ts_node_start_byte(child) <= ts_node_start_byte(self)) { - if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) { - child_containing_target = child; - } - } else if (ts_node__is_relevant(child, include_anonymous)) { - later_child = child; - later_child_is_relevant = true; - break; - } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { - later_child = child; - later_child_is_relevant = false; - break; - } - } - - if (!ts_node_is_null(child_containing_target)) { - if (!ts_node_is_null(later_child)) { - later_node = later_child; - later_node_is_relevant = later_child_is_relevant; - } - node = child_containing_target; - } else if (later_child_is_relevant) { - return later_child; - } else if (!ts_node_is_null(later_child)) { - node = later_child; - } else if (later_node_is_relevant) { - return later_node; - } else { - node = later_node; - } - } - - return ts_node__null(); -} - -static inline TSNode ts_node__first_child_for_byte( - TSNode self, - uint32_t goal, - bool include_anonymous -) { - TSNode node = self; - bool did_descend = true; - - while (did_descend) { - did_descend = false; - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - if (ts_node_end_byte(child) > goal) { - if (ts_node__is_relevant(child, include_anonymous)) { - return child; - } else if (ts_node_child_count(child) > 0) { - did_descend = true; - node = 
child; - break; - } - } - } - } - - return ts_node__null(); -} - -static inline TSNode ts_node__descendant_for_byte_range( - TSNode self, - uint32_t range_start, - uint32_t range_end, - bool include_anonymous -) { - TSNode node = self; - TSNode last_visible_node = self; - - bool did_descend = true; - while (did_descend) { - did_descend = false; - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - uint32_t node_end = iterator.position.bytes; - - // The end of this node must extend far enough forward to touch - // the end of the range and exceed the start of the range. - if (node_end < range_end) continue; - if (node_end <= range_start) continue; - - // The start of this node must extend far enough backward to - // touch the start of the range. - if (range_start < ts_node_start_byte(child)) break; - - node = child; - if (ts_node__is_relevant(node, include_anonymous)) { - ts_tree_set_cached_parent(self.tree, &child, &last_visible_node); - last_visible_node = node; - } - did_descend = true; - break; - } - } - - return last_visible_node; -} - -static inline TSNode ts_node__descendant_for_point_range( - TSNode self, - TSPoint range_start, - TSPoint range_end, - bool include_anonymous -) { - TSNode node = self; - TSNode last_visible_node = self; - - bool did_descend = true; - while (did_descend) { - did_descend = false; - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - TSPoint node_end = iterator.position.extent; - - // The end of this node must extend far enough forward to touch - // the end of the range and exceed the start of the range. - if (point_lt(node_end, range_end)) continue; - if (point_lte(node_end, range_start)) continue; - - // The start of this node must extend far enough backward to - // touch the start of the range. 
- if (point_lt(range_start, ts_node_start_point(child))) break; - - node = child; - if (ts_node__is_relevant(node, include_anonymous)) { - ts_tree_set_cached_parent(self.tree, &child, &last_visible_node); - last_visible_node = node; - } - did_descend = true; - break; - } - } - - return last_visible_node; -} - -// TSNode - public - -uint32_t ts_node_end_byte(TSNode self) { - return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes; -} - -TSPoint ts_node_end_point(TSNode self) { - return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent); -} - -TSSymbol ts_node_symbol(TSNode self) { - TSSymbol symbol = ts_node__alias(&self); - if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); - return ts_language_public_symbol(self.tree->language, symbol); -} - -const char *ts_node_type(TSNode self) { - TSSymbol symbol = ts_node__alias(&self); - if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); - return ts_language_symbol_name(self.tree->language, symbol); -} - -char *ts_node_string(TSNode self) { - return ts_subtree_string(ts_node__subtree(self), self.tree->language, false); -} - -bool ts_node_eq(TSNode self, TSNode other) { - return self.tree == other.tree && self.id == other.id; -} - -bool ts_node_is_null(TSNode self) { - return self.id == 0; -} - -bool ts_node_is_extra(TSNode self) { - return ts_subtree_extra(ts_node__subtree(self)); -} - -bool ts_node_is_named(TSNode self) { - TSSymbol alias = ts_node__alias(&self); - return alias - ? 
ts_language_symbol_metadata(self.tree->language, alias).named - : ts_subtree_named(ts_node__subtree(self)); -} - -bool ts_node_is_missing(TSNode self) { - return ts_subtree_missing(ts_node__subtree(self)); -} - -bool ts_node_has_changes(TSNode self) { - return ts_subtree_has_changes(ts_node__subtree(self)); -} - -bool ts_node_has_error(TSNode self) { - return ts_subtree_error_cost(ts_node__subtree(self)) > 0; -} - -TSNode ts_node_parent(TSNode self) { - TSNode node = ts_tree_get_cached_parent(self.tree, &self); - if (node.id) return node; - - node = ts_tree_root_node(self.tree); - uint32_t end_byte = ts_node_end_byte(self); - if (node.id == self.id) return ts_node__null(); - - TSNode last_visible_node = node; - bool did_descend = true; - while (did_descend) { - did_descend = false; - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { - if ( - ts_node_start_byte(child) > ts_node_start_byte(self) || - child.id == self.id - ) break; - if (iterator.position.bytes >= end_byte) { - node = child; - if (ts_node__is_relevant(child, true)) { - ts_tree_set_cached_parent(self.tree, &node, &last_visible_node); - last_visible_node = node; - } - did_descend = true; - break; - } - } - } - - return last_visible_node; -} - -TSNode ts_node_child(TSNode self, uint32_t child_index) { - return ts_node__child(self, child_index, true); -} - -TSNode ts_node_named_child(TSNode self, uint32_t child_index) { - return ts_node__child(self, child_index, false); -} - -TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) { -recur: - if (!field_id || ts_node_child_count(self) == 0) return ts_node__null(); - - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map( - self.tree->language, - ts_node__subtree(self).ptr->production_id, - &field_map, - &field_map_end - ); - if (field_map == field_map_end) return ts_node__null(); - - // The field mappings are sorted by their field id. 
Scan all - // the mappings to find the ones for the given field id. - while (field_map->field_id < field_id) { - field_map++; - if (field_map == field_map_end) return ts_node__null(); - } - while (field_map_end[-1].field_id > field_id) { - field_map_end--; - if (field_map == field_map_end) return ts_node__null(); - } - - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&self); - while (ts_node_child_iterator_next(&iterator, &child)) { - if (!ts_subtree_extra(ts_node__subtree(child))) { - uint32_t index = iterator.structural_child_index - 1; - if (index < field_map->child_index) continue; - - // Hidden nodes' fields are "inherited" by their visible parent. - if (field_map->inherited) { - - // If this is the *last* possible child node for this field, - // then perform a tail call to avoid recursion. - if (field_map + 1 == field_map_end) { - self = child; - goto recur; - } - - // Otherwise, descend into this child, but if it doesn't contain - // the field, continue searching subsequent children. - else { - TSNode result = ts_node_child_by_field_id(child, field_id); - if (result.id) return result; - field_map++; - if (field_map == field_map_end) return ts_node__null(); - } - } - - else if (ts_node__is_relevant(child, true)) { - return child; - } - - // If the field refers to a hidden node, return its first visible - // child. 
- else { - return ts_node_child(child, 0); - } - } - } - - return ts_node__null(); -} - -TSNode ts_node_child_by_field_name( - TSNode self, - const char *name, - uint32_t name_length -) { - TSFieldId field_id = ts_language_field_id_for_name( - self.tree->language, - name, - name_length - ); - return ts_node_child_by_field_id(self, field_id); -} - -uint32_t ts_node_child_count(TSNode self) { - Subtree tree = ts_node__subtree(self); - if (ts_subtree_child_count(tree) > 0) { - return tree.ptr->visible_child_count; - } else { - return 0; - } -} - -uint32_t ts_node_named_child_count(TSNode self) { - Subtree tree = ts_node__subtree(self); - if (ts_subtree_child_count(tree) > 0) { - return tree.ptr->named_child_count; - } else { - return 0; - } -} - -TSNode ts_node_next_sibling(TSNode self) { - return ts_node__next_sibling(self, true); -} - -TSNode ts_node_next_named_sibling(TSNode self) { - return ts_node__next_sibling(self, false); -} - -TSNode ts_node_prev_sibling(TSNode self) { - return ts_node__prev_sibling(self, true); -} - -TSNode ts_node_prev_named_sibling(TSNode self) { - return ts_node__prev_sibling(self, false); -} - -TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) { - return ts_node__first_child_for_byte(self, byte, true); -} - -TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) { - return ts_node__first_child_for_byte(self, byte, false); -} - -TSNode ts_node_descendant_for_byte_range( - TSNode self, - uint32_t start, - uint32_t end -) { - return ts_node__descendant_for_byte_range(self, start, end, true); -} - -TSNode ts_node_named_descendant_for_byte_range( - TSNode self, - uint32_t start, - uint32_t end -) { - return ts_node__descendant_for_byte_range(self, start, end, false); -} - -TSNode ts_node_descendant_for_point_range( - TSNode self, - TSPoint start, - TSPoint end -) { - return ts_node__descendant_for_point_range(self, start, end, true); -} - -TSNode ts_node_named_descendant_for_point_range( - TSNode self, - TSPoint 
start, - TSPoint end -) { - return ts_node__descendant_for_point_range(self, start, end, false); -} - -void ts_node_edit(TSNode *self, const TSInputEdit *edit) { - uint32_t start_byte = ts_node_start_byte(*self); - TSPoint start_point = ts_node_start_point(*self); - - if (start_byte >= edit->old_end_byte) { - start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte); - start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point)); - } else if (start_byte > edit->start_byte) { - start_byte = edit->new_end_byte; - start_point = edit->new_end_point; - } - - self->context[0] = start_byte; - self->context[1] = start_point.row; - self->context[2] = start_point.column; -} diff --git a/src/tree_sitter/parser.c b/src/tree_sitter/parser.c deleted file mode 100644 index 79cad797a0..0000000000 --- a/src/tree_sitter/parser.c +++ /dev/null @@ -1,1906 +0,0 @@ -#include <time.h> -#include <assert.h> -#include <stdio.h> -#include <limits.h> -#include <stdbool.h> -#include "tree_sitter/api.h" -#include "./alloc.h" -#include "./array.h" -#include "./atomic.h" -#include "./clock.h" -#include "./error_costs.h" -#include "./get_changed_ranges.h" -#include "./language.h" -#include "./length.h" -#include "./lexer.h" -#include "./reduce_action.h" -#include "./reusable_node.h" -#include "./stack.h" -#include "./subtree.h" -#include "./tree.h" - -#define LOG(...) 
\ - if (self->lexer.logger.log || self->dot_graph_file) { \ - snprintf(self->lexer.debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ - ts_parser__log(self); \ - } - -#define LOG_STACK() \ - if (self->dot_graph_file) { \ - ts_stack_print_dot_graph(self->stack, self->language, self->dot_graph_file); \ - fputs("\n\n", self->dot_graph_file); \ - } - -#define LOG_TREE(tree) \ - if (self->dot_graph_file) { \ - ts_subtree_print_dot_graph(tree, self->language, self->dot_graph_file); \ - fputs("\n", self->dot_graph_file); \ - } - -#define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) - -#define TREE_NAME(tree) SYM_NAME(ts_subtree_symbol(tree)) - -static const unsigned MAX_VERSION_COUNT = 6; -static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4; -static const unsigned MAX_SUMMARY_DEPTH = 16; -static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; -static const unsigned OP_COUNT_PER_TIMEOUT_CHECK = 100; - -typedef struct { - Subtree token; - Subtree last_external_token; - uint32_t byte_index; -} TokenCache; - -struct TSParser { - Lexer lexer; - Stack *stack; - SubtreePool tree_pool; - const TSLanguage *language; - ReduceActionSet reduce_actions; - Subtree finished_tree; - SubtreeHeapData scratch_tree_data; - MutableSubtree scratch_tree; - TokenCache token_cache; - ReusableNode reusable_node; - void *external_scanner_payload; - FILE *dot_graph_file; - TSClock end_clock; - TSDuration timeout_duration; - unsigned accept_count; - unsigned operation_count; - const volatile size_t *cancellation_flag; - Subtree old_tree; - TSRangeArray included_range_differences; - unsigned included_range_difference_index; -}; - -typedef struct { - unsigned cost; - unsigned node_count; - int dynamic_precedence; - bool is_in_error; -} ErrorStatus; - -typedef enum { - ErrorComparisonTakeLeft, - ErrorComparisonPreferLeft, - ErrorComparisonNone, - ErrorComparisonPreferRight, - ErrorComparisonTakeRight, -} ErrorComparison; - -typedef struct { - 
const char *string; - uint32_t length; -} TSStringInput; - -// StringInput - -static const char *ts_string_input_read( - void *_self, - uint32_t byte, - TSPoint pt, - uint32_t *length -) { - (void)pt; - TSStringInput *self = (TSStringInput *)_self; - if (byte >= self->length) { - *length = 0; - return ""; - } else { - *length = self->length - byte; - return self->string + byte; - } -} - -// Parser - Private - -static void ts_parser__log(TSParser *self) { - if (self->lexer.logger.log) { - self->lexer.logger.log( - self->lexer.logger.payload, - TSLogTypeParse, - self->lexer.debug_buffer - ); - } - - if (self->dot_graph_file) { - fprintf(self->dot_graph_file, "graph {\nlabel=\""); - for (char *c = &self->lexer.debug_buffer[0]; *c != 0; c++) { - if (*c == '"') fputc('\\', self->dot_graph_file); - fputc(*c, self->dot_graph_file); - } - fprintf(self->dot_graph_file, "\"\n}\n\n"); - } -} - -static bool ts_parser__breakdown_top_of_stack( - TSParser *self, - StackVersion version -) { - bool did_break_down = false; - bool pending = false; - - do { - StackSliceArray pop = ts_stack_pop_pending(self->stack, version); - if (!pop.size) break; - - did_break_down = true; - pending = false; - for (uint32_t i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; - TSStateId state = ts_stack_state(self->stack, slice.version); - Subtree parent = *array_front(&slice.subtrees); - - for (uint32_t j = 0, n = ts_subtree_child_count(parent); j < n; j++) { - Subtree child = parent.ptr->children[j]; - pending = ts_subtree_child_count(child) > 0; - - if (ts_subtree_is_error(child)) { - state = ERROR_STATE; - } else if (!ts_subtree_extra(child)) { - state = ts_language_next_state(self->language, state, ts_subtree_symbol(child)); - } - - ts_subtree_retain(child); - ts_stack_push(self->stack, slice.version, child, pending, state); - } - - for (uint32_t j = 1; j < slice.subtrees.size; j++) { - Subtree tree = slice.subtrees.contents[j]; - ts_stack_push(self->stack, slice.version, tree, 
false, state); - } - - ts_subtree_release(&self->tree_pool, parent); - array_delete(&slice.subtrees); - - LOG("breakdown_top_of_stack tree:%s", TREE_NAME(parent)); - LOG_STACK(); - } - } while (pending); - - return did_break_down; -} - -static void ts_parser__breakdown_lookahead( - TSParser *self, - Subtree *lookahead, - TSStateId state, - ReusableNode *reusable_node -) { - bool did_descend = false; - Subtree tree = reusable_node_tree(reusable_node); - while (ts_subtree_child_count(tree) > 0 && ts_subtree_parse_state(tree) != state) { - LOG("state_mismatch sym:%s", TREE_NAME(tree)); - reusable_node_descend(reusable_node); - tree = reusable_node_tree(reusable_node); - did_descend = true; - } - - if (did_descend) { - ts_subtree_release(&self->tree_pool, *lookahead); - *lookahead = tree; - ts_subtree_retain(*lookahead); - } -} - -static ErrorComparison ts_parser__compare_versions( - TSParser *self, - ErrorStatus a, - ErrorStatus b -) { - (void)self; - if (!a.is_in_error && b.is_in_error) { - if (a.cost < b.cost) { - return ErrorComparisonTakeLeft; - } else { - return ErrorComparisonPreferLeft; - } - } - - if (a.is_in_error && !b.is_in_error) { - if (b.cost < a.cost) { - return ErrorComparisonTakeRight; - } else { - return ErrorComparisonPreferRight; - } - } - - if (a.cost < b.cost) { - if ((b.cost - a.cost) * (1 + a.node_count) > MAX_COST_DIFFERENCE) { - return ErrorComparisonTakeLeft; - } else { - return ErrorComparisonPreferLeft; - } - } - - if (b.cost < a.cost) { - if ((a.cost - b.cost) * (1 + b.node_count) > MAX_COST_DIFFERENCE) { - return ErrorComparisonTakeRight; - } else { - return ErrorComparisonPreferRight; - } - } - - if (a.dynamic_precedence > b.dynamic_precedence) return ErrorComparisonPreferLeft; - if (b.dynamic_precedence > a.dynamic_precedence) return ErrorComparisonPreferRight; - return ErrorComparisonNone; -} - -static ErrorStatus ts_parser__version_status( - TSParser *self, - StackVersion version -) { - unsigned cost = 
ts_stack_error_cost(self->stack, version); - bool is_paused = ts_stack_is_paused(self->stack, version); - if (is_paused) cost += ERROR_COST_PER_SKIPPED_TREE; - return (ErrorStatus) { - .cost = cost, - .node_count = ts_stack_node_count_since_error(self->stack, version), - .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), - .is_in_error = is_paused || ts_stack_state(self->stack, version) == ERROR_STATE - }; -} - -static bool ts_parser__better_version_exists( - TSParser *self, - StackVersion version, - bool is_in_error, - unsigned cost -) { - if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) <= cost) { - return true; - } - - Length position = ts_stack_position(self->stack, version); - ErrorStatus status = { - .cost = cost, - .is_in_error = is_in_error, - .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), - .node_count = ts_stack_node_count_since_error(self->stack, version), - }; - - for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { - if (i == version || - !ts_stack_is_active(self->stack, i) || - ts_stack_position(self->stack, i).bytes < position.bytes) continue; - ErrorStatus status_i = ts_parser__version_status(self, i); - switch (ts_parser__compare_versions(self, status, status_i)) { - case ErrorComparisonTakeRight: - return true; - case ErrorComparisonPreferRight: - if (ts_stack_can_merge(self->stack, i, version)) return true; - break; - default: - break; - } - } - - return false; -} - -static void ts_parser__restore_external_scanner( - TSParser *self, - Subtree external_token -) { - if (external_token.ptr) { - self->language->external_scanner.deserialize( - self->external_scanner_payload, - ts_external_scanner_state_data(&external_token.ptr->external_scanner_state), - external_token.ptr->external_scanner_state.length - ); - } else { - self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); - } -} - -static bool 
ts_parser__can_reuse_first_leaf( - TSParser *self, - TSStateId state, - Subtree tree, - TableEntry *table_entry -) { - TSLexMode current_lex_mode = self->language->lex_modes[state]; - TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree); - TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); - TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; - - // At the end of a non-terminal extra node, the lexer normally returns - // NULL, which indicates that the parser should look for a reduce action - // at symbol `0`. Avoid reusing tokens in this situation to ensure that - // the same thing happens when incrementally reparsing. - if (current_lex_mode.lex_state == (uint16_t)(-1)) return false; - - // If the token was created in a state with the same set of lookaheads, it is reusable. - if ( - table_entry->action_count > 0 && - memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexMode)) == 0 && - ( - leaf_symbol != self->language->keyword_capture_token || - (!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state) - ) - ) return true; - - // Empty tokens are not reusable in states with different lookaheads. - if (ts_subtree_size(tree).bytes == 0 && leaf_symbol != ts_builtin_sym_end) return false; - - // If the current state allows external tokens or other tokens that conflict with this - // token, this token is not reusable. 
- return current_lex_mode.external_lex_state == 0 && table_entry->is_reusable; -} - -static Subtree ts_parser__lex( - TSParser *self, - StackVersion version, - TSStateId parse_state -) { - TSLexMode lex_mode = self->language->lex_modes[parse_state]; - if (lex_mode.lex_state == (uint16_t)-1) { - LOG("no_lookahead_after_non_terminal_extra"); - return NULL_SUBTREE; - } - - Length start_position = ts_stack_position(self->stack, version); - Subtree external_token = ts_stack_last_external_token(self->stack, version); - const bool *valid_external_tokens = ts_language_enabled_external_tokens( - self->language, - lex_mode.external_lex_state - ); - - bool found_external_token = false; - bool error_mode = parse_state == ERROR_STATE; - bool skipped_error = false; - int32_t first_error_character = 0; - Length error_start_position = length_zero(); - Length error_end_position = length_zero(); - uint32_t lookahead_end_byte = 0; - ts_lexer_reset(&self->lexer, start_position); - - for (;;) { - Length current_position = self->lexer.current_position; - - if (valid_external_tokens) { - LOG( - "lex_external state:%d, row:%u, column:%u", - lex_mode.external_lex_state, - current_position.extent.row + 1, - current_position.extent.column - ); - ts_lexer_start(&self->lexer); - ts_parser__restore_external_scanner(self, external_token); - bool found_token = self->language->external_scanner.scan( - self->external_scanner_payload, - &self->lexer.data, - valid_external_tokens - ); - ts_lexer_finish(&self->lexer, &lookahead_end_byte); - - // Zero-length external tokens are generally allowed, but they're not - // allowed right after a syntax error. This is for two reasons: - // 1. After a syntax error, the lexer is looking for any possible token, - // as opposed to the specific set of tokens that are valid in some - // parse state. In this situation, it's very easy for an external - // scanner to produce unwanted zero-length tokens. - // 2. 
The parser sometimes inserts *missing* tokens to recover from - // errors. These tokens are also zero-length. If we allow more - // zero-length tokens to be created after missing tokens, it - // can lead to infinite loops. Forbidding zero-length tokens - // right at the point of error recovery is a conservative strategy - // for preventing this kind of infinite loop. - if (found_token && ( - self->lexer.token_end_position.bytes > current_position.bytes || - (!error_mode && ts_stack_has_advanced_since_error(self->stack, version)) - )) { - found_external_token = true; - break; - } - - ts_lexer_reset(&self->lexer, current_position); - } - - LOG( - "lex_internal state:%d, row:%u, column:%u", - lex_mode.lex_state, - current_position.extent.row + 1, - current_position.extent.column - ); - ts_lexer_start(&self->lexer); - bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); - ts_lexer_finish(&self->lexer, &lookahead_end_byte); - if (found_token) break; - - if (!error_mode) { - error_mode = true; - lex_mode = self->language->lex_modes[ERROR_STATE]; - valid_external_tokens = ts_language_enabled_external_tokens( - self->language, - lex_mode.external_lex_state - ); - ts_lexer_reset(&self->lexer, start_position); - continue; - } - - if (!skipped_error) { - LOG("skip_unrecognized_character"); - skipped_error = true; - error_start_position = self->lexer.token_start_position; - error_end_position = self->lexer.token_start_position; - first_error_character = self->lexer.data.lookahead; - } - - if (self->lexer.current_position.bytes == error_end_position.bytes) { - if (self->lexer.data.eof(&self->lexer.data)) { - self->lexer.data.result_symbol = ts_builtin_sym_error; - break; - } - self->lexer.data.advance(&self->lexer.data, false); - } - - error_end_position = self->lexer.current_position; - } - - Subtree result; - if (skipped_error) { - Length padding = length_sub(error_start_position, start_position); - Length size = length_sub(error_end_position, 
error_start_position); - uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes; - result = ts_subtree_new_error( - &self->tree_pool, - first_error_character, - padding, - size, - lookahead_bytes, - parse_state, - self->language - ); - - LOG( - "lexed_lookahead sym:%s, size:%u, character:'%c'", - SYM_NAME(ts_subtree_symbol(result)), - ts_subtree_total_size(result).bytes, - first_error_character - ); - } else { - if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { - self->lexer.token_start_position = self->lexer.token_end_position; - } - - bool is_keyword = false; - TSSymbol symbol = self->lexer.data.result_symbol; - Length padding = length_sub(self->lexer.token_start_position, start_position); - Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position); - uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes; - - if (found_external_token) { - symbol = self->language->external_scanner.symbol_map[symbol]; - } else if (symbol == self->language->keyword_capture_token && symbol != 0) { - uint32_t end_byte = self->lexer.token_end_position.bytes; - ts_lexer_reset(&self->lexer, self->lexer.token_start_position); - ts_lexer_start(&self->lexer); - if ( - self->language->keyword_lex_fn(&self->lexer.data, 0) && - self->lexer.token_end_position.bytes == end_byte && - ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) - ) { - is_keyword = true; - symbol = self->lexer.data.result_symbol; - } - } - - result = ts_subtree_new_leaf( - &self->tree_pool, - symbol, - padding, - size, - lookahead_bytes, - parse_state, - found_external_token, - is_keyword, - self->language - ); - - if (found_external_token) { - unsigned length = self->language->external_scanner.serialize( - self->external_scanner_payload, - self->lexer.debug_buffer - ); - ts_external_scanner_state_init( - &((SubtreeHeapData *)result.ptr)->external_scanner_state, - 
self->lexer.debug_buffer, - length - ); - } - - LOG( - "lexed_lookahead sym:%s, size:%u", - SYM_NAME(ts_subtree_symbol(result)), - ts_subtree_total_size(result).bytes - ); - } - - return result; -} - -static Subtree ts_parser__get_cached_token( - TSParser *self, - TSStateId state, - size_t position, - Subtree last_external_token, - TableEntry *table_entry -) { - TokenCache *cache = &self->token_cache; - if ( - cache->token.ptr && cache->byte_index == position && - ts_subtree_external_scanner_state_eq(cache->last_external_token, last_external_token) - ) { - ts_language_table_entry(self->language, state, ts_subtree_symbol(cache->token), table_entry); - if (ts_parser__can_reuse_first_leaf(self, state, cache->token, table_entry)) { - ts_subtree_retain(cache->token); - return cache->token; - } - } - return NULL_SUBTREE; -} - -static void ts_parser__set_cached_token( - TSParser *self, - size_t byte_index, - Subtree last_external_token, - Subtree token -) { - TokenCache *cache = &self->token_cache; - if (token.ptr) ts_subtree_retain(token); - if (last_external_token.ptr) ts_subtree_retain(last_external_token); - if (cache->token.ptr) ts_subtree_release(&self->tree_pool, cache->token); - if (cache->last_external_token.ptr) ts_subtree_release(&self->tree_pool, cache->last_external_token); - cache->token = token; - cache->byte_index = byte_index; - cache->last_external_token = last_external_token; -} - -static bool ts_parser__has_included_range_difference( - const TSParser *self, - uint32_t start_position, - uint32_t end_position -) { - return ts_range_array_intersects( - &self->included_range_differences, - self->included_range_difference_index, - start_position, - end_position - ); -} - -static Subtree ts_parser__reuse_node( - TSParser *self, - StackVersion version, - TSStateId *state, - uint32_t position, - Subtree last_external_token, - TableEntry *table_entry -) { - Subtree result; - while ((result = reusable_node_tree(&self->reusable_node)).ptr) { - uint32_t 
byte_offset = reusable_node_byte_offset(&self->reusable_node); - uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result); - - // Do not reuse an EOF node if the included ranges array has changes - // later on in the file. - if (ts_subtree_is_eof(result)) end_byte_offset = UINT32_MAX; - - if (byte_offset > position) { - LOG("before_reusable_node symbol:%s", TREE_NAME(result)); - break; - } - - if (byte_offset < position) { - LOG("past_reusable_node symbol:%s", TREE_NAME(result)); - if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node)) { - reusable_node_advance(&self->reusable_node); - } - continue; - } - - if (!ts_subtree_external_scanner_state_eq(self->reusable_node.last_external_token, last_external_token)) { - LOG("reusable_node_has_different_external_scanner_state symbol:%s", TREE_NAME(result)); - reusable_node_advance(&self->reusable_node); - continue; - } - - const char *reason = NULL; - if (ts_subtree_has_changes(result)) { - reason = "has_changes"; - } else if (ts_subtree_is_error(result)) { - reason = "is_error"; - } else if (ts_subtree_missing(result)) { - reason = "is_missing"; - } else if (ts_subtree_is_fragile(result)) { - reason = "is_fragile"; - } else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset)) { - reason = "contains_different_included_range"; - } - - if (reason) { - LOG("cant_reuse_node_%s tree:%s", reason, TREE_NAME(result)); - if (!reusable_node_descend(&self->reusable_node)) { - reusable_node_advance(&self->reusable_node); - ts_parser__breakdown_top_of_stack(self, version); - *state = ts_stack_state(self->stack, version); - } - continue; - } - - TSSymbol leaf_symbol = ts_subtree_leaf_symbol(result); - ts_language_table_entry(self->language, *state, leaf_symbol, table_entry); - if (!ts_parser__can_reuse_first_leaf(self, *state, result, table_entry)) { - LOG( - "cant_reuse_node symbol:%s, first_leaf_symbol:%s", - TREE_NAME(result), - SYM_NAME(leaf_symbol) - ); - 
reusable_node_advance_past_leaf(&self->reusable_node); - break; - } - - LOG("reuse_node symbol:%s", TREE_NAME(result)); - ts_subtree_retain(result); - return result; - } - - return NULL_SUBTREE; -} - -static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) { - if (!left.ptr) return true; - if (!right.ptr) return false; - - if (ts_subtree_error_cost(right) < ts_subtree_error_cost(left)) { - LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); - return true; - } - - if (ts_subtree_error_cost(left) < ts_subtree_error_cost(right)) { - LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); - return false; - } - - if (ts_subtree_dynamic_precedence(right) > ts_subtree_dynamic_precedence(left)) { - LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", - TREE_NAME(right), ts_subtree_dynamic_precedence(right), TREE_NAME(left), - ts_subtree_dynamic_precedence(left)); - return true; - } - - if (ts_subtree_dynamic_precedence(left) > ts_subtree_dynamic_precedence(right)) { - LOG("select_higher_precedence symbol:%s, prec:%u, over_symbol:%s, other_prec:%u", - TREE_NAME(left), ts_subtree_dynamic_precedence(left), TREE_NAME(right), - ts_subtree_dynamic_precedence(right)); - return false; - } - - if (ts_subtree_error_cost(left) > 0) return true; - - int comparison = ts_subtree_compare(left, right); - switch (comparison) { - case -1: - LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); - return false; - break; - case 1: - LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(right), TREE_NAME(left)); - return true; - default: - LOG("select_existing symbol:%s, over_symbol:%s", TREE_NAME(left), TREE_NAME(right)); - return false; - } -} - -static void ts_parser__shift( - TSParser *self, - StackVersion version, - TSStateId state, - Subtree lookahead, - bool extra -) { - Subtree subtree_to_push; - if (extra != 
ts_subtree_extra(lookahead)) { - MutableSubtree result = ts_subtree_make_mut(&self->tree_pool, lookahead); - ts_subtree_set_extra(&result); - subtree_to_push = ts_subtree_from_mut(result); - } else { - subtree_to_push = lookahead; - } - - bool is_pending = ts_subtree_child_count(subtree_to_push) > 0; - ts_stack_push(self->stack, version, subtree_to_push, is_pending, state); - if (ts_subtree_has_external_tokens(subtree_to_push)) { - ts_stack_set_last_external_token( - self->stack, version, ts_subtree_last_external_token(subtree_to_push) - ); - } -} - -static bool ts_parser__replace_children( - TSParser *self, - MutableSubtree *tree, - SubtreeArray *children -) { - *self->scratch_tree.ptr = *tree->ptr; - self->scratch_tree.ptr->child_count = 0; - ts_subtree_set_children(self->scratch_tree, children->contents, children->size, self->language); - if (ts_parser__select_tree(self, ts_subtree_from_mut(*tree), ts_subtree_from_mut(self->scratch_tree))) { - *tree->ptr = *self->scratch_tree.ptr; - return true; - } else { - return false; - } -} - -static StackVersion ts_parser__reduce( - TSParser *self, - StackVersion version, - TSSymbol symbol, - uint32_t count, - int dynamic_precedence, - uint16_t production_id, - bool is_fragile, - bool end_of_non_terminal_extra -) { - uint32_t initial_version_count = ts_stack_version_count(self->stack); - - // Pop the given number of nodes from the given version of the parse stack. - // If stack versions have previously merged, then there may be more than one - // path back through the stack. For each path, create a new parent node to - // contain the popped children, and push it onto the stack in place of the - // children. 
- StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); - uint32_t removed_version_count = 0; - for (uint32_t i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; - StackVersion slice_version = slice.version - removed_version_count; - - // This is where new versions are added to the parse stack. The versions - // will all be sorted and truncated at the end of the outer parsing loop. - // Allow the maximum version count to be temporarily exceeded, but only - // by a limited threshold. - if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { - ts_stack_remove_version(self->stack, slice_version); - ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); - removed_version_count++; - while (i + 1 < pop.size) { - StackSlice next_slice = pop.contents[i + 1]; - if (next_slice.version != slice.version) break; - ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); - i++; - } - continue; - } - - // Extra tokens on top of the stack should not be included in this new parent - // node. They will be re-pushed onto the stack after the parent node is - // created and pushed. - SubtreeArray children = slice.subtrees; - while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { - children.size--; - } - - MutableSubtree parent = ts_subtree_new_node(&self->tree_pool, - symbol, &children, production_id, self->language - ); - - // This pop operation may have caused multiple stack versions to collapse - // into one, because they all diverged from a common state. In that case, - // choose one of the arrays of trees to be the parent node's children, and - // delete the rest of the tree arrays. 
- while (i + 1 < pop.size) { - StackSlice next_slice = pop.contents[i + 1]; - if (next_slice.version != slice.version) break; - i++; - - SubtreeArray children = next_slice.subtrees; - while (children.size > 0 && ts_subtree_extra(children.contents[children.size - 1])) { - children.size--; - } - - if (ts_parser__replace_children(self, &parent, &children)) { - ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); - slice = next_slice; - } else { - ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); - } - } - - parent.ptr->dynamic_precedence += dynamic_precedence; - parent.ptr->production_id = production_id; - - TSStateId state = ts_stack_state(self->stack, slice_version); - TSStateId next_state = ts_language_next_state(self->language, state, symbol); - if (end_of_non_terminal_extra && next_state == state) { - parent.ptr->extra = true; - } - if (is_fragile || pop.size > 1 || initial_version_count > 1) { - parent.ptr->fragile_left = true; - parent.ptr->fragile_right = true; - parent.ptr->parse_state = TS_TREE_STATE_NONE; - } else { - parent.ptr->parse_state = state; - } - - // Push the parent node onto the stack, along with any extra tokens that - // were previously on top of the stack. - ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), false, next_state); - for (uint32_t j = parent.ptr->child_count; j < slice.subtrees.size; j++) { - ts_stack_push(self->stack, slice_version, slice.subtrees.contents[j], false, next_state); - } - - for (StackVersion j = 0; j < slice_version; j++) { - if (j == version) continue; - if (ts_stack_merge(self->stack, j, slice_version)) { - removed_version_count++; - break; - } - } - } - - // Return the first new stack version that was created. - return ts_stack_version_count(self->stack) > initial_version_count - ? 
initial_version_count - : STACK_VERSION_NONE; -} - -static void ts_parser__accept( - TSParser *self, - StackVersion version, - Subtree lookahead -) { - assert(ts_subtree_is_eof(lookahead)); - ts_stack_push(self->stack, version, lookahead, false, 1); - - StackSliceArray pop = ts_stack_pop_all(self->stack, version); - for (uint32_t i = 0; i < pop.size; i++) { - SubtreeArray trees = pop.contents[i].subtrees; - - Subtree root = NULL_SUBTREE; - for (uint32_t j = trees.size - 1; j + 1 > 0; j--) { - Subtree child = trees.contents[j]; - if (!ts_subtree_extra(child)) { - assert(!child.data.is_inline); - uint32_t child_count = ts_subtree_child_count(child); - for (uint32_t k = 0; k < child_count; k++) { - ts_subtree_retain(child.ptr->children[k]); - } - array_splice(&trees, j, 1, child_count, child.ptr->children); - root = ts_subtree_from_mut(ts_subtree_new_node( - &self->tree_pool, - ts_subtree_symbol(child), - &trees, - child.ptr->production_id, - self->language - )); - ts_subtree_release(&self->tree_pool, child); - break; - } - } - - assert(root.ptr); - self->accept_count++; - - if (self->finished_tree.ptr) { - if (ts_parser__select_tree(self, self->finished_tree, root)) { - ts_subtree_release(&self->tree_pool, self->finished_tree); - self->finished_tree = root; - } else { - ts_subtree_release(&self->tree_pool, root); - } - } else { - self->finished_tree = root; - } - } - - ts_stack_remove_version(self->stack, pop.contents[0].version); - ts_stack_halt(self->stack, version); -} - -static bool ts_parser__do_all_potential_reductions( - TSParser *self, - StackVersion starting_version, - TSSymbol lookahead_symbol -) { - uint32_t initial_version_count = ts_stack_version_count(self->stack); - - bool can_shift_lookahead_symbol = false; - StackVersion version = starting_version; - for (unsigned i = 0; true; i++) { - uint32_t version_count = ts_stack_version_count(self->stack); - if (version >= version_count) break; - - bool merged = false; - for (StackVersion i = 
initial_version_count; i < version; i++) { - if (ts_stack_merge(self->stack, i, version)) { - merged = true; - break; - } - } - if (merged) continue; - - TSStateId state = ts_stack_state(self->stack, version); - bool has_shift_action = false; - array_clear(&self->reduce_actions); - - TSSymbol first_symbol, end_symbol; - if (lookahead_symbol != 0) { - first_symbol = lookahead_symbol; - end_symbol = lookahead_symbol + 1; - } else { - first_symbol = 1; - end_symbol = self->language->token_count; - } - - for (TSSymbol symbol = first_symbol; symbol < end_symbol; symbol++) { - TableEntry entry; - ts_language_table_entry(self->language, state, symbol, &entry); - for (uint32_t i = 0; i < entry.action_count; i++) { - TSParseAction action = entry.actions[i]; - switch (action.type) { - case TSParseActionTypeShift: - case TSParseActionTypeRecover: - if (!action.params.shift.extra && !action.params.shift.repetition) has_shift_action = true; - break; - case TSParseActionTypeReduce: - if (action.params.reduce.child_count > 0) - ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){ - .symbol = action.params.reduce.symbol, - .count = action.params.reduce.child_count, - .dynamic_precedence = action.params.reduce.dynamic_precedence, - .production_id = action.params.reduce.production_id, - }); - break; - default: - break; - } - } - } - - StackVersion reduction_version = STACK_VERSION_NONE; - for (uint32_t i = 0; i < self->reduce_actions.size; i++) { - ReduceAction action = self->reduce_actions.contents[i]; - - reduction_version = ts_parser__reduce( - self, version, action.symbol, action.count, - action.dynamic_precedence, action.production_id, - true, false - ); - } - - if (has_shift_action) { - can_shift_lookahead_symbol = true; - } else if (reduction_version != STACK_VERSION_NONE && i < MAX_VERSION_COUNT) { - ts_stack_renumber_version(self->stack, reduction_version, version); - continue; - } else if (lookahead_symbol != 0) { - ts_stack_remove_version(self->stack, version); 
- } - - if (version == starting_version) { - version = version_count; - } else { - version++; - } - } - - return can_shift_lookahead_symbol; -} - -static void ts_parser__handle_error( - TSParser *self, - StackVersion version, - TSSymbol lookahead_symbol -) { - uint32_t previous_version_count = ts_stack_version_count(self->stack); - - // Perform any reductions that can happen in this state, regardless of the lookahead. After - // skipping one or more invalid tokens, the parser might find a token that would have allowed - // a reduction to take place. - ts_parser__do_all_potential_reductions(self, version, 0); - uint32_t version_count = ts_stack_version_count(self->stack); - Length position = ts_stack_position(self->stack, version); - - // Push a discontinuity onto the stack. Merge all of the stack versions that - // were created in the previous step. - bool did_insert_missing_token = false; - for (StackVersion v = version; v < version_count;) { - if (!did_insert_missing_token) { - TSStateId state = ts_stack_state(self->stack, v); - for (TSSymbol missing_symbol = 1; - missing_symbol < self->language->token_count; - missing_symbol++) { - TSStateId state_after_missing_symbol = ts_language_next_state( - self->language, state, missing_symbol - ); - if (state_after_missing_symbol == 0 || state_after_missing_symbol == state) { - continue; - } - - if (ts_language_has_reduce_action( - self->language, - state_after_missing_symbol, - lookahead_symbol - )) { - // In case the parser is currently outside of any included range, the lexer will - // snap to the beginning of the next included range. The missing token's padding - // must be assigned to position it within the next included range. 
- ts_lexer_reset(&self->lexer, position); - ts_lexer_mark_end(&self->lexer); - Length padding = length_sub(self->lexer.token_end_position, position); - - StackVersion version_with_missing_tree = ts_stack_copy_version(self->stack, v); - Subtree missing_tree = ts_subtree_new_missing_leaf( - &self->tree_pool, missing_symbol, padding, self->language - ); - ts_stack_push( - self->stack, version_with_missing_tree, - missing_tree, false, - state_after_missing_symbol - ); - - if (ts_parser__do_all_potential_reductions( - self, version_with_missing_tree, - lookahead_symbol - )) { - LOG( - "recover_with_missing symbol:%s, state:%u", - SYM_NAME(missing_symbol), - ts_stack_state(self->stack, version_with_missing_tree) - ); - did_insert_missing_token = true; - break; - } - } - } - } - - ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE); - v = (v == version) ? previous_version_count : v + 1; - } - - for (unsigned i = previous_version_count; i < version_count; i++) { - bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); - assert(did_merge); - } - - ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); - LOG_STACK(); -} - -static bool ts_parser__recover_to_state( - TSParser *self, - StackVersion version, - unsigned depth, - TSStateId goal_state -) { - StackSliceArray pop = ts_stack_pop_count(self->stack, version, depth); - StackVersion previous_version = STACK_VERSION_NONE; - - for (unsigned i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; - - if (slice.version == previous_version) { - ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); - array_erase(&pop, i--); - continue; - } - - if (ts_stack_state(self->stack, slice.version) != goal_state) { - ts_stack_halt(self->stack, slice.version); - ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); - array_erase(&pop, i--); - continue; - } - - SubtreeArray error_trees = ts_stack_pop_error(self->stack, slice.version); - if (error_trees.size > 0) { - 
assert(error_trees.size == 1); - Subtree error_tree = error_trees.contents[0]; - uint32_t error_child_count = ts_subtree_child_count(error_tree); - if (error_child_count > 0) { - array_splice(&slice.subtrees, 0, 0, error_child_count, error_tree.ptr->children); - for (unsigned j = 0; j < error_child_count; j++) { - ts_subtree_retain(slice.subtrees.contents[j]); - } - } - ts_subtree_array_delete(&self->tree_pool, &error_trees); - } - - SubtreeArray trailing_extras = ts_subtree_array_remove_trailing_extras(&slice.subtrees); - - if (slice.subtrees.size > 0) { - Subtree error = ts_subtree_new_error_node(&self->tree_pool, &slice.subtrees, true, self->language); - ts_stack_push(self->stack, slice.version, error, false, goal_state); - } else { - array_delete(&slice.subtrees); - } - - for (unsigned j = 0; j < trailing_extras.size; j++) { - Subtree tree = trailing_extras.contents[j]; - ts_stack_push(self->stack, slice.version, tree, false, goal_state); - } - - previous_version = slice.version; - array_delete(&trailing_extras); - } - - return previous_version != STACK_VERSION_NONE; -} - -static void ts_parser__recover( - TSParser *self, - StackVersion version, - Subtree lookahead -) { - bool did_recover = false; - unsigned previous_version_count = ts_stack_version_count(self->stack); - Length position = ts_stack_position(self->stack, version); - StackSummary *summary = ts_stack_get_summary(self->stack, version); - unsigned node_count_since_error = ts_stack_node_count_since_error(self->stack, version); - unsigned current_error_cost = ts_stack_error_cost(self->stack, version); - - // When the parser is in the error state, there are two strategies for recovering with a - // given lookahead token: - // 1. Find a previous state on the stack in which that lookahead token would be valid. Then, - // create a new stack version that is in that state again. 
This entails popping all of the - // subtrees that have been pushed onto the stack since that previous state, and wrapping - // them in an ERROR node. - // 2. Wrap the lookahead token in an ERROR node, push that ERROR node onto the stack, and - // move on to the next lookahead token, remaining in the error state. - // - // First, try the strategy 1. Upon entering the error state, the parser recorded a summary - // of the previous parse states and their depths. Look at each state in the summary, to see - // if the current lookahead token would be valid in that state. - if (summary && !ts_subtree_is_error(lookahead)) { - for (unsigned i = 0; i < summary->size; i++) { - StackSummaryEntry entry = summary->contents[i]; - - if (entry.state == ERROR_STATE) continue; - if (entry.position.bytes == position.bytes) continue; - unsigned depth = entry.depth; - if (node_count_since_error > 0) depth++; - - // Do not recover in ways that create redundant stack versions. - bool would_merge = false; - for (unsigned j = 0; j < previous_version_count; j++) { - if ( - ts_stack_state(self->stack, j) == entry.state && - ts_stack_position(self->stack, j).bytes == position.bytes - ) { - would_merge = true; - break; - } - } - if (would_merge) continue; - - // Do not recover if the result would clearly be worse than some existing stack version. - unsigned new_cost = - current_error_cost + - entry.depth * ERROR_COST_PER_SKIPPED_TREE + - (position.bytes - entry.position.bytes) * ERROR_COST_PER_SKIPPED_CHAR + - (position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE; - if (ts_parser__better_version_exists(self, version, false, new_cost)) break; - - // If the current lookahead token is valid in some previous state, recover to that state. - // Then stop looking for further recoveries. 
- if (ts_language_has_actions(self->language, entry.state, ts_subtree_symbol(lookahead))) { - if (ts_parser__recover_to_state(self, version, depth, entry.state)) { - did_recover = true; - LOG("recover_to_previous state:%u, depth:%u", entry.state, depth); - LOG_STACK(); - break; - } - } - } - } - - // In the process of attemping to recover, some stack versions may have been created - // and subsequently halted. Remove those versions. - for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { - if (!ts_stack_is_active(self->stack, i)) { - ts_stack_remove_version(self->stack, i--); - } - } - - // If strategy 1 succeeded, a new stack version will have been created which is able to handle - // the current lookahead token. Now, in addition, try strategy 2 described above: skip the - // current lookahead token by wrapping it in an ERROR node. - - // Don't pursue this additional strategy if there are already too many stack versions. - if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { - ts_stack_halt(self->stack, version); - ts_subtree_release(&self->tree_pool, lookahead); - return; - } - - // If the parser is still in the error state at the end of the file, just wrap everything - // in an ERROR node and terminate. - if (ts_subtree_is_eof(lookahead)) { - LOG("recover_eof"); - SubtreeArray children = array_new(); - Subtree parent = ts_subtree_new_error_node(&self->tree_pool, &children, false, self->language); - ts_stack_push(self->stack, version, parent, false, 1); - ts_parser__accept(self, version, lookahead); - return; - } - - // Do not recover if the result would clearly be worse than some existing stack version. 
- unsigned new_cost = - current_error_cost + ERROR_COST_PER_SKIPPED_TREE + - ts_subtree_total_bytes(lookahead) * ERROR_COST_PER_SKIPPED_CHAR + - ts_subtree_total_size(lookahead).extent.row * ERROR_COST_PER_SKIPPED_LINE; - if (ts_parser__better_version_exists(self, version, false, new_cost)) { - ts_stack_halt(self->stack, version); - ts_subtree_release(&self->tree_pool, lookahead); - return; - } - - // If the current lookahead token is an extra token, mark it as extra. This means it won't - // be counted in error cost calculations. - unsigned n; - const TSParseAction *actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n); - if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].params.shift.extra) { - MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); - ts_subtree_set_extra(&mutable_lookahead); - lookahead = ts_subtree_from_mut(mutable_lookahead); - } - - // Wrap the lookahead token in an ERROR. - LOG("skip_token symbol:%s", TREE_NAME(lookahead)); - SubtreeArray children = array_new(); - array_reserve(&children, 1); - array_push(&children, lookahead); - MutableSubtree error_repeat = ts_subtree_new_node( - &self->tree_pool, - ts_builtin_sym_error_repeat, - &children, - 0, - self->language - ); - - // If other tokens have already been skipped, so there is already an ERROR at the top of the - // stack, then pop that ERROR off the stack and wrap the two ERRORs together into one larger - // ERROR. - if (node_count_since_error > 0) { - StackSliceArray pop = ts_stack_pop_count(self->stack, version, 1); - - // TODO: Figure out how to make this condition occur. - // See https://github.com/atom/atom/issues/18450#issuecomment-439579778 - // If multiple stack versions have merged at this point, just pick one of the errors - // arbitrarily and discard the rest. 
- if (pop.size > 1) { - for (unsigned i = 1; i < pop.size; i++) { - ts_subtree_array_delete(&self->tree_pool, &pop.contents[i].subtrees); - } - while (ts_stack_version_count(self->stack) > pop.contents[0].version + 1) { - ts_stack_remove_version(self->stack, pop.contents[0].version + 1); - } - } - - ts_stack_renumber_version(self->stack, pop.contents[0].version, version); - array_push(&pop.contents[0].subtrees, ts_subtree_from_mut(error_repeat)); - error_repeat = ts_subtree_new_node( - &self->tree_pool, - ts_builtin_sym_error_repeat, - &pop.contents[0].subtrees, - 0, - self->language - ); - } - - // Push the new ERROR onto the stack. - ts_stack_push(self->stack, version, ts_subtree_from_mut(error_repeat), false, ERROR_STATE); - if (ts_subtree_has_external_tokens(lookahead)) { - ts_stack_set_last_external_token( - self->stack, version, ts_subtree_last_external_token(lookahead) - ); - } -} - -static bool ts_parser__advance( - TSParser *self, - StackVersion version, - bool allow_node_reuse -) { - TSStateId state = ts_stack_state(self->stack, version); - uint32_t position = ts_stack_position(self->stack, version).bytes; - Subtree last_external_token = ts_stack_last_external_token(self->stack, version); - - bool did_reuse = true; - Subtree lookahead = NULL_SUBTREE; - TableEntry table_entry = {.action_count = 0}; - - // If possible, reuse a node from the previous syntax tree. - if (allow_node_reuse) { - lookahead = ts_parser__reuse_node( - self, version, &state, position, last_external_token, &table_entry - ); - } - - // If no node from the previous syntax tree could be reused, then try to - // reuse the token previously returned by the lexer. - if (!lookahead.ptr) { - did_reuse = false; - lookahead = ts_parser__get_cached_token( - self, state, position, last_external_token, &table_entry - ); - } - - bool needs_lex = !lookahead.ptr; - for (;;) { - // Otherwise, re-run the lexer. 
- if (needs_lex) { - needs_lex = false; - lookahead = ts_parser__lex(self, version, state); - - if (lookahead.ptr) { - ts_parser__set_cached_token(self, position, last_external_token, lookahead); - ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry); - } - - // When parsing a non-terminal extra, a null lookahead indicates the - // end of the rule. The reduction is stored in the EOF table entry. - // After the reduction, the lexer needs to be run again. - else { - ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry); - } - } - - // If a cancellation flag or a timeout was provided, then check every - // time a fixed number of parse actions has been processed. - if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { - self->operation_count = 0; - } - if ( - self->operation_count == 0 && - ((self->cancellation_flag && atomic_load(self->cancellation_flag)) || - (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock))) - ) { - ts_subtree_release(&self->tree_pool, lookahead); - return false; - } - - // Process each parse action for the current lookahead token in - // the current state. If there are multiple actions, then this is - // an ambiguous state. REDUCE actions always create a new stack - // version, whereas SHIFT actions update the existing stack version - // and terminate this loop. - StackVersion last_reduction_version = STACK_VERSION_NONE; - for (uint32_t i = 0; i < table_entry.action_count; i++) { - TSParseAction action = table_entry.actions[i]; - - switch (action.type) { - case TSParseActionTypeShift: { - if (action.params.shift.repetition) break; - TSStateId next_state; - if (action.params.shift.extra) { - - // TODO: remove when TREE_SITTER_LANGUAGE_VERSION 9 is out. 
- if (state == ERROR_STATE) continue; - - next_state = state; - LOG("shift_extra"); - } else { - next_state = action.params.shift.state; - LOG("shift state:%u", next_state); - } - - if (ts_subtree_child_count(lookahead) > 0) { - ts_parser__breakdown_lookahead(self, &lookahead, state, &self->reusable_node); - next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead)); - } - - ts_parser__shift(self, version, next_state, lookahead, action.params.shift.extra); - if (did_reuse) reusable_node_advance(&self->reusable_node); - return true; - } - - case TSParseActionTypeReduce: { - bool is_fragile = table_entry.action_count > 1; - bool end_of_non_terminal_extra = lookahead.ptr == NULL; - LOG("reduce sym:%s, child_count:%u", SYM_NAME(action.params.reduce.symbol), action.params.reduce.child_count); - StackVersion reduction_version = ts_parser__reduce( - self, version, action.params.reduce.symbol, action.params.reduce.child_count, - action.params.reduce.dynamic_precedence, action.params.reduce.production_id, - is_fragile, end_of_non_terminal_extra - ); - if (reduction_version != STACK_VERSION_NONE) { - last_reduction_version = reduction_version; - } - break; - } - - case TSParseActionTypeAccept: { - LOG("accept"); - ts_parser__accept(self, version, lookahead); - return true; - } - - case TSParseActionTypeRecover: { - if (ts_subtree_child_count(lookahead) > 0) { - ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node); - } - - ts_parser__recover(self, version, lookahead); - if (did_reuse) reusable_node_advance(&self->reusable_node); - return true; - } - } - } - - // If a reduction was performed, then replace the current stack version - // with one of the stack versions created by a reduction, and continue - // processing this version of the stack with the same lookahead symbol. 
- if (last_reduction_version != STACK_VERSION_NONE) { - ts_stack_renumber_version(self->stack, last_reduction_version, version); - LOG_STACK(); - state = ts_stack_state(self->stack, version); - - // At the end of a non-terminal extra rule, the lexer will return a - // null subtree, because the parser needs to perform a fixed reduction - // regardless of the lookahead node. After performing that reduction, - // (and completing the non-terminal extra rule) run the lexer again based - // on the current parse state. - if (!lookahead.ptr) { - needs_lex = true; - continue; - } - - ts_language_table_entry( - self->language, - state, - ts_subtree_leaf_symbol(lookahead), - &table_entry - ); - continue; - } - - if (!lookahead.ptr) { - ts_stack_pause(self->stack, version, ts_builtin_sym_end); - return true; - } - - // If there were no parse actions for the current lookahead token, then - // it is not valid in this state. If the current lookahead token is a - // keyword, then switch to treating it as the normal word token if that - // token is valid in this state. - if ( - ts_subtree_is_keyword(lookahead) && - ts_subtree_symbol(lookahead) != self->language->keyword_capture_token - ) { - ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry); - if (table_entry.action_count > 0) { - LOG( - "switch from_keyword:%s, to_word_token:%s", - TREE_NAME(lookahead), - SYM_NAME(self->language->keyword_capture_token) - ); - - MutableSubtree mutable_lookahead = ts_subtree_make_mut(&self->tree_pool, lookahead); - ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token, self->language); - lookahead = ts_subtree_from_mut(mutable_lookahead); - continue; - } - } - - // If the current lookahead token is not valid and the parser is - // already in the error state, restart the error recovery process. - // TODO - can this be unified with the other `RECOVER` case above? 
- if (state == ERROR_STATE) { - ts_parser__recover(self, version, lookahead); - return true; - } - - // If the current lookahead token is not valid and the previous - // subtree on the stack was reused from an old tree, it isn't actually - // valid to reuse it. Remove it from the stack, and in its place, - // push each of its children. Then try again to process the current - // lookahead. - if (ts_parser__breakdown_top_of_stack(self, version)) { - state = ts_stack_state(self->stack, version); - ts_subtree_release(&self->tree_pool, lookahead); - needs_lex = true; - continue; - } - - // At this point, the current lookahead token is definitely not valid - // for this parse stack version. Mark this version as paused and continue - // processing any other stack versions that might exist. If some other - // version advances successfully, then this version can simply be removed. - // But if all versions end up paused, then error recovery is needed. - LOG("detect_error"); - ts_stack_pause(self->stack, version, ts_subtree_leaf_symbol(lookahead)); - ts_subtree_release(&self->tree_pool, lookahead); - return true; - } -} - -static unsigned ts_parser__condense_stack(TSParser *self) { - bool made_changes = false; - unsigned min_error_cost = UINT_MAX; - for (StackVersion i = 0; i < ts_stack_version_count(self->stack); i++) { - // Prune any versions that have been marked for removal. - if (ts_stack_is_halted(self->stack, i)) { - ts_stack_remove_version(self->stack, i); - i--; - continue; - } - - // Keep track of the minimum error cost of any stack version so - // that it can be returned. - ErrorStatus status_i = ts_parser__version_status(self, i); - if (!status_i.is_in_error && status_i.cost < min_error_cost) { - min_error_cost = status_i.cost; - } - - // Examine each pair of stack versions, removing any versions that - // are clearly worse than another version. Ensure that the versions - // are ordered from most promising to least promising. 
- for (StackVersion j = 0; j < i; j++) { - ErrorStatus status_j = ts_parser__version_status(self, j); - - switch (ts_parser__compare_versions(self, status_j, status_i)) { - case ErrorComparisonTakeLeft: - made_changes = true; - ts_stack_remove_version(self->stack, i); - i--; - j = i; - break; - - case ErrorComparisonPreferLeft: - case ErrorComparisonNone: - if (ts_stack_merge(self->stack, j, i)) { - made_changes = true; - i--; - j = i; - } - break; - - case ErrorComparisonPreferRight: - made_changes = true; - if (ts_stack_merge(self->stack, j, i)) { - i--; - j = i; - } else { - ts_stack_swap_versions(self->stack, i, j); - } - break; - - case ErrorComparisonTakeRight: - made_changes = true; - ts_stack_remove_version(self->stack, j); - i--; - j--; - break; - } - } - } - - // Enfore a hard upper bound on the number of stack versions by - // discarding the least promising versions. - while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) { - ts_stack_remove_version(self->stack, MAX_VERSION_COUNT); - made_changes = true; - } - - // If the best-performing stack version is currently paused, or all - // versions are paused, then resume the best paused version and begin - // the error recovery process. Otherwise, remove the paused versions. 
- if (ts_stack_version_count(self->stack) > 0) { - bool has_unpaused_version = false; - for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; i++) { - if (ts_stack_is_paused(self->stack, i)) { - if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT) { - LOG("resume version:%u", i); - min_error_cost = ts_stack_error_cost(self->stack, i); - TSSymbol lookahead_symbol = ts_stack_resume(self->stack, i); - ts_parser__handle_error(self, i, lookahead_symbol); - has_unpaused_version = true; - } else { - ts_stack_remove_version(self->stack, i); - i--; - n--; - } - } else { - has_unpaused_version = true; - } - } - } - - if (made_changes) { - LOG("condense"); - LOG_STACK(); - } - - return min_error_cost; -} - -static bool ts_parser_has_outstanding_parse(TSParser *self) { - return ( - ts_stack_state(self->stack, 0) != 1 || - ts_stack_node_count_since_error(self->stack, 0) != 0 - ); -} - -// Parser - Public - -TSParser *ts_parser_new(void) { - TSParser *self = ts_calloc(1, sizeof(TSParser)); - ts_lexer_init(&self->lexer); - array_init(&self->reduce_actions); - array_reserve(&self->reduce_actions, 4); - self->tree_pool = ts_subtree_pool_new(32); - self->stack = ts_stack_new(&self->tree_pool); - self->finished_tree = NULL_SUBTREE; - self->reusable_node = reusable_node_new(); - self->dot_graph_file = NULL; - self->cancellation_flag = NULL; - self->timeout_duration = 0; - self->end_clock = clock_null(); - self->operation_count = 0; - self->old_tree = NULL_SUBTREE; - self->scratch_tree.ptr = &self->scratch_tree_data; - self->included_range_differences = (TSRangeArray) array_new(); - self->included_range_difference_index = 0; - ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); - return self; -} - -void ts_parser_delete(TSParser *self) { - if (!self) return; - - ts_parser_set_language(self, NULL); - ts_stack_delete(self->stack); - if (self->reduce_actions.contents) { - array_delete(&self->reduce_actions); - } - if 
(self->included_range_differences.contents) { - array_delete(&self->included_range_differences); - } - if (self->old_tree.ptr) { - ts_subtree_release(&self->tree_pool, self->old_tree); - self->old_tree = NULL_SUBTREE; - } - ts_lexer_delete(&self->lexer); - ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); - ts_subtree_pool_delete(&self->tree_pool); - reusable_node_delete(&self->reusable_node); - ts_free(self); -} - -const TSLanguage *ts_parser_language(const TSParser *self) { - return self->language; -} - -bool ts_parser_set_language(TSParser *self, const TSLanguage *language) { - if (language) { - if (language->version > TREE_SITTER_LANGUAGE_VERSION) return false; - if (language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION) return false; - } - - if (self->external_scanner_payload && self->language->external_scanner.destroy) { - self->language->external_scanner.destroy(self->external_scanner_payload); - } - - if (language && language->external_scanner.create) { - self->external_scanner_payload = language->external_scanner.create(); - } else { - self->external_scanner_payload = NULL; - } - - self->language = language; - ts_parser_reset(self); - return true; -} - -TSLogger ts_parser_logger(const TSParser *self) { - return self->lexer.logger; -} - -void ts_parser_set_logger(TSParser *self, TSLogger logger) { - self->lexer.logger = logger; -} - -void ts_parser_print_dot_graphs(TSParser *self, int fd) { - if (self->dot_graph_file) { - fclose(self->dot_graph_file); - } - - if (fd >= 0) { - self->dot_graph_file = fdopen(fd, "a"); - } else { - self->dot_graph_file = NULL; - } -} - -const size_t *ts_parser_cancellation_flag(const TSParser *self) { - return (const size_t *)self->cancellation_flag; -} - -void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag) { - self->cancellation_flag = (const volatile size_t *)flag; -} - -uint64_t ts_parser_timeout_micros(const TSParser *self) { - return duration_to_micros(self->timeout_duration); 
-} - -void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout_micros) { - self->timeout_duration = duration_from_micros(timeout_micros); -} - -bool ts_parser_set_included_ranges( - TSParser *self, - const TSRange *ranges, - uint32_t count -) { - return ts_lexer_set_included_ranges(&self->lexer, ranges, count); -} - -const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count) { - return ts_lexer_included_ranges(&self->lexer, count); -} - -void ts_parser_reset(TSParser *self) { - if (self->language && self->language->external_scanner.deserialize) { - self->language->external_scanner.deserialize(self->external_scanner_payload, NULL, 0); - } - - if (self->old_tree.ptr) { - ts_subtree_release(&self->tree_pool, self->old_tree); - self->old_tree = NULL_SUBTREE; - } - - reusable_node_clear(&self->reusable_node); - ts_lexer_reset(&self->lexer, length_zero()); - ts_stack_clear(self->stack); - ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); - if (self->finished_tree.ptr) { - ts_subtree_release(&self->tree_pool, self->finished_tree); - self->finished_tree = NULL_SUBTREE; - } - self->accept_count = 0; -} - -TSTree *ts_parser_parse( - TSParser *self, - const TSTree *old_tree, - TSInput input -) { - if (!self->language || !input.read) return NULL; - - ts_lexer_set_input(&self->lexer, input); - - array_clear(&self->included_range_differences); - self->included_range_difference_index = 0; - - if (ts_parser_has_outstanding_parse(self)) { - LOG("resume_parsing"); - } else if (old_tree) { - ts_subtree_retain(old_tree->root); - self->old_tree = old_tree->root; - ts_range_array_get_changed_ranges( - old_tree->included_ranges, old_tree->included_range_count, - self->lexer.included_ranges, self->lexer.included_range_count, - &self->included_range_differences - ); - reusable_node_reset(&self->reusable_node, old_tree->root); - LOG("parse_after_edit"); - LOG_TREE(self->old_tree); - for (unsigned i = 0; i < 
self->included_range_differences.size; i++) { - TSRange *range = &self->included_range_differences.contents[i]; - LOG("different_included_range %u - %u", range->start_byte, range->end_byte); - } - } else { - reusable_node_clear(&self->reusable_node); - LOG("new_parse"); - } - - uint32_t position = 0, last_position = 0, version_count = 0; - self->operation_count = 0; - if (self->timeout_duration) { - self->end_clock = clock_after(clock_now(), self->timeout_duration); - } else { - self->end_clock = clock_null(); - } - - do { - for (StackVersion version = 0; - version_count = ts_stack_version_count(self->stack), version < version_count; - version++) { - bool allow_node_reuse = version_count == 1; - while (ts_stack_is_active(self->stack, version)) { - LOG("process version:%d, version_count:%u, state:%d, row:%u, col:%u", - version, ts_stack_version_count(self->stack), - ts_stack_state(self->stack, version), - ts_stack_position(self->stack, version).extent.row + 1, - ts_stack_position(self->stack, version).extent.column); - - if (!ts_parser__advance(self, version, allow_node_reuse)) return NULL; - LOG_STACK(); - - position = ts_stack_position(self->stack, version).bytes; - if (position > last_position || (version > 0 && position == last_position)) { - last_position = position; - break; - } - } - } - - unsigned min_error_cost = ts_parser__condense_stack(self); - if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) < min_error_cost) { - break; - } - - while (self->included_range_difference_index < self->included_range_differences.size) { - TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index]; - if (range->end_byte <= position) { - self->included_range_difference_index++; - } else { - break; - } - } - } while (version_count != 0); - - ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); - LOG("done"); - LOG_TREE(self->finished_tree); - - TSTree *result = ts_tree_new( - 
self->finished_tree, - self->language, - self->lexer.included_ranges, - self->lexer.included_range_count - ); - self->finished_tree = NULL_SUBTREE; - ts_parser_reset(self); - return result; -} - -TSTree *ts_parser_parse_string( - TSParser *self, - const TSTree *old_tree, - const char *string, - uint32_t length -) { - return ts_parser_parse_string_encoding(self, old_tree, string, length, TSInputEncodingUTF8); -} - -TSTree *ts_parser_parse_string_encoding(TSParser *self, const TSTree *old_tree, - const char *string, uint32_t length, TSInputEncoding encoding) { - TSStringInput input = {string, length}; - return ts_parser_parse(self, old_tree, (TSInput) { - &input, - ts_string_input_read, - encoding, - }); -} - -#undef LOG diff --git a/src/tree_sitter/parser.h b/src/tree_sitter/parser.h deleted file mode 100644 index 11bf4fc42a..0000000000 --- a/src/tree_sitter/parser.h +++ /dev/null @@ -1,235 +0,0 @@ -#ifndef TREE_SITTER_PARSER_H_ -#define TREE_SITTER_PARSER_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdbool.h> -#include <stdint.h> -#include <stdlib.h> - -#define ts_builtin_sym_error ((TSSymbol)-1) -#define ts_builtin_sym_end 0 -#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 - -#ifndef TREE_SITTER_API_H_ -typedef uint16_t TSSymbol; -typedef uint16_t TSFieldId; -typedef struct TSLanguage TSLanguage; -#endif - -typedef struct { - TSFieldId field_id; - uint8_t child_index; - bool inherited; -} TSFieldMapEntry; - -typedef struct { - uint16_t index; - uint16_t length; -} TSFieldMapSlice; - -typedef uint16_t TSStateId; - -typedef struct { - bool visible : 1; - bool named : 1; -} TSSymbolMetadata; - -typedef struct TSLexer TSLexer; - -struct TSLexer { - int32_t lookahead; - TSSymbol result_symbol; - void (*advance)(TSLexer *, bool); - void (*mark_end)(TSLexer *); - uint32_t (*get_column)(TSLexer *); - bool (*is_at_included_range_start)(const TSLexer *); - bool (*eof)(const TSLexer *); -}; - -typedef enum { - TSParseActionTypeShift, - 
TSParseActionTypeReduce, - TSParseActionTypeAccept, - TSParseActionTypeRecover, -} TSParseActionType; - -typedef struct { - union { - struct { - TSStateId state; - bool extra : 1; - bool repetition : 1; - } shift; - struct { - TSSymbol symbol; - int16_t dynamic_precedence; - uint8_t child_count; - uint8_t production_id; - } reduce; - } params; - TSParseActionType type : 4; -} TSParseAction; - -typedef struct { - uint16_t lex_state; - uint16_t external_lex_state; -} TSLexMode; - -typedef union { - TSParseAction action; - struct { - uint8_t count; - bool reusable : 1; - } entry; -} TSParseActionEntry; - -struct TSLanguage { - uint32_t version; - uint32_t symbol_count; - uint32_t alias_count; - uint32_t token_count; - uint32_t external_token_count; - const char **symbol_names; - const TSSymbolMetadata *symbol_metadata; - const uint16_t *parse_table; - const TSParseActionEntry *parse_actions; - const TSLexMode *lex_modes; - const TSSymbol *alias_sequences; - uint16_t max_alias_sequence_length; - bool (*lex_fn)(TSLexer *, TSStateId); - bool (*keyword_lex_fn)(TSLexer *, TSStateId); - TSSymbol keyword_capture_token; - struct { - const bool *states; - const TSSymbol *symbol_map; - void *(*create)(void); - void (*destroy)(void *); - bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); - unsigned (*serialize)(void *, char *); - void (*deserialize)(void *, const char *, unsigned); - } external_scanner; - uint32_t field_count; - const TSFieldMapSlice *field_map_slices; - const TSFieldMapEntry *field_map_entries; - const char **field_names; - uint32_t large_state_count; - const uint16_t *small_parse_table; - const uint32_t *small_parse_table_map; - const TSSymbol *public_symbol_map; -}; - -/* - * Lexer Macros - */ - -#define START_LEXER() \ - bool result = false; \ - bool skip = false; \ - bool eof = false; \ - int32_t lookahead; \ - goto start; \ - next_state: \ - lexer->advance(lexer, skip); \ - start: \ - skip = false; \ - lookahead = lexer->lookahead; - -#define 
ADVANCE(state_value) \ - { \ - state = state_value; \ - goto next_state; \ - } - -#define SKIP(state_value) \ - { \ - skip = true; \ - state = state_value; \ - goto next_state; \ - } - -#define ACCEPT_TOKEN(symbol_value) \ - result = true; \ - lexer->result_symbol = symbol_value; \ - lexer->mark_end(lexer); - -#define END_STATE() return result; - -/* - * Parse Table Macros - */ - -#define SMALL_STATE(id) id - LARGE_STATE_COUNT - -#define STATE(id) id - -#define ACTIONS(id) id - -#define SHIFT(state_value) \ - { \ - { \ - .params = { \ - .shift = { \ - .state = state_value \ - } \ - }, \ - .type = TSParseActionTypeShift \ - } \ - } - -#define SHIFT_REPEAT(state_value) \ - { \ - { \ - .params = { \ - .shift = { \ - .state = state_value, \ - .repetition = true \ - } \ - }, \ - .type = TSParseActionTypeShift \ - } \ - } - -#define RECOVER() \ - { \ - { .type = TSParseActionTypeRecover } \ - } - -#define SHIFT_EXTRA() \ - { \ - { \ - .params = { \ - .shift = { \ - .extra = true \ - } \ - }, \ - .type = TSParseActionTypeShift \ - } \ - } - -#define REDUCE(symbol_val, child_count_val, ...) 
\ - { \ - { \ - .params = { \ - .reduce = { \ - .symbol = symbol_val, \ - .child_count = child_count_val, \ - __VA_ARGS__ \ - }, \ - }, \ - .type = TSParseActionTypeReduce \ - } \ - } - -#define ACCEPT_INPUT() \ - { \ - { .type = TSParseActionTypeAccept } \ - } - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_PARSER_H_ diff --git a/src/tree_sitter/point.h b/src/tree_sitter/point.h deleted file mode 100644 index a50d20214b..0000000000 --- a/src/tree_sitter/point.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef TREE_SITTER_POINT_H_ -#define TREE_SITTER_POINT_H_ - -#include "tree_sitter/api.h" - -#define POINT_ZERO ((TSPoint) {0, 0}) -#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) - -static inline TSPoint point__new(unsigned row, unsigned column) { - TSPoint result = {row, column}; - return result; -} - -static inline TSPoint point_add(TSPoint a, TSPoint b) { - if (b.row > 0) - return point__new(a.row + b.row, b.column); - else - return point__new(a.row, a.column + b.column); -} - -static inline TSPoint point_sub(TSPoint a, TSPoint b) { - if (a.row > b.row) - return point__new(a.row - b.row, a.column); - else - return point__new(0, a.column - b.column); -} - -static inline bool point_lte(TSPoint a, TSPoint b) { - return (a.row < b.row) || (a.row == b.row && a.column <= b.column); -} - -static inline bool point_lt(TSPoint a, TSPoint b) { - return (a.row < b.row) || (a.row == b.row && a.column < b.column); -} - -static inline bool point_eq(TSPoint a, TSPoint b) { - return a.row == b.row && a.column == b.column; -} - -static inline TSPoint point_min(TSPoint a, TSPoint b) { - if (a.row < b.row || (a.row == b.row && a.column < b.column)) - return a; - else - return b; -} - -static inline TSPoint point_max(TSPoint a, TSPoint b) { - if (a.row > b.row || (a.row == b.row && a.column > b.column)) - return a; - else - return b; -} - -#endif diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c deleted file mode 100644 index b887b74ff6..0000000000 --- 
a/src/tree_sitter/query.c +++ /dev/null @@ -1,2143 +0,0 @@ -#include "tree_sitter/api.h" -#include "./alloc.h" -#include "./array.h" -#include "./bits.h" -#include "./language.h" -#include "./point.h" -#include "./tree_cursor.h" -#include "./unicode.h" -#include <wctype.h> - -// #define LOG(...) fprintf(stderr, __VA_ARGS__) -#define LOG(...) - -#define MAX_CAPTURE_LIST_COUNT 32 -#define MAX_STEP_CAPTURE_COUNT 3 - -/* - * Stream - A sequence of unicode characters derived from a UTF8 string. - * This struct is used in parsing queries from S-expressions. - */ -typedef struct { - const char *input; - const char *end; - int32_t next; - uint8_t next_size; -} Stream; - -/* - * QueryStep - A step in the process of matching a query. Each node within - * a query S-expression maps to one of these steps. An entire pattern is - * represented as a sequence of these steps. Fields: - * - * - `symbol` - The grammar symbol to match. A zero value represents the - * wildcard symbol, '_'. - * - `field` - The field name to match. A zero value means that a field name - * was not specified. - * - `capture_ids` - An array of integers representing the names of captures - * associated with this node in the pattern, terminated by a `NONE` value. - * - `depth` - The depth where this node occurs in the pattern. The root node - * of the pattern has depth zero. - * - `alternative_index` - The index of a different query step that serves as - * an alternative to this step. - */ -typedef struct { - TSSymbol symbol; - TSFieldId field; - uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t alternative_index; - uint16_t depth; - bool contains_captures: 1; - bool is_immediate: 1; - bool is_last_child: 1; - bool is_pass_through: 1; - bool is_dead_end: 1; - bool alternative_is_immediate: 1; -} QueryStep; - -/* - * Slice - A slice of an external array. Within a query, capture names, - * literal string values, and predicate step informations are stored in three - * contiguous arrays. 
Individual captures, string values, and predicates are - * represented as slices of these three arrays. - */ -typedef struct { - uint32_t offset; - uint32_t length; -} Slice; - -/* - * SymbolTable - a two-way mapping of strings to ids. - */ -typedef struct { - Array(char) characters; - Array(Slice) slices; -} SymbolTable; - -/* - * PatternEntry - Information about the starting point for matching a - * particular pattern, consisting of the index of the pattern within the query, - * and the index of the patter's first step in the shared `steps` array. These - * entries are stored in a 'pattern map' - a sorted array that makes it - * possible to efficiently lookup patterns based on the symbol for their first - * step. - */ -typedef struct { - uint16_t step_index; - uint16_t pattern_index; -} PatternEntry; - -/* - * QueryState - The state of an in-progress match of a particular pattern - * in a query. While executing, a `TSQueryCursor` must keep track of a number - * of possible in-progress matches. Each of those possible matches is - * represented as one of these states. Fields: - * - `id` - A numeric id that is exposed to the public API. This allows the - * caller to remove a given match, preventing any more of its captures - * from being returned. - * - `start_depth` - The depth in the tree where the first step of the state's - * pattern was matched. - * - `pattern_index` - The pattern that the state is matching. - * - `consumed_capture_count` - The number of captures from this match that - * have already been returned. - * - `capture_list_id` - A numeric id that can be used to retrieve the state's - * list of captures from the `CaptureListPool`. - * - `seeking_immediate_match` - A flag that indicates that the state's next - * step must be matched by the very next sibling. This is used when - * processing repetitions. 
- * - `has_in_progress_alternatives` - A flag that indicates that there is are - * other states that have the same captures as this state, but are at - * different steps in their pattern. This means that in order to obey the - * 'longest-match' rule, this state should not be returned as a match until - * it is clear that there can be no longer match. - */ -typedef struct { - uint32_t id; - uint16_t start_depth; - uint16_t step_index; - uint16_t pattern_index; - uint16_t capture_list_id; - uint16_t consumed_capture_count: 12; - bool seeking_immediate_match: 1; - bool has_in_progress_alternatives: 1; - bool dead: 1; -} QueryState; - -typedef Array(TSQueryCapture) CaptureList; - -/* - * CaptureListPool - A collection of *lists* of captures. Each QueryState - * needs to maintain its own list of captures. To avoid repeated allocations, - * the reuses a fixed set of capture lists, and keeps track of which ones - * are currently in use. - */ -typedef struct { - CaptureList list[MAX_CAPTURE_LIST_COUNT]; - CaptureList empty_list; - uint32_t usage_map; -} CaptureListPool; - -/* - * TSQuery - A tree query, compiled from a string of S-expressions. The query - * itself is immutable. The mutable state used in the process of executing the - * query is stored in a `TSQueryCursor`. - */ -struct TSQuery { - SymbolTable captures; - SymbolTable predicate_values; - Array(QueryStep) steps; - Array(PatternEntry) pattern_map; - Array(TSQueryPredicateStep) predicate_steps; - Array(Slice) predicates_by_pattern; - Array(uint32_t) start_bytes_by_pattern; - const TSLanguage *language; - uint16_t wildcard_root_pattern_count; - TSSymbol *symbol_map; -}; - -/* - * TSQueryCursor - A stateful struct used to execute a query on a tree. 
- */ -struct TSQueryCursor { - const TSQuery *query; - TSTreeCursor cursor; - Array(QueryState) states; - Array(QueryState) finished_states; - CaptureListPool capture_list_pool; - uint32_t depth; - uint32_t start_byte; - uint32_t end_byte; - uint32_t next_state_id; - TSPoint start_point; - TSPoint end_point; - bool ascending; - bool halted; -}; - -static const TSQueryError PARENT_DONE = -1; -static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX; -static const uint16_t NONE = UINT16_MAX; -static const TSSymbol WILDCARD_SYMBOL = 0; -static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1; - -/********** - * Stream - **********/ - -// Advance to the next unicode code point in the stream. -static bool stream_advance(Stream *self) { - self->input += self->next_size; - if (self->input < self->end) { - uint32_t size = ts_decode_utf8( - (const uint8_t *)self->input, - self->end - self->input, - &self->next - ); - if (size > 0) { - self->next_size = size; - return true; - } - } else { - self->next_size = 0; - self->next = '\0'; - } - return false; -} - -// Reset the stream to the given input position, represented as a pointer -// into the input string. 
-static void stream_reset(Stream *self, const char *input) { - self->input = input; - self->next_size = 0; - stream_advance(self); -} - -static Stream stream_new(const char *string, uint32_t length) { - Stream self = { - .next = 0, - .input = string, - .end = string + length, - }; - stream_advance(&self); - return self; -} - -static void stream_skip_whitespace(Stream *stream) { - for (;;) { - if (iswspace(stream->next)) { - stream_advance(stream); - } else if (stream->next == ';') { - // skip over comments - stream_advance(stream); - while (stream->next && stream->next != '\n') { - if (!stream_advance(stream)) break; - } - } else { - break; - } - } -} - -static bool stream_is_ident_start(Stream *stream) { - return iswalnum(stream->next) || stream->next == '_' || stream->next == '-'; -} - -static void stream_scan_identifier(Stream *stream) { - do { - stream_advance(stream); - } while ( - iswalnum(stream->next) || - stream->next == '_' || - stream->next == '-' || - stream->next == '.' || - stream->next == '?' || - stream->next == '!' 
- ); -} - -/****************** - * CaptureListPool - ******************/ - -static CaptureListPool capture_list_pool_new(void) { - return (CaptureListPool) { - .empty_list = array_new(), - .usage_map = UINT32_MAX, - }; -} - -static void capture_list_pool_reset(CaptureListPool *self) { - self->usage_map = UINT32_MAX; - for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { - array_clear(&self->list[i]); - } -} - -static void capture_list_pool_delete(CaptureListPool *self) { - for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { - array_delete(&self->list[i]); - } -} - -static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { - if (id >= MAX_CAPTURE_LIST_COUNT) return &self->empty_list; - return &self->list[id]; -} - -static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { - assert(id < MAX_CAPTURE_LIST_COUNT); - return &self->list[id]; -} - -static bool capture_list_pool_is_empty(const CaptureListPool *self) { - return self->usage_map == 0; -} - -static uint16_t capture_list_pool_acquire(CaptureListPool *self) { - // In the usage_map bitmask, ones represent free lists, and zeros represent - // lists that are in use. A free list id can quickly be found by counting - // the leading zeros in the usage map. An id of zero corresponds to the - // highest-order bit in the bitmask. 
- uint16_t id = count_leading_zeros(self->usage_map); - if (id >= MAX_CAPTURE_LIST_COUNT) return NONE; - self->usage_map &= ~bitmask_for_index(id); - array_clear(&self->list[id]); - return id; -} - -static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { - if (id >= MAX_CAPTURE_LIST_COUNT) return; - array_clear(&self->list[id]); - self->usage_map |= bitmask_for_index(id); -} - -/************** - * SymbolTable - **************/ - -static SymbolTable symbol_table_new(void) { - return (SymbolTable) { - .characters = array_new(), - .slices = array_new(), - }; -} - -static void symbol_table_delete(SymbolTable *self) { - array_delete(&self->characters); - array_delete(&self->slices); -} - -static int symbol_table_id_for_name( - const SymbolTable *self, - const char *name, - uint32_t length -) { - for (unsigned i = 0; i < self->slices.size; i++) { - Slice slice = self->slices.contents[i]; - if ( - slice.length == length && - !strncmp(&self->characters.contents[slice.offset], name, length) - ) return i; - } - return -1; -} - -static const char *symbol_table_name_for_id( - const SymbolTable *self, - uint16_t id, - uint32_t *length -) { - Slice slice = self->slices.contents[id]; - *length = slice.length; - return &self->characters.contents[slice.offset]; -} - -static uint16_t symbol_table_insert_name( - SymbolTable *self, - const char *name, - uint32_t length -) { - int id = symbol_table_id_for_name(self, name, length); - if (id >= 0) return (uint16_t)id; - Slice slice = { - .offset = self->characters.size, - .length = length, - }; - array_grow_by(&self->characters, length + 1); - memcpy(&self->characters.contents[slice.offset], name, length); - self->characters.contents[self->characters.size - 1] = 0; - array_push(&self->slices, slice); - return self->slices.size - 1; -} - -static uint16_t symbol_table_insert_name_with_escapes( - SymbolTable *self, - const char *escaped_name, - uint32_t escaped_length -) { - Slice slice = { - .offset = 
self->characters.size, - .length = 0, - }; - array_grow_by(&self->characters, escaped_length + 1); - - // Copy the contents of the literal into the characters buffer, processing escape - // sequences like \n and \". This needs to be done before checking if the literal - // is already present, in order to do the string comparison. - bool is_escaped = false; - for (unsigned i = 0; i < escaped_length; i++) { - const char *src = &escaped_name[i]; - char *dest = &self->characters.contents[slice.offset + slice.length]; - if (is_escaped) { - switch (*src) { - case 'n': - *dest = '\n'; - break; - case 'r': - *dest = '\r'; - break; - case 't': - *dest = '\t'; - break; - case '0': - *dest = '\0'; - break; - default: - *dest = *src; - break; - } - is_escaped = false; - slice.length++; - } else { - if (*src == '\\') { - is_escaped = true; - } else { - *dest = *src; - slice.length++; - } - } - } - - // If the string is already present, remove the redundant content from the characters - // buffer and return the existing id. 
- int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length); - if (id >= 0) { - self->characters.size -= (escaped_length + 1); - return id; - } - - self->characters.contents[slice.offset + slice.length] = 0; - array_push(&self->slices, slice); - return self->slices.size - 1; -} - -/************ - * QueryStep - ************/ - -static QueryStep query_step__new( - TSSymbol symbol, - uint16_t depth, - bool is_immediate -) { - return (QueryStep) { - .symbol = symbol, - .depth = depth, - .field = 0, - .capture_ids = {NONE, NONE, NONE}, - .alternative_index = NONE, - .contains_captures = false, - .is_last_child = false, - .is_pass_through = false, - .is_dead_end = false, - .is_immediate = is_immediate, - .alternative_is_immediate = false, - }; -} - -static void query_step__add_capture(QueryStep *self, uint16_t capture_id) { - for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { - if (self->capture_ids[i] == NONE) { - self->capture_ids[i] = capture_id; - break; - } - } -} - -static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) { - for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { - if (self->capture_ids[i] == capture_id) { - self->capture_ids[i] = NONE; - while (i + 1 < MAX_STEP_CAPTURE_COUNT) { - if (self->capture_ids[i + 1] == NONE) break; - self->capture_ids[i] = self->capture_ids[i + 1]; - self->capture_ids[i + 1] = NONE; - i++; - } - break; - } - } -} - -/********* - * Query - *********/ - -// The `pattern_map` contains a mapping from TSSymbol values to indices in the -// `steps` array. For a given syntax node, the `pattern_map` makes it possible -// to quickly find the starting steps of all of the patterns whose root matches -// that node. Each entry has two fields: a `pattern_index`, which identifies one -// of the patterns in the query, and a `step_index`, which indicates the start -// offset of that pattern's steps within the `steps` array. 
-// -// The entries are sorted by the patterns' root symbols, and lookups use a -// binary search. This ensures that the cost of this initial lookup step -// scales logarithmically with the number of patterns in the query. -// -// This returns `true` if the symbol is present and `false` otherwise. -// If the symbol is not present `*result` is set to the index where the -// symbol should be inserted. -static inline bool ts_query__pattern_map_search( - const TSQuery *self, - TSSymbol needle, - uint32_t *result -) { - uint32_t base_index = self->wildcard_root_pattern_count; - uint32_t size = self->pattern_map.size - base_index; - if (size == 0) { - *result = base_index; - return false; - } - while (size > 1) { - uint32_t half_size = size / 2; - uint32_t mid_index = base_index + half_size; - TSSymbol mid_symbol = self->steps.contents[ - self->pattern_map.contents[mid_index].step_index - ].symbol; - if (needle > mid_symbol) base_index = mid_index; - size -= half_size; - } - - TSSymbol symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; - - if (needle > symbol) { - base_index++; - if (base_index < self->pattern_map.size) { - symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; - } - } - - *result = base_index; - return needle == symbol; -} - -// Insert a new pattern's start index into the pattern map, maintaining -// the pattern map's ordering invariant. -static inline void ts_query__pattern_map_insert( - TSQuery *self, - TSSymbol symbol, - uint32_t start_step_index, - uint32_t pattern_index -) { - uint32_t index; - ts_query__pattern_map_search(self, symbol, &index); - - // Ensure that the entries are sorted not only by symbol, but also - // by pattern_index. This way, states for earlier patterns will be - // initiated first, which allows the ordering of the states array - // to be maintained more efficiently. 
- while (index < self->pattern_map.size) { - PatternEntry *entry = &self->pattern_map.contents[index]; - if ( - self->steps.contents[entry->step_index].symbol == symbol && - entry->pattern_index < pattern_index - ) { - index++; - } else { - break; - } - } - - array_insert(&self->pattern_map, index, ((PatternEntry) { - .step_index = start_step_index, - .pattern_index = pattern_index, - })); -} - -static void ts_query__finalize_steps(TSQuery *self) { - for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; - uint32_t depth = step->depth; - if (step->capture_ids[0] != NONE) { - step->contains_captures = true; - } else { - step->contains_captures = false; - for (unsigned j = i + 1; j < self->steps.size; j++) { - QueryStep *s = &self->steps.contents[j]; - if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; - if (s->capture_ids[0] != NONE) step->contains_captures = true; - } - } - } -} - -// Parse a single predicate associated with a pattern, adding it to the -// query's internal `predicate_steps` array. Predicates are arbitrary -// S-expressions associated with a pattern which are meant to be handled at -// a higher level of abstraction, such as the Rust/JavaScript bindings. They -// can contain '@'-prefixed capture names, double-quoted strings, and bare -// symbols, which also represent strings. 
-static TSQueryError ts_query__parse_predicate( - TSQuery *self, - Stream *stream -) { - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *predicate_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - predicate_name; - uint16_t id = symbol_table_insert_name( - &self->predicate_values, - predicate_name, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - stream_skip_whitespace(stream); - - for (;;) { - if (stream->next == ')') { - stream_advance(stream); - stream_skip_whitespace(stream); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeDone, - .value_id = 0, - })); - break; - } - - // Parse an '@'-prefixed capture name - else if (stream->next == '@') { - stream_advance(stream); - - // Parse the capture name - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; - - // Add the capture id to the first step of the pattern - int capture_id = symbol_table_id_for_name( - &self->captures, - capture_name, - length - ); - if (capture_id == -1) { - stream_reset(stream, capture_name); - return TSQueryErrorCapture; - } - - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeCapture, - .value_id = capture_id, - })); - } - - // Parse a string literal - else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - bool is_escaped = false; - const char *string_content = stream->input; - for (;;) { - if (is_escaped) { - is_escaped = false; - } else { - if (stream->next == '\\') { - is_escaped = true; - } else if 
(stream->next == '"') { - break; - } else if (stream->next == '\n') { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node - uint16_t id = symbol_table_insert_name_with_escapes( - &self->predicate_values, - string_content, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); - } - - // Parse a bare symbol - else if (stream_is_ident_start(stream)) { - const char *symbol_start = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - symbol_start; - uint16_t id = symbol_table_insert_name( - &self->predicate_values, - symbol_start, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - } - - else { - return TSQueryErrorSyntax; - } - - stream_skip_whitespace(stream); - } - - return 0; -} - -// Read one S-expression pattern from the stream, and incorporate it into -// the query's internal state machine representation. For nested patterns, -// this function calls itself recursively. -static TSQueryError ts_query__parse_pattern( - TSQuery *self, - Stream *stream, - uint32_t depth, - uint32_t *capture_count, - bool is_immediate -) { - const uint32_t starting_step_index = self->steps.size; - - if (stream->next == 0) return TSQueryErrorSyntax; - - // Finish the parent S-expression. - if (stream->next == ')' || stream->next == ']') { - return PARENT_DONE; - } - - // An open bracket is the start of an alternation. 
- else if (stream->next == '[') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Parse each branch, and add a placeholder step in between the branches. - Array(uint32_t) branch_step_indices = array_new(); - for (;;) { - uint32_t start_index = self->steps.size; - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - is_immediate - ); - - if (e == PARENT_DONE && stream->next == ']' && branch_step_indices.size > 0) { - stream_advance(stream); - break; - } else if (e) { - array_delete(&branch_step_indices); - return e; - } - - array_push(&branch_step_indices, start_index); - array_push(&self->steps, query_step__new(0, depth, false)); - } - (void)array_pop(&self->steps); - - // For all of the branches except for the last one, add the subsequent branch as an - // alternative, and link the end of the branch to the current end of the steps. - for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { - uint32_t step_index = branch_step_indices.contents[i]; - uint32_t next_step_index = branch_step_indices.contents[i + 1]; - QueryStep *start_step = &self->steps.contents[step_index]; - QueryStep *end_step = &self->steps.contents[next_step_index - 1]; - start_step->alternative_index = next_step_index; - end_step->alternative_index = self->steps.size; - end_step->is_dead_end = true; - } - - array_delete(&branch_step_indices); - } - - // An open parenthesis can be the start of three possible constructs: - // * A grouped sequence - // * A predicate - // * A named node - else if (stream->next == '(') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // If this parenthesis is followed by a node, then it represents a grouped sequence. 
- if (stream->next == '(' || stream->next == '"' || stream->next == '[') { - bool child_is_immediate = false; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; - stream_advance(stream); - stream_skip_whitespace(stream); - } - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE && stream->next == ')') { - stream_advance(stream); - break; - } else if (e) { - return e; - } - - child_is_immediate = false; - } - } - - // A dot/pound character indicates the start of a predicate. - else if (stream->next == '.' || stream->next == '#') { - stream_advance(stream); - return ts_query__parse_predicate(self, stream); - } - - // Otherwise, this parenthesis is the start of a named node. - else { - TSSymbol symbol; - - // Parse the wildcard symbol - if ( - stream->next == '_' || - - // TODO - remove. - // For temporary backward compatibility, handle '*' as a wildcard. - stream->next == '*' - ) { - symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; - stream_advance(stream); - } - - // Parse a normal node name - else if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - node_name; - - // TODO - remove. - // For temporary backward compatibility, handle predicates without the leading '#' sign. - if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { - stream_reset(stream, node_name); - return ts_query__parse_predicate(self, stream); - } - - symbol = ts_language_symbol_for_name( - self->language, - node_name, - length, - true - ); - if (!symbol) { - stream_reset(stream, node_name); - return TSQueryErrorNodeType; - } - } else { - return TSQueryErrorSyntax; - } - - // Add a step for the node. 
- array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); - - // Parse the child patterns - stream_skip_whitespace(stream); - bool child_is_immediate = false; - uint16_t child_start_step_index = self->steps.size; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; - stream_advance(stream); - stream_skip_whitespace(stream); - } - - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth + 1, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE && stream->next == ')') { - if (child_is_immediate) { - self->steps.contents[child_start_step_index].is_last_child = true; - } - stream_advance(stream); - break; - } else if (e) { - return e; - } - - child_is_immediate = false; - } - } - } - - // Parse a wildcard pattern - else if ( - stream->next == '_' || - - // TODO remove. - // For temporary backward compatibility, handle '*' as a wildcard. - stream->next == '*' - ) { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Add a step that matches any kind of node - array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); - } - - // Parse a double-quoted anonymous leaf node expression - else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - const char *string_content = stream->input; - while (stream->next != '"') { - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node - TSSymbol symbol = ts_language_symbol_for_name( - self->language, - string_content, - length, - false - ); - if (!symbol) { - stream_reset(stream, string_content); - return TSQueryErrorNodeType; - } - array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); - } - - // Parse a field-prefixed pattern - else if (stream_is_ident_start(stream)) { - 
// Parse the field name - const char *field_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - field_name; - stream_skip_whitespace(stream); - - if (stream->next != ':') { - stream_reset(stream, field_name); - return TSQueryErrorSyntax; - } - stream_advance(stream); - stream_skip_whitespace(stream); - - // Parse the pattern - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - is_immediate - ); - if (e == PARENT_DONE) return TSQueryErrorSyntax; - if (e) return e; - - // Add the field name to the first step of the pattern - TSFieldId field_id = ts_language_field_id_for_name( - self->language, - field_name, - length - ); - if (!field_id) { - stream->input = field_name; - return TSQueryErrorField; - } - - uint32_t step_index = starting_step_index; - QueryStep *step = &self->steps.contents[step_index]; - for (;;) { - step->field = field_id; - if ( - step->alternative_index != NONE && - step->alternative_index > step_index && - step->alternative_index < self->steps.size - ) { - step_index = step->alternative_index; - step = &self->steps.contents[step_index]; - } else { - break; - } - } - } - - else { - return TSQueryErrorSyntax; - } - - stream_skip_whitespace(stream); - - // Parse suffixes modifiers for this pattern - for (;;) { - QueryStep *step = &self->steps.contents[starting_step_index]; - - // Parse the one-or-more operator. - if (stream->next == '+') { - stream_advance(stream); - stream_skip_whitespace(stream); - - QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); - repeat_step.alternative_index = starting_step_index; - repeat_step.is_pass_through = true; - repeat_step.alternative_is_immediate = true; - array_push(&self->steps, repeat_step); - } - - // Parse the zero-or-more repetition operator. 
- else if (stream->next == '*') { - stream_advance(stream); - stream_skip_whitespace(stream); - - QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); - repeat_step.alternative_index = starting_step_index; - repeat_step.is_pass_through = true; - repeat_step.alternative_is_immediate = true; - array_push(&self->steps, repeat_step); - - while (step->alternative_index != NONE) { - step = &self->steps.contents[step->alternative_index]; - } - step->alternative_index = self->steps.size; - } - - // Parse the optional operator. - else if (stream->next == '?') { - stream_advance(stream); - stream_skip_whitespace(stream); - - while (step->alternative_index != NONE) { - step = &self->steps.contents[step->alternative_index]; - } - step->alternative_index = self->steps.size; - } - - // Parse an '@'-prefixed capture pattern - else if (stream->next == '@') { - stream_advance(stream); - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; - stream_skip_whitespace(stream); - - // Add the capture id to the first step of the pattern - uint16_t capture_id = symbol_table_insert_name( - &self->captures, - capture_name, - length - ); - - uint32_t step_index = starting_step_index; - for (;;) { - query_step__add_capture(step, capture_id); - if ( - step->alternative_index != NONE && - step->alternative_index > step_index && - step->alternative_index < self->steps.size - ) { - step_index = step->alternative_index; - step = &self->steps.contents[step_index]; - } else { - break; - } - } - - (*capture_count)++; - } - - // No more suffix modifiers - else { - break; - } - } - - return 0; -} - -TSQuery *ts_query_new( - const TSLanguage *language, - const char *source, - uint32_t source_len, - uint32_t *error_offset, - TSQueryError *error_type -) { - TSSymbol *symbol_map; - if (ts_language_version(language) >= 
TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { - symbol_map = NULL; - } else { - // Work around the fact that multiple symbols can currently be - // associated with the same name, due to "simple aliases". - // In the next language ABI version, this map will be contained - // in the language's `public_symbol_map` field. - uint32_t symbol_count = ts_language_symbol_count(language); - symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count); - for (unsigned i = 0; i < symbol_count; i++) { - const char *name = ts_language_symbol_name(language, i); - const TSSymbolType symbol_type = ts_language_symbol_type(language, i); - - symbol_map[i] = i; - - for (unsigned j = 0; j < i; j++) { - if (ts_language_symbol_type(language, j) == symbol_type) { - if (!strcmp(name, ts_language_symbol_name(language, j))) { - symbol_map[i] = j; - break; - } - } - } - } - } - - TSQuery *self = ts_malloc(sizeof(TSQuery)); - *self = (TSQuery) { - .steps = array_new(), - .pattern_map = array_new(), - .captures = symbol_table_new(), - .predicate_values = symbol_table_new(), - .predicate_steps = array_new(), - .predicates_by_pattern = array_new(), - .symbol_map = symbol_map, - .wildcard_root_pattern_count = 0, - .language = language, - }; - - // Parse all of the S-expressions in the given string. - Stream stream = stream_new(source, source_len); - stream_skip_whitespace(&stream); - while (stream.input < stream.end) { - uint32_t pattern_index = self->predicates_by_pattern.size; - uint32_t start_step_index = self->steps.size; - uint32_t capture_count = 0; - array_push(&self->start_bytes_by_pattern, stream.input - source); - array_push(&self->predicates_by_pattern, ((Slice) { - .offset = self->predicate_steps.size, - .length = 0, - })); - *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count, false); - array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false)); - - // If any pattern could not be parsed, then report the error information - // and terminate. 
- if (*error_type) { - if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax; - *error_offset = stream.input - source; - ts_query_delete(self); - return NULL; - } - - // If a pattern has a wildcard at its root, optimize the matching process - // by skipping matching the wildcard. - if ( - self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL - ) { - QueryStep *second_step = &self->steps.contents[start_step_index + 1]; - if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth != PATTERN_DONE_MARKER) { - start_step_index += 1; - } - } - - // Maintain a map that can look up patterns for a given root symbol. - for (;;) { - QueryStep *step = &self->steps.contents[start_step_index]; - ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); - if (step->symbol == WILDCARD_SYMBOL) { - self->wildcard_root_pattern_count++; - } - - // If there are alternatives or options at the root of the pattern, - // then add multiple entries to the pattern map. 
- if (step->alternative_index != NONE) { - start_step_index = step->alternative_index; - step->alternative_index = NONE; - } else { - break; - } - } - } - - ts_query__finalize_steps(self); - return self; -} - -void ts_query_delete(TSQuery *self) { - if (self) { - array_delete(&self->steps); - array_delete(&self->pattern_map); - array_delete(&self->predicate_steps); - array_delete(&self->predicates_by_pattern); - array_delete(&self->start_bytes_by_pattern); - symbol_table_delete(&self->captures); - symbol_table_delete(&self->predicate_values); - ts_free(self->symbol_map); - ts_free(self); - } -} - -uint32_t ts_query_pattern_count(const TSQuery *self) { - return self->predicates_by_pattern.size; -} - -uint32_t ts_query_capture_count(const TSQuery *self) { - return self->captures.slices.size; -} - -uint32_t ts_query_string_count(const TSQuery *self) { - return self->predicate_values.slices.size; -} - -const char *ts_query_capture_name_for_id( - const TSQuery *self, - uint32_t index, - uint32_t *length -) { - return symbol_table_name_for_id(&self->captures, index, length); -} - -const char *ts_query_string_value_for_id( - const TSQuery *self, - uint32_t index, - uint32_t *length -) { - return symbol_table_name_for_id(&self->predicate_values, index, length); -} - -const TSQueryPredicateStep *ts_query_predicates_for_pattern( - const TSQuery *self, - uint32_t pattern_index, - uint32_t *step_count -) { - Slice slice = self->predicates_by_pattern.contents[pattern_index]; - *step_count = slice.length; - if (self->predicate_steps.contents == NULL) { - return NULL; - } - return &self->predicate_steps.contents[slice.offset]; -} - -uint32_t ts_query_start_byte_for_pattern( - const TSQuery *self, - uint32_t pattern_index -) { - return self->start_bytes_by_pattern.contents[pattern_index]; -} - -void ts_query_disable_capture( - TSQuery *self, - const char *name, - uint32_t length -) { - // Remove capture information for any pattern step that previously - // captured with the given 
name. - int id = symbol_table_id_for_name(&self->captures, name, length); - if (id != -1) { - for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; - query_step__remove_capture(step, id); - } - ts_query__finalize_steps(self); - } -} - -void ts_query_disable_pattern( - TSQuery *self, - uint32_t pattern_index -) { - // Remove the given pattern from the pattern map. Its steps will still - // be in the `steps` array, but they will never be read. - for (unsigned i = 0; i < self->pattern_map.size; i++) { - PatternEntry *pattern = &self->pattern_map.contents[i]; - if (pattern->pattern_index == pattern_index) { - array_erase(&self->pattern_map, i); - i--; - } - } -} - -/*************** - * QueryCursor - ***************/ - -TSQueryCursor *ts_query_cursor_new(void) { - TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); - *self = (TSQueryCursor) { - .ascending = false, - .halted = false, - .states = array_new(), - .finished_states = array_new(), - .capture_list_pool = capture_list_pool_new(), - .start_byte = 0, - .end_byte = UINT32_MAX, - .start_point = {0, 0}, - .end_point = POINT_MAX, - }; - array_reserve(&self->states, 8); - array_reserve(&self->finished_states, 8); - return self; -} - -void ts_query_cursor_delete(TSQueryCursor *self) { - array_delete(&self->states); - array_delete(&self->finished_states); - ts_tree_cursor_delete(&self->cursor); - capture_list_pool_delete(&self->capture_list_pool); - ts_free(self); -} - -void ts_query_cursor_exec( - TSQueryCursor *self, - const TSQuery *query, - TSNode node -) { - array_clear(&self->states); - array_clear(&self->finished_states); - ts_tree_cursor_reset(&self->cursor, node); - capture_list_pool_reset(&self->capture_list_pool); - self->next_state_id = 0; - self->depth = 0; - self->ascending = false; - self->halted = false; - self->query = query; -} - -void ts_query_cursor_set_byte_range( - TSQueryCursor *self, - uint32_t start_byte, - uint32_t end_byte -) { - if (end_byte == 0) 
{ - start_byte = 0; - end_byte = UINT32_MAX; - } - self->start_byte = start_byte; - self->end_byte = end_byte; -} - -void ts_query_cursor_set_point_range( - TSQueryCursor *self, - TSPoint start_point, - TSPoint end_point -) { - if (end_point.row == 0 && end_point.column == 0) { - start_point = POINT_ZERO; - end_point = POINT_MAX; - } - self->start_point = start_point; - self->end_point = end_point; -} - -// Search through all of the in-progress states, and find the captured -// node that occurs earliest in the document. -static bool ts_query_cursor__first_in_progress_capture( - TSQueryCursor *self, - uint32_t *state_index, - uint32_t *byte_offset, - uint32_t *pattern_index -) { - bool result = false; - *state_index = UINT32_MAX; - *byte_offset = UINT32_MAX; - *pattern_index = UINT32_MAX; - for (unsigned i = 0; i < self->states.size; i++) { - const QueryState *state = &self->states.contents[i]; - if (state->dead) continue; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - if (captures->size > 0) { - uint32_t capture_byte = ts_node_start_byte(captures->contents[0].node); - if ( - !result || - capture_byte < *byte_offset || - (capture_byte == *byte_offset && state->pattern_index < *pattern_index) - ) { - result = true; - *state_index = i; - *byte_offset = capture_byte; - *pattern_index = state->pattern_index; - } - } - } - return result; -} - -// Determine which node is first in a depth-first traversal -int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { - if (left.id != right.id) { - uint32_t left_start = ts_node_start_byte(left); - uint32_t right_start = ts_node_start_byte(right); - if (left_start < right_start) return -1; - if (left_start > right_start) return 1; - uint32_t left_node_count = ts_node_end_byte(left); - uint32_t right_node_count = ts_node_end_byte(right); - if (left_node_count > right_node_count) return -1; - if (left_node_count < right_node_count) return 1; - } - return 0; 
-} - -// Determine if either state contains a superset of the other state's captures. -void ts_query_cursor__compare_captures( - TSQueryCursor *self, - QueryState *left_state, - QueryState *right_state, - bool *left_contains_right, - bool *right_contains_left -) { - const CaptureList *left_captures = capture_list_pool_get( - &self->capture_list_pool, - left_state->capture_list_id - ); - const CaptureList *right_captures = capture_list_pool_get( - &self->capture_list_pool, - right_state->capture_list_id - ); - *left_contains_right = true; - *right_contains_left = true; - unsigned i = 0, j = 0; - for (;;) { - if (i < left_captures->size) { - if (j < right_captures->size) { - TSQueryCapture *left = &left_captures->contents[i]; - TSQueryCapture *right = &right_captures->contents[j]; - if (left->node.id == right->node.id && left->index == right->index) { - i++; - j++; - } else { - switch (ts_query_cursor__compare_nodes(left->node, right->node)) { - case -1: - *right_contains_left = false; - i++; - break; - case 1: - *left_contains_right = false; - j++; - break; - default: - *right_contains_left = false; - *left_contains_right = false; - i++; - j++; - break; - } - } - } else { - *right_contains_left = false; - break; - } - } else { - if (j < right_captures->size) { - *left_contains_right = false; - } - break; - } - } -} - -static void ts_query_cursor__add_state( - TSQueryCursor *self, - const PatternEntry *pattern -) { - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - uint32_t start_depth = self->depth - step->depth; - - // Keep the states array in ascending order of start_depth and pattern_index, - // so that it can be processed more efficiently elsewhere. Usually, there is - // no work to do here because of two facts: - // * States with lower start_depth are naturally added first due to the - // order in which nodes are visited. 
- // * Earlier patterns are naturally added first because of the ordering of the - // pattern_map data structure that's used to initiate matches. - // - // This loop is only needed in cases where two conditions hold: - // * A pattern consists of more than one sibling node, so that its states - // remain in progress after exiting the node that started the match. - // * The first node in the pattern matches against multiple nodes at the - // same depth. - // - // An example of this is the pattern '((comment)* (function))'. If multiple - // `comment` nodes appear in a row, then we may initiate a new state for this - // pattern while another state for the same pattern is already in progress. - // If there are multiple patterns like this in a query, then this loop will - // need to execute in order to keep the states ordered by pattern_index. - uint32_t index = self->states.size; - while (index > 0) { - QueryState *prev_state = &self->states.contents[index - 1]; - if (prev_state->start_depth < start_depth) break; - if (prev_state->start_depth == start_depth) { - if (prev_state->pattern_index < pattern->pattern_index) break; - if (prev_state->pattern_index == pattern->pattern_index) { - // Avoid unnecessarily inserting an unnecessary duplicate state, - // which would be immediately pruned by the longest-match criteria. - if (prev_state->step_index == pattern->step_index) return; - } - } - index--; - } - - LOG( - " start state. pattern:%u, step:%u\n", - pattern->pattern_index, - pattern->step_index - ); - array_insert(&self->states, index, ((QueryState) { - .capture_list_id = NONE, - .step_index = pattern->step_index, - .pattern_index = pattern->pattern_index, - .start_depth = start_depth, - .consumed_capture_count = 0, - .seeking_immediate_match = true, - .has_in_progress_alternatives = false, - .dead = false, - })); -} - -// Acquire a capture list for this state. 
If there are no capture lists left in the -// pool, this will steal the capture list from another existing state, and mark that -// other state as 'dead'. -static CaptureList *ts_query_cursor__prepare_to_capture( - TSQueryCursor *self, - QueryState *state, - unsigned state_index_to_preserve -) { - if (state->capture_list_id == NONE) { - state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. - if (state->capture_list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if ( - ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - ) && - state_index != state_index_to_preserve - ) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - QueryState *other_state = &self->states.contents[state_index]; - state->capture_list_id = other_state->capture_list_id; - other_state->capture_list_id = NONE; - other_state->dead = true; - CaptureList *list = capture_list_pool_get_mut( - &self->capture_list_pool, - state->capture_list_id - ); - array_clear(list); - return list; - } else { - LOG(" ran out of capture lists"); - return NULL; - } - } - } - return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); -} - -// Duplicate the given state and insert the newly-created state immediately after -// the given state in the `states` array. Ensures that the given state reference is -// still valid, even if the states array is reallocated. -static QueryState *ts_query_cursor__copy_state( - TSQueryCursor *self, - QueryState **state_ref -) { - const QueryState *state = *state_ref; - uint32_t state_index = state - self->states.contents; - QueryState copy = *state; - copy.capture_list_id = NONE; - - // If the state has captures, copy its capture list. 
- if (state->capture_list_id != NONE) { - CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index); - if (!new_captures) return NULL; - const CaptureList *old_captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - array_push_all(new_captures, old_captures); - } - - array_insert(&self->states, state_index + 1, copy); - *state_ref = &self->states.contents[state_index]; - return &self->states.contents[state_index + 1]; -} - -// Walk the tree, processing patterns until at least one pattern finishes, -// If one or more patterns finish, return `true` and store their states in the -// `finished_states` array. Multiple patterns can finish on the same node. If -// there are no more matches, return `false`. -static inline bool ts_query_cursor__advance(TSQueryCursor *self) { - bool did_match = false; - for (;;) { - if (self->halted) { - while (self->states.size > 0) { - QueryState state = array_pop(&self->states); - capture_list_pool_release( - &self->capture_list_pool, - state.capture_list_id - ); - } - } - - if (did_match || self->halted) return did_match; - - if (self->ascending) { - LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); - - // Leave this node by stepping to its next sibling or to its parent. - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { - self->halted = true; - } - - // After leaving a node, remove any states that cannot make further progress. - uint32_t deleted_count = 0; - for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - - // If a state completed its pattern inside of this node, but was deferred from finishing - // in order to search for longer matches, mark it as finished. 
- if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || self->halted) { - LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; - array_push(&self->finished_states, *state); - did_match = true; - deleted_count++; - continue; - } - } - - // If a state needed to match something within this node, then remove that state - // as it has failed to match. - else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { - LOG( - " failed to match. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - deleted_count++; - continue; - } - - if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; - } - } - self->states.size -= deleted_count; - } else { - // If this node is before the selected range, then avoid descending into it. - TSNode node = ts_tree_cursor_current_node(&self->cursor); - if ( - ts_node_end_byte(node) <= self->start_byte || - point_lte(ts_node_end_point(node), self->start_point) - ) { - if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = true; - } - continue; - } - - // If this node is after the selected range, then stop walking. - if ( - self->end_byte <= ts_node_start_byte(node) || - point_lte(self->end_point, ts_node_start_point(node)) - ) { - self->halted = true; - continue; - } - - // Get the properties of the current node. - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - if (symbol != ts_builtin_sym_error && self->query->symbol_map) { - symbol = self->query->symbol_map[symbol]; - } - bool can_have_later_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = ts_tree_cursor_current_status( - &self->cursor, - &can_have_later_siblings, - &can_have_later_siblings_with_this_field - ); - LOG( - "enter node. 
type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", - ts_node_type(node), - ts_language_field_name_for_id(self->query->language, field_id), - ts_node_start_point(node).row, - self->states.size, - self->finished_states.size - ); - - // Add new states for any patterns whose root node is a wildcard. - for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if (step->field && field_id != step->field) continue; - ts_query_cursor__add_state(self, pattern); - } - - // Add new states for any patterns whose root node matches this node. - unsigned i; - if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - do { - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if (step->field && field_id != step->field) continue; - ts_query_cursor__add_state(self, pattern); - - // Advance to the next pattern whose root node matches this node. - i++; - if (i == self->query->pattern_map.size) break; - pattern = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[pattern->step_index]; - } while (step->symbol == symbol); - } - - // Update all of the in-progress states with current node. - for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - state->has_in_progress_alternatives = false; - copy_count = 0; - - // Check that the node matches all of the criteria for the next - // step of the pattern. 
- if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; - - // Determine if this node matches this step of the pattern, and also - // if this node can have later siblings that match this step of the - // pattern. - bool node_does_match = - step->symbol == symbol || - step->symbol == WILDCARD_SYMBOL || - (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); - bool later_sibling_can_match = can_have_later_siblings; - if ((step->is_immediate && is_named) || state->seeking_immediate_match) { - later_sibling_can_match = false; - } - if (step->is_last_child && can_have_later_siblings) { - node_does_match = false; - } - if (step->field) { - if (step->field == field_id) { - if (!can_have_later_siblings_with_this_field) { - later_sibling_can_match = false; - } - } else { - node_does_match = false; - } - } - - // Remove states immediately if it is ever clear that they cannot match. - if (!node_does_match) { - if (!later_sibling_can_match) { - LOG( - " discard state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->states, i); - i--; - } - continue; - } - - // Some patterns can match their root node in multiple ways, capturing different - // children. If this pattern step could match later children within the same - // parent, then this query state cannot simply be updated in place. It must be - // split into two states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later siblings. - if (later_sibling_can_match && step->contains_captures) { - if (ts_query_cursor__copy_state(self, &state)) { - LOG( - " split state for capture. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - copy_count++; - } - } - - // If the current node is captured in this pattern, add it to the capture list. 
- if (step->capture_ids[0] != NONE) { - CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); - if (!capture_list) { - array_erase(&self->states, i); - i--; - continue; - } - - for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { - uint16_t capture_id = step->capture_ids[j]; - if (step->capture_ids[j] == NONE) break; - array_push(capture_list, ((TSQueryCapture) { node, capture_id })); - LOG( - " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", - state->pattern_index, - capture_id, - capture_list->size - ); - } - } - - // Advance this state to the next step of its pattern. - state->step_index++; - state->seeking_immediate_match = false; - LOG( - " advance state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - - // If this state's next step has an alternative step, then copy the state in order - // to pursue both alternatives. The alternative step itself may have an alternative, - // so this is an interative process. - unsigned end_index = i + 1; - for (unsigned j = i; j < end_index; j++) { - QueryState *state = &self->states.contents[j]; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->alternative_index != NONE) { - if (next_step->is_dead_end) { - state->step_index = next_step->alternative_index; - j--; - continue; - } - - if (next_step->is_pass_through) { - state->step_index++; - j--; - } - - QueryState *copy = ts_query_cursor__copy_state(self, &state); - if (copy) { - LOG( - " split state for branch. 
pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", - copy->pattern_index, - copy->step_index, - next_step->alternative_index, - next_step->alternative_is_immediate, - capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size - ); - end_index++; - copy_count++; - copy->step_index = next_step->alternative_index; - if (next_step->alternative_is_immediate) { - copy->seeking_immediate_match = true; - } - } - } - } - } - - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } - - // Enfore the longest-match criteria. When a query pattern contains optional or - // repeated nodes, this is necessary to avoid multiple redundant states, where - // one state has a strict subset of another state's captures. - bool did_remove = false; - for (unsigned j = i + 1; j < self->states.size; j++) { - QueryState *other_state = &self->states.contents[j]; - - // Query states are kept in ascending order of start_depth and pattern_index. - // Since the longest-match criteria is only used for deduping matches of the same - // pattern and root node, we only need to perform pairwise comparisons within a - // small slice of the states array. - if ( - other_state->start_depth != state->start_depth || - other_state->pattern_index != state->pattern_index - ) break; - - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. 
pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; - } - other_state->has_in_progress_alternatives = true; - } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - i--; - did_remove = true; - break; - } - state->has_in_progress_alternatives = true; - } - } - - // If there the state is at the end of its pattern, remove it from the list - // of in-progress states and add it to the list of finished states. - if (!did_remove) { - LOG( - " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", - state->pattern_index, - state->start_depth, - state->step_index, - capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size - ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->depth == PATTERN_DONE_MARKER) { - if (state->has_in_progress_alternatives) { - LOG(" defer finishing pattern %u\n", state->pattern_index); - } else { - LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; - array_push(&self->finished_states, *state); - array_erase(&self->states, state - self->states.contents); - did_match = true; - i--; - } - } - } - } - - // Continue descending if possible. 
- if (ts_tree_cursor_goto_first_child(&self->cursor)) { - self->depth++; - } else { - self->ascending = true; - } - } - } -} - -bool ts_query_cursor_next_match( - TSQueryCursor *self, - TSQueryMatch *match -) { - if (self->finished_states.size == 0) { - if (!ts_query_cursor__advance(self)) { - return false; - } - } - - QueryState *state = &self->finished_states.contents[0]; - match->id = state->id; - match->pattern_index = state->pattern_index; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - match->captures = captures->contents; - match->capture_count = captures->size; - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->finished_states, 0); - return true; -} - -void ts_query_cursor_remove_match( - TSQueryCursor *self, - uint32_t match_id -) { - for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; - if (state->id == match_id) { - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->finished_states, i); - return; - } - } -} - -bool ts_query_cursor_next_capture( - TSQueryCursor *self, - TSQueryMatch *match, - uint32_t *capture_index -) { - for (;;) { - // The goal here is to return captures in order, even though they may not - // be discovered in order, because patterns can overlap. If there are any - // finished patterns, then try to find one that contains a capture that - // is *definitely* before any capture in an *unfinished* pattern. - if (self->finished_states.size > 0) { - // First, identify the position of the earliest capture in an unfinished - // match. For a finished capture to be returned, it must be *before* - // this position. 
- uint32_t first_unfinished_capture_byte; - uint32_t first_unfinished_pattern_index; - uint32_t first_unfinished_state_index; - ts_query_cursor__first_in_progress_capture( - self, - &first_unfinished_state_index, - &first_unfinished_capture_byte, - &first_unfinished_pattern_index - ); - - // Find the earliest capture in a finished match. - int first_finished_state_index = -1; - uint32_t first_finished_capture_byte = first_unfinished_capture_byte; - uint32_t first_finished_pattern_index = first_unfinished_pattern_index; - for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - if (captures->size > state->consumed_capture_count) { - uint32_t capture_byte = ts_node_start_byte( - captures->contents[state->consumed_capture_count].node - ); - if ( - capture_byte < first_finished_capture_byte || - ( - capture_byte == first_finished_capture_byte && - state->pattern_index < first_finished_pattern_index - ) - ) { - first_finished_state_index = i; - first_finished_capture_byte = capture_byte; - first_finished_pattern_index = state->pattern_index; - } - } else { - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->finished_states, i); - i--; - } - } - - // If there is finished capture that is clearly before any unfinished - // capture, then return its match, and its capture index. Internally - // record the fact that the capture has been 'consumed'. 
- if (first_finished_state_index != -1) { - QueryState *state = &self->finished_states.contents[ - first_finished_state_index - ]; - match->id = state->id; - match->pattern_index = state->pattern_index; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - match->captures = captures->contents; - match->capture_count = captures->size; - *capture_index = state->consumed_capture_count; - state->consumed_capture_count++; - return true; - } - - if (capture_list_pool_is_empty(&self->capture_list_pool)) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - first_unfinished_state_index, - first_unfinished_pattern_index, - first_unfinished_capture_byte - ); - capture_list_pool_release( - &self->capture_list_pool, - self->states.contents[first_unfinished_state_index].capture_list_id - ); - array_erase(&self->states, first_unfinished_state_index); - } - } - - // If there are no finished matches that are ready to be returned, then - // continue finding more matches. 
- if ( - !ts_query_cursor__advance(self) && - self->finished_states.size == 0 - ) return false; - } -} - -#undef LOG diff --git a/src/tree_sitter/reduce_action.h b/src/tree_sitter/reduce_action.h deleted file mode 100644 index 72aff08d73..0000000000 --- a/src/tree_sitter/reduce_action.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef TREE_SITTER_REDUCE_ACTION_H_ -#define TREE_SITTER_REDUCE_ACTION_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "./array.h" -#include "tree_sitter/api.h" - -typedef struct { - uint32_t count; - TSSymbol symbol; - int dynamic_precedence; - unsigned short production_id; -} ReduceAction; - -typedef Array(ReduceAction) ReduceActionSet; - -static inline void ts_reduce_action_set_add(ReduceActionSet *self, - ReduceAction new_action) { - for (uint32_t i = 0; i < self->size; i++) { - ReduceAction action = self->contents[i]; - if (action.symbol == new_action.symbol && action.count == new_action.count) - return; - } - array_push(self, new_action); -} - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_REDUCE_ACTION_H_ diff --git a/src/tree_sitter/reusable_node.h b/src/tree_sitter/reusable_node.h deleted file mode 100644 index 9cba951909..0000000000 --- a/src/tree_sitter/reusable_node.h +++ /dev/null @@ -1,88 +0,0 @@ -#include "./subtree.h" - -typedef struct { - Subtree tree; - uint32_t child_index; - uint32_t byte_offset; -} StackEntry; - -typedef struct { - Array(StackEntry) stack; - Subtree last_external_token; -} ReusableNode; - -static inline ReusableNode reusable_node_new(void) { - return (ReusableNode) {array_new(), NULL_SUBTREE}; -} - -static inline void reusable_node_clear(ReusableNode *self) { - array_clear(&self->stack); - self->last_external_token = NULL_SUBTREE; -} - -static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { - reusable_node_clear(self); - array_push(&self->stack, ((StackEntry) { - .tree = tree, - .child_index = 0, - .byte_offset = 0, - })); -} - -static inline Subtree 
reusable_node_tree(ReusableNode *self) { - return self->stack.size > 0 - ? self->stack.contents[self->stack.size - 1].tree - : NULL_SUBTREE; -} - -static inline uint32_t reusable_node_byte_offset(ReusableNode *self) { - return self->stack.size > 0 - ? self->stack.contents[self->stack.size - 1].byte_offset - : UINT32_MAX; -} - -static inline void reusable_node_delete(ReusableNode *self) { - array_delete(&self->stack); -} - -static inline void reusable_node_advance(ReusableNode *self) { - StackEntry last_entry = *array_back(&self->stack); - uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree); - if (ts_subtree_has_external_tokens(last_entry.tree)) { - self->last_external_token = ts_subtree_last_external_token(last_entry.tree); - } - - Subtree tree; - uint32_t next_index; - do { - StackEntry popped_entry = array_pop(&self->stack); - next_index = popped_entry.child_index + 1; - if (self->stack.size == 0) return; - tree = array_back(&self->stack)->tree; - } while (ts_subtree_child_count(tree) <= next_index); - - array_push(&self->stack, ((StackEntry) { - .tree = tree.ptr->children[next_index], - .child_index = next_index, - .byte_offset = byte_offset, - })); -} - -static inline bool reusable_node_descend(ReusableNode *self) { - StackEntry last_entry = *array_back(&self->stack); - if (ts_subtree_child_count(last_entry.tree) > 0) { - array_push(&self->stack, ((StackEntry) { - .tree = last_entry.tree.ptr->children[0], - .child_index = 0, - .byte_offset = last_entry.byte_offset, - })); - return true; - } else { - return false; - } -} - -static inline void reusable_node_advance_past_leaf(ReusableNode *self) { - while (reusable_node_descend(self)) {} - reusable_node_advance(self); -} diff --git a/src/tree_sitter/stack.c b/src/tree_sitter/stack.c deleted file mode 100644 index 6a8d897c37..0000000000 --- a/src/tree_sitter/stack.c +++ /dev/null @@ -1,857 +0,0 @@ -#include "./alloc.h" -#include "./language.h" -#include "./subtree.h" -#include 
"./array.h" -#include "./stack.h" -#include "./length.h" -#include <assert.h> -#include <stdio.h> - -#define MAX_LINK_COUNT 8 -#define MAX_NODE_POOL_SIZE 50 -#define MAX_ITERATOR_COUNT 64 - -#if defined _WIN32 && !defined __GNUC__ -#define inline __forceinline -#else -#define inline static inline __attribute__((always_inline)) -#endif - -typedef struct StackNode StackNode; - -typedef struct { - StackNode *node; - Subtree subtree; - bool is_pending; -} StackLink; - -struct StackNode { - TSStateId state; - Length position; - StackLink links[MAX_LINK_COUNT]; - short unsigned int link_count; - uint32_t ref_count; - unsigned error_cost; - unsigned node_count; - int dynamic_precedence; -}; - -typedef struct { - StackNode *node; - SubtreeArray subtrees; - uint32_t subtree_count; - bool is_pending; -} StackIterator; - -typedef struct { - void *payload; - StackIterateCallback callback; -} StackIterateSession; - -typedef Array(StackNode *) StackNodeArray; - -typedef enum { - StackStatusActive, - StackStatusPaused, - StackStatusHalted, -} StackStatus; - -typedef struct { - StackNode *node; - Subtree last_external_token; - StackSummary *summary; - unsigned node_count_at_last_error; - TSSymbol lookahead_when_paused; - StackStatus status; -} StackHead; - -struct Stack { - Array(StackHead) heads; - StackSliceArray slices; - Array(StackIterator) iterators; - StackNodeArray node_pool; - StackNode *base_node; - SubtreePool *subtree_pool; -}; - -typedef unsigned StackAction; -enum { - StackActionNone, - StackActionStop = 1, - StackActionPop = 2, -}; - -typedef StackAction (*StackCallback)(void *, const StackIterator *); - -static void stack_node_retain(StackNode *self) { - if (!self) - return; - assert(self->ref_count > 0); - self->ref_count++; - assert(self->ref_count != 0); -} - -static void stack_node_release(StackNode *self, StackNodeArray *pool, SubtreePool *subtree_pool) { -recur: - assert(self->ref_count != 0); - self->ref_count--; - if (self->ref_count > 0) return; - - 
StackNode *first_predecessor = NULL; - if (self->link_count > 0) { - for (unsigned i = self->link_count - 1; i > 0; i--) { - StackLink link = self->links[i]; - if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); - stack_node_release(link.node, pool, subtree_pool); - } - StackLink link = self->links[0]; - if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); - first_predecessor = self->links[0].node; - } - - if (pool->size < MAX_NODE_POOL_SIZE) { - array_push(pool, self); - } else { - ts_free(self); - } - - if (first_predecessor) { - self = first_predecessor; - goto recur; - } -} - -static StackNode *stack_node_new(StackNode *previous_node, Subtree subtree, - bool is_pending, TSStateId state, StackNodeArray *pool) { - StackNode *node = pool->size > 0 ? - array_pop(pool) : - ts_malloc(sizeof(StackNode)); - *node = (StackNode){.ref_count = 1, .link_count = 0, .state = state}; - - if (previous_node) { - node->link_count = 1; - node->links[0] = (StackLink){ - .node = previous_node, - .subtree = subtree, - .is_pending = is_pending, - }; - - node->position = previous_node->position; - node->error_cost = previous_node->error_cost; - node->dynamic_precedence = previous_node->dynamic_precedence; - node->node_count = previous_node->node_count; - - if (subtree.ptr) { - node->error_cost += ts_subtree_error_cost(subtree); - node->position = length_add(node->position, ts_subtree_total_size(subtree)); - node->node_count += ts_subtree_node_count(subtree); - node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree); - } - } else { - node->position = length_zero(); - node->error_cost = 0; - } - - return node; -} - -static bool stack__subtree_is_equivalent(Subtree left, Subtree right) { - return - left.ptr == right.ptr || - (left.ptr && right.ptr && - ts_subtree_symbol(left) == ts_subtree_symbol(right) && - ((ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) || - (ts_subtree_padding(left).bytes == 
ts_subtree_padding(right).bytes && - ts_subtree_size(left).bytes == ts_subtree_size(right).bytes && - ts_subtree_child_count(left) == ts_subtree_child_count(right) && - ts_subtree_extra(left) == ts_subtree_extra(right) && - ts_subtree_external_scanner_state_eq(left, right)))); -} - -static void stack_node_add_link(StackNode *self, StackLink link, SubtreePool *subtree_pool) { - if (link.node == self) return; - - for (int i = 0; i < self->link_count; i++) { - StackLink *existing_link = &self->links[i]; - if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) { - // In general, we preserve ambiguities until they are removed from the stack - // during a pop operation where multiple paths lead to the same node. But in - // the special case where two links directly connect the same pair of nodes, - // we can safely remove the ambiguity ahead of time without changing behavior. - if (existing_link->node == link.node) { - if ( - ts_subtree_dynamic_precedence(link.subtree) > - ts_subtree_dynamic_precedence(existing_link->subtree) - ) { - ts_subtree_retain(link.subtree); - ts_subtree_release(subtree_pool, existing_link->subtree); - existing_link->subtree = link.subtree; - self->dynamic_precedence = - link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree); - } - return; - } - - // If the previous nodes are mergeable, merge them recursively. 
- if (existing_link->node->state == link.node->state && - existing_link->node->position.bytes == link.node->position.bytes) { - for (int j = 0; j < link.node->link_count; j++) { - stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool); - } - int32_t dynamic_precedence = link.node->dynamic_precedence; - if (link.subtree.ptr) { - dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); - } - if (dynamic_precedence > self->dynamic_precedence) { - self->dynamic_precedence = dynamic_precedence; - } - return; - } - } - } - - if (self->link_count == MAX_LINK_COUNT) return; - - stack_node_retain(link.node); - unsigned node_count = link.node->node_count; - int dynamic_precedence = link.node->dynamic_precedence; - self->links[self->link_count++] = link; - - if (link.subtree.ptr) { - ts_subtree_retain(link.subtree); - node_count += ts_subtree_node_count(link.subtree); - dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); - } - - if (node_count > self->node_count) self->node_count = node_count; - if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence; -} - -static void stack_head_delete(StackHead *self, StackNodeArray *pool, SubtreePool *subtree_pool) { - if (self->node) { - if (self->last_external_token.ptr) { - ts_subtree_release(subtree_pool, self->last_external_token); - } - if (self->summary) { - array_delete(self->summary); - ts_free(self->summary); - } - stack_node_release(self->node, pool, subtree_pool); - } -} - -static StackVersion ts_stack__add_version(Stack *self, StackVersion original_version, - StackNode *node) { - StackHead head = { - .node = node, - .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, - .last_external_token = self->heads.contents[original_version].last_external_token, - .status = StackStatusActive, - .lookahead_when_paused = 0, - }; - array_push(&self->heads, head); - stack_node_retain(node); - if 
(head.last_external_token.ptr) ts_subtree_retain(head.last_external_token); - return (StackVersion)(self->heads.size - 1); -} - -static void ts_stack__add_slice(Stack *self, StackVersion original_version, - StackNode *node, SubtreeArray *subtrees) { - for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { - StackVersion version = self->slices.contents[i].version; - if (self->heads.contents[version].node == node) { - StackSlice slice = {*subtrees, version}; - array_insert(&self->slices, i + 1, slice); - return; - } - } - - StackVersion version = ts_stack__add_version(self, original_version, node); - StackSlice slice = { *subtrees, version }; - array_push(&self->slices, slice); -} - -inline StackSliceArray stack__iter(Stack *self, StackVersion version, - StackCallback callback, void *payload, - int goal_subtree_count) { - array_clear(&self->slices); - array_clear(&self->iterators); - - StackHead *head = array_get(&self->heads, version); - StackIterator iterator = { - .node = head->node, - .subtrees = array_new(), - .subtree_count = 0, - .is_pending = true, - }; - - bool include_subtrees = false; - if (goal_subtree_count >= 0) { - include_subtrees = true; - array_reserve(&iterator.subtrees, goal_subtree_count); - } - - array_push(&self->iterators, iterator); - - while (self->iterators.size > 0) { - for (uint32_t i = 0, size = self->iterators.size; i < size; i++) { - StackIterator *iterator = &self->iterators.contents[i]; - StackNode *node = iterator->node; - - StackAction action = callback(payload, iterator); - bool should_pop = action & StackActionPop; - bool should_stop = action & StackActionStop || node->link_count == 0; - - if (should_pop) { - SubtreeArray subtrees = iterator->subtrees; - if (!should_stop) - ts_subtree_array_copy(subtrees, &subtrees); - ts_subtree_array_reverse(&subtrees); - ts_stack__add_slice( - self, - version, - node, - &subtrees - ); - } - - if (should_stop) { - if (!should_pop) - ts_subtree_array_delete(self->subtree_pool, 
&iterator->subtrees); - array_erase(&self->iterators, i); - i--, size--; - continue; - } - - for (uint32_t j = 1; j <= node->link_count; j++) { - StackIterator *next_iterator; - StackLink link; - if (j == node->link_count) { - link = node->links[0]; - next_iterator = &self->iterators.contents[i]; - } else { - if (self->iterators.size >= MAX_ITERATOR_COUNT) continue; - link = node->links[j]; - StackIterator current_iterator = self->iterators.contents[i]; - array_push(&self->iterators, current_iterator); - next_iterator = array_back(&self->iterators); - ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees); - } - - next_iterator->node = link.node; - if (link.subtree.ptr) { - if (include_subtrees) { - array_push(&next_iterator->subtrees, link.subtree); - ts_subtree_retain(link.subtree); - } - - if (!ts_subtree_extra(link.subtree)) { - next_iterator->subtree_count++; - if (!link.is_pending) { - next_iterator->is_pending = false; - } - } - } else { - next_iterator->subtree_count++; - next_iterator->is_pending = false; - } - } - } - } - - return self->slices; -} - -Stack *ts_stack_new(SubtreePool *subtree_pool) { - Stack *self = ts_calloc(1, sizeof(Stack)); - - array_init(&self->heads); - array_init(&self->slices); - array_init(&self->iterators); - array_init(&self->node_pool); - array_reserve(&self->heads, 4); - array_reserve(&self->slices, 4); - array_reserve(&self->iterators, 4); - array_reserve(&self->node_pool, MAX_NODE_POOL_SIZE); - - self->subtree_pool = subtree_pool; - self->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &self->node_pool); - ts_stack_clear(self); - - return self; -} - -void ts_stack_delete(Stack *self) { - if (self->slices.contents) - array_delete(&self->slices); - if (self->iterators.contents) - array_delete(&self->iterators); - stack_node_release(self->base_node, &self->node_pool, self->subtree_pool); - for (uint32_t i = 0; i < self->heads.size; i++) { - stack_head_delete(&self->heads.contents[i], &self->node_pool, 
self->subtree_pool); - } - array_clear(&self->heads); - if (self->node_pool.contents) { - for (uint32_t i = 0; i < self->node_pool.size; i++) - ts_free(self->node_pool.contents[i]); - array_delete(&self->node_pool); - } - array_delete(&self->heads); - ts_free(self); -} - -uint32_t ts_stack_version_count(const Stack *self) { - return self->heads.size; -} - -TSStateId ts_stack_state(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->node->state; -} - -Length ts_stack_position(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->node->position; -} - -Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->last_external_token; -} - -void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) { - StackHead *head = array_get(&self->heads, version); - if (token.ptr) ts_subtree_retain(token); - if (head->last_external_token.ptr) ts_subtree_release(self->subtree_pool, head->last_external_token); - head->last_external_token = token; -} - -unsigned ts_stack_error_cost(const Stack *self, StackVersion version) { - StackHead *head = array_get(&self->heads, version); - unsigned result = head->node->error_cost; - if ( - head->status == StackStatusPaused || - (head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr)) { - result += ERROR_COST_PER_RECOVERY; - } - return result; -} - -unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) { - StackHead *head = array_get(&self->heads, version); - if (head->node->node_count < head->node_count_at_last_error) { - head->node_count_at_last_error = head->node->node_count; - } - return head->node->node_count - head->node_count_at_last_error; -} - -void ts_stack_push(Stack *self, StackVersion version, Subtree subtree, - bool pending, TSStateId state) { - StackHead *head = array_get(&self->heads, version); - StackNode *new_node = 
stack_node_new(head->node, subtree, pending, state, &self->node_pool); - if (!subtree.ptr) head->node_count_at_last_error = new_node->node_count; - head->node = new_node; -} - -inline StackAction iterate_callback(void *payload, const StackIterator *iterator) { - StackIterateSession *session = payload; - session->callback( - session->payload, - iterator->node->state, - iterator->subtree_count - ); - return StackActionNone; -} - -void ts_stack_iterate(Stack *self, StackVersion version, - StackIterateCallback callback, void *payload) { - StackIterateSession session = {payload, callback}; - stack__iter(self, version, iterate_callback, &session, -1); -} - -inline StackAction pop_count_callback(void *payload, const StackIterator *iterator) { - unsigned *goal_subtree_count = payload; - if (iterator->subtree_count == *goal_subtree_count) { - return StackActionPop | StackActionStop; - } else { - return StackActionNone; - } -} - -StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) { - return stack__iter(self, version, pop_count_callback, &count, count); -} - -inline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { - (void)payload; - if (iterator->subtree_count >= 1) { - if (iterator->is_pending) { - return StackActionPop | StackActionStop; - } else { - return StackActionStop; - } - } else { - return StackActionNone; - } -} - -StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { - StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0); - if (pop.size > 0) { - ts_stack_renumber_version(self, pop.contents[0].version, version); - pop.contents[0].version = version; - } - return pop; -} - -inline StackAction pop_error_callback(void *payload, const StackIterator *iterator) { - if (iterator->subtrees.size > 0) { - bool *found_error = payload; - if (!*found_error && ts_subtree_is_error(iterator->subtrees.contents[0])) { - *found_error = true; - return StackActionPop | 
StackActionStop; - } else { - return StackActionStop; - } - } else { - return StackActionNone; - } -} - -SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) { - StackNode *node = array_get(&self->heads, version)->node; - for (unsigned i = 0; i < node->link_count; i++) { - if (node->links[i].subtree.ptr && ts_subtree_is_error(node->links[i].subtree)) { - bool found_error = false; - StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1); - if (pop.size > 0) { - assert(pop.size == 1); - ts_stack_renumber_version(self, pop.contents[0].version, version); - return pop.contents[0].subtrees; - } - break; - } - } - return (SubtreeArray){.size = 0}; -} - -inline StackAction pop_all_callback(void *payload, const StackIterator *iterator) { - (void)payload; - return iterator->node->link_count == 0 ? StackActionPop : StackActionNone; -} - -StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) { - return stack__iter(self, version, pop_all_callback, NULL, 0); -} - -typedef struct { - StackSummary *summary; - unsigned max_depth; -} SummarizeStackSession; - -inline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) { - SummarizeStackSession *session = payload; - TSStateId state = iterator->node->state; - unsigned depth = iterator->subtree_count; - if (depth > session->max_depth) return StackActionStop; - for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { - StackSummaryEntry entry = session->summary->contents[i]; - if (entry.depth < depth) break; - if (entry.depth == depth && entry.state == state) return StackActionNone; - } - array_push(session->summary, ((StackSummaryEntry){ - .position = iterator->node->position, - .depth = depth, - .state = state, - })); - return StackActionNone; -} - -void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) { - SummarizeStackSession session = { - .summary = ts_malloc(sizeof(StackSummary)), - .max_depth = max_depth - 
}; - array_init(session.summary); - stack__iter(self, version, summarize_stack_callback, &session, -1); - StackHead *head = &self->heads.contents[version]; - if (head->summary) { - array_delete(head->summary); - ts_free(head->summary); - } - head->summary = session.summary; -} - -StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { - return array_get(&self->heads, version)->summary; -} - -int ts_stack_dynamic_precedence(Stack *self, StackVersion version) { - return array_get(&self->heads, version)->node->dynamic_precedence; -} - -bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) { - const StackHead *head = array_get(&self->heads, version); - const StackNode *node = head->node; - if (node->error_cost == 0) return true; - while (node) { - if (node->link_count > 0) { - Subtree subtree = node->links[0].subtree; - if (subtree.ptr) { - if (ts_subtree_total_bytes(subtree) > 0) { - return true; - } else if ( - node->node_count > head->node_count_at_last_error && - ts_subtree_error_cost(subtree) == 0 - ) { - node = node->links[0].node; - continue; - } - } - } - break; - } - return false; -} - -void ts_stack_remove_version(Stack *self, StackVersion version) { - stack_head_delete(array_get(&self->heads, version), &self->node_pool, self->subtree_pool); - array_erase(&self->heads, version); -} - -void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { - if (v1 == v2) return; - assert(v2 < v1); - assert((uint32_t)v1 < self->heads.size); - StackHead *source_head = &self->heads.contents[v1]; - StackHead *target_head = &self->heads.contents[v2]; - if (target_head->summary && !source_head->summary) { - source_head->summary = target_head->summary; - target_head->summary = NULL; - } - stack_head_delete(target_head, &self->node_pool, self->subtree_pool); - *target_head = *source_head; - array_erase(&self->heads, v1); -} - -void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) { - StackHead 
temporary_head = self->heads.contents[v1]; - self->heads.contents[v1] = self->heads.contents[v2]; - self->heads.contents[v2] = temporary_head; -} - -StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { - assert(version < self->heads.size); - array_push(&self->heads, self->heads.contents[version]); - StackHead *head = array_back(&self->heads); - stack_node_retain(head->node); - if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token); - head->summary = NULL; - return self->heads.size - 1; -} - -bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { - if (!ts_stack_can_merge(self, version1, version2)) return false; - StackHead *head1 = &self->heads.contents[version1]; - StackHead *head2 = &self->heads.contents[version2]; - for (uint32_t i = 0; i < head2->node->link_count; i++) { - stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool); - } - if (head1->node->state == ERROR_STATE) { - head1->node_count_at_last_error = head1->node->node_count; - } - ts_stack_remove_version(self, version2); - return true; -} - -bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) { - StackHead *head1 = &self->heads.contents[version1]; - StackHead *head2 = &self->heads.contents[version2]; - return - head1->status == StackStatusActive && - head2->status == StackStatusActive && - head1->node->state == head2->node->state && - head1->node->position.bytes == head2->node->position.bytes && - head1->node->error_cost == head2->node->error_cost && - ts_subtree_external_scanner_state_eq(head1->last_external_token, head2->last_external_token); -} - -void ts_stack_halt(Stack *self, StackVersion version) { - array_get(&self->heads, version)->status = StackStatusHalted; -} - -void ts_stack_pause(Stack *self, StackVersion version, TSSymbol lookahead) { - StackHead *head = array_get(&self->heads, version); - head->status = StackStatusPaused; - head->lookahead_when_paused = lookahead; - 
head->node_count_at_last_error = head->node->node_count; -} - -bool ts_stack_is_active(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->status == StackStatusActive; -} - -bool ts_stack_is_halted(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->status == StackStatusHalted; -} - -bool ts_stack_is_paused(const Stack *self, StackVersion version) { - return array_get(&self->heads, version)->status == StackStatusPaused; -} - -TSSymbol ts_stack_resume(Stack *self, StackVersion version) { - StackHead *head = array_get(&self->heads, version); - assert(head->status == StackStatusPaused); - TSSymbol result = head->lookahead_when_paused; - head->status = StackStatusActive; - head->lookahead_when_paused = 0; - return result; -} - -void ts_stack_clear(Stack *self) { - stack_node_retain(self->base_node); - for (uint32_t i = 0; i < self->heads.size; i++) { - stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); - } - array_clear(&self->heads); - array_push(&self->heads, ((StackHead){ - .node = self->base_node, - .last_external_token = NULL_SUBTREE, - .status = StackStatusActive, - .lookahead_when_paused = 0, - })); -} - -bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) { - array_reserve(&self->iterators, 32); - bool was_recording_allocations = ts_toggle_allocation_recording(false); - if (!f) f = stderr; - - fprintf(f, "digraph stack {\n"); - fprintf(f, "rankdir=\"RL\";\n"); - fprintf(f, "edge [arrowhead=none]\n"); - - Array(StackNode *) visited_nodes = array_new(); - - array_clear(&self->iterators); - for (uint32_t i = 0; i < self->heads.size; i++) { - StackHead *head = &self->heads.contents[i]; - if (head->status == StackStatusHalted) continue; - - fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i); - fprintf(f, "node_head_%u -> node_%p [", i, head->node); - - if (head->status == StackStatusPaused) { - fprintf(f, "color=red "); - } - 
fprintf(f, - "label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u", - i, - ts_stack_node_count_since_error(self, i), - ts_stack_error_cost(self, i) - ); - - if (head->summary) { - fprintf(f, "\nsummary_size: %u", head->summary->size); - } - - if (head->last_external_token.ptr) { - const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state; - const char *data = ts_external_scanner_state_data(state); - fprintf(f, "\nexternal_scanner_state:"); - for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]); - } - - fprintf(f, "\"]\n"); - array_push(&self->iterators, ((StackIterator){.node = head->node })); - } - - bool all_iterators_done = false; - while (!all_iterators_done) { - all_iterators_done = true; - - for (uint32_t i = 0; i < self->iterators.size; i++) { - StackIterator iterator = self->iterators.contents[i]; - StackNode *node = iterator.node; - - for (uint32_t j = 0; j < visited_nodes.size; j++) { - if (visited_nodes.contents[j] == node) { - node = NULL; - break; - } - } - - if (!node) continue; - all_iterators_done = false; - - fprintf(f, "node_%p [", node); - if (node->state == ERROR_STATE) { - fprintf(f, "label=\"?\""); - } else if ( - node->link_count == 1 && - node->links[0].subtree.ptr && - ts_subtree_extra(node->links[0].subtree) - ) { - fprintf(f, "shape=point margin=0 label=\"\""); - } else { - fprintf(f, "label=\"%d\"", node->state); - } - - fprintf( - f, - " tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n", - node->position.extent.row + 1, - node->position.extent.column, - node->node_count, - node->error_cost, - node->dynamic_precedence - ); - - for (int j = 0; j < node->link_count; j++) { - StackLink link = node->links[j]; - fprintf(f, "node_%p -> node_%p [", node, link.node); - if (link.is_pending) fprintf(f, "style=dashed "); - if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray "); - - if 
(!link.subtree.ptr) { - fprintf(f, "color=red"); - } else { - fprintf(f, "label=\""); - bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree); - if (quoted) fprintf(f, "'"); - const char *name = ts_language_symbol_name(language, ts_subtree_symbol(link.subtree)); - for (const char *c = name; *c; c++) { - if (*c == '\"' || *c == '\\') fprintf(f, "\\"); - fprintf(f, "%c", *c); - } - if (quoted) fprintf(f, "'"); - fprintf(f, "\""); - fprintf( - f, - "labeltooltip=\"error_cost: %u\ndynamic_precedence: %u\"", - ts_subtree_error_cost(link.subtree), - ts_subtree_dynamic_precedence(link.subtree) - ); - } - - fprintf(f, "];\n"); - - StackIterator *next_iterator; - if (j == 0) { - next_iterator = &self->iterators.contents[i]; - } else { - array_push(&self->iterators, iterator); - next_iterator = array_back(&self->iterators); - } - next_iterator->node = link.node; - } - - array_push(&visited_nodes, node); - } - } - - fprintf(f, "}\n"); - - array_delete(&visited_nodes); - ts_toggle_allocation_recording(was_recording_allocations); - return true; -} - -#undef inline diff --git a/src/tree_sitter/stack.h b/src/tree_sitter/stack.h deleted file mode 100644 index ec7a69d2b4..0000000000 --- a/src/tree_sitter/stack.h +++ /dev/null @@ -1,135 +0,0 @@ -#ifndef TREE_SITTER_PARSE_STACK_H_ -#define TREE_SITTER_PARSE_STACK_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "./array.h" -#include "./subtree.h" -#include "./error_costs.h" -#include <stdio.h> - -typedef struct Stack Stack; - -typedef unsigned StackVersion; -#define STACK_VERSION_NONE ((StackVersion)-1) - -typedef struct { - SubtreeArray subtrees; - StackVersion version; -} StackSlice; -typedef Array(StackSlice) StackSliceArray; - -typedef struct { - Length position; - unsigned depth; - TSStateId state; -} StackSummaryEntry; -typedef Array(StackSummaryEntry) StackSummary; - -// Create a stack. -Stack *ts_stack_new(SubtreePool *); - -// Release the memory reserved for a given stack. 
-void ts_stack_delete(Stack *); - -// Get the stack's current number of versions. -uint32_t ts_stack_version_count(const Stack *); - -// Get the state at the top of the given version of the stack. If the stack is -// empty, this returns the initial state, 0. -TSStateId ts_stack_state(const Stack *, StackVersion); - -// Get the last external token associated with a given version of the stack. -Subtree ts_stack_last_external_token(const Stack *, StackVersion); - -// Set the last external token associated with a given version of the stack. -void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree ); - -// Get the position of the given version of the stack within the document. -Length ts_stack_position(const Stack *, StackVersion); - -// Push a tree and state onto the given version of the stack. -// -// This transfers ownership of the tree to the Stack. Callers that -// need to retain ownership of the tree for their own purposes should -// first retain the tree. -void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId); - -// Pop the given number of entries from the given version of the stack. This -// operation can increase the number of stack versions by revealing multiple -// versions which had previously been merged. It returns an array that -// specifies the index of each revealed version and the trees that were -// removed from that version. -StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count); - -// Remove an error at the top of the given version of the stack. -SubtreeArray ts_stack_pop_error(Stack *, StackVersion); - -// Remove any pending trees from the top of the given version of the stack. -StackSliceArray ts_stack_pop_pending(Stack *, StackVersion); - -// Remove any all trees from the given version of the stack. -StackSliceArray ts_stack_pop_all(Stack *, StackVersion); - -// Get the maximum number of tree nodes reachable from this version of the stack -// since the last error was detected. 
-unsigned ts_stack_node_count_since_error(const Stack *, StackVersion); - -int ts_stack_dynamic_precedence(Stack *, StackVersion); - -bool ts_stack_has_advanced_since_error(const Stack *, StackVersion); - -// Compute a summary of all the parse states near the top of the given -// version of the stack and store the summary for later retrieval. -void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); - -// Retrieve a summary of all the parse states near the top of the -// given version of the stack. -StackSummary *ts_stack_get_summary(Stack *, StackVersion); - -// Get the total cost of all errors on the given version of the stack. -unsigned ts_stack_error_cost(const Stack *, StackVersion version); - -// Merge the given two stack versions if possible, returning true -// if they were successfully merged and false otherwise. -bool ts_stack_merge(Stack *, StackVersion, StackVersion); - -// Determine whether the given two stack versions can be merged. -bool ts_stack_can_merge(Stack *, StackVersion, StackVersion); - -TSSymbol ts_stack_resume(Stack *, StackVersion); - -void ts_stack_pause(Stack *, StackVersion, TSSymbol); - -void ts_stack_halt(Stack *, StackVersion); - -bool ts_stack_is_active(const Stack *, StackVersion); - -bool ts_stack_is_paused(const Stack *, StackVersion); - -bool ts_stack_is_halted(const Stack *, StackVersion); - -void ts_stack_renumber_version(Stack *, StackVersion, StackVersion); - -void ts_stack_swap_versions(Stack *, StackVersion, StackVersion); - -StackVersion ts_stack_copy_version(Stack *, StackVersion); - -// Remove the given version from the stack. 
-void ts_stack_remove_version(Stack *, StackVersion); - -void ts_stack_clear(Stack *); - -bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *); - -typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t); - -void ts_stack_iterate(Stack *, StackVersion, StackIterateCallback, void *); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_PARSE_STACK_H_ diff --git a/src/tree_sitter/subtree.c b/src/tree_sitter/subtree.c deleted file mode 100644 index ef92a32fe4..0000000000 --- a/src/tree_sitter/subtree.c +++ /dev/null @@ -1,982 +0,0 @@ -#include <assert.h> -#include <ctype.h> -#include <limits.h> -#include <stdbool.h> -#include <string.h> -#include <stdio.h> -#include "./alloc.h" -#include "./atomic.h" -#include "./subtree.h" -#include "./length.h" -#include "./language.h" -#include "./error_costs.h" -#include <stddef.h> - -typedef struct { - Length start; - Length old_end; - Length new_end; -} Edit; - -#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX -#define TS_MAX_TREE_POOL_SIZE 32 - -static const ExternalScannerState empty_state = {{.short_data = {0}}, .length = 0}; - -// ExternalScannerState - -void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length) { - self->length = length; - if (length > sizeof(self->short_data)) { - self->long_data = ts_malloc(length); - memcpy(self->long_data, data, length); - } else { - memcpy(self->short_data, data, length); - } -} - -ExternalScannerState ts_external_scanner_state_copy(const ExternalScannerState *self) { - ExternalScannerState result = *self; - if (self->length > sizeof(self->short_data)) { - result.long_data = ts_malloc(self->length); - memcpy(result.long_data, self->long_data, self->length); - } - return result; -} - -void ts_external_scanner_state_delete(ExternalScannerState *self) { - if (self->length > sizeof(self->short_data)) { - ts_free(self->long_data); - } -} - -const char *ts_external_scanner_state_data(const ExternalScannerState *self) { - if 
(self->length > sizeof(self->short_data)) { - return self->long_data; - } else { - return self->short_data; - } -} - -bool ts_external_scanner_state_eq(const ExternalScannerState *a, const ExternalScannerState *b) { - return a == b || ( - a->length == b->length && - !memcmp(ts_external_scanner_state_data(a), ts_external_scanner_state_data(b), a->length) - ); -} - -// SubtreeArray - -void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) { - dest->size = self.size; - dest->capacity = self.capacity; - dest->contents = self.contents; - if (self.capacity > 0) { - dest->contents = ts_calloc(self.capacity, sizeof(Subtree)); - memcpy(dest->contents, self.contents, self.size * sizeof(Subtree)); - for (uint32_t i = 0; i < self.size; i++) { - ts_subtree_retain(dest->contents[i]); - } - } -} - -void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) { - for (uint32_t i = 0; i < self->size; i++) { - ts_subtree_release(pool, self->contents[i]); - } - array_delete(self); -} - -SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *self) { - SubtreeArray result = array_new(); - - uint32_t i = self->size - 1; - for (; i + 1 > 0; i--) { - Subtree child = self->contents[i]; - if (!ts_subtree_extra(child)) break; - array_push(&result, child); - } - - self->size = i + 1; - ts_subtree_array_reverse(&result); - return result; -} - -void ts_subtree_array_reverse(SubtreeArray *self) { - for (uint32_t i = 0, limit = self->size / 2; i < limit; i++) { - size_t reverse_index = self->size - 1 - i; - Subtree swap = self->contents[i]; - self->contents[i] = self->contents[reverse_index]; - self->contents[reverse_index] = swap; - } -} - -// SubtreePool - -SubtreePool ts_subtree_pool_new(uint32_t capacity) { - SubtreePool self = {array_new(), array_new()}; - array_reserve(&self.free_trees, capacity); - return self; -} - -void ts_subtree_pool_delete(SubtreePool *self) { - if (self->free_trees.contents) { - for (unsigned i = 0; i < self->free_trees.size; i++) { - 
ts_free(self->free_trees.contents[i].ptr); - } - array_delete(&self->free_trees); - } - if (self->tree_stack.contents) array_delete(&self->tree_stack); -} - -static SubtreeHeapData *ts_subtree_pool_allocate(SubtreePool *self) { - if (self->free_trees.size > 0) { - return array_pop(&self->free_trees).ptr; - } else { - return ts_malloc(sizeof(SubtreeHeapData)); - } -} - -static void ts_subtree_pool_free(SubtreePool *self, SubtreeHeapData *tree) { - if (self->free_trees.capacity > 0 && self->free_trees.size + 1 <= TS_MAX_TREE_POOL_SIZE) { - array_push(&self->free_trees, (MutableSubtree) {.ptr = tree}); - } else { - ts_free(tree); - } -} - -// Subtree - -static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t lookahead_bytes) { - return - padding.bytes < TS_MAX_INLINE_TREE_LENGTH && - padding.extent.row < 16 && - padding.extent.column < TS_MAX_INLINE_TREE_LENGTH && - size.extent.row == 0 && - size.extent.column < TS_MAX_INLINE_TREE_LENGTH && - lookahead_bytes < 16; -} - -Subtree ts_subtree_new_leaf( - SubtreePool *pool, TSSymbol symbol, Length padding, Length size, - uint32_t lookahead_bytes, TSStateId parse_state, bool has_external_tokens, - bool is_keyword, const TSLanguage *language -) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); - bool extra = symbol == ts_builtin_sym_end; - - bool is_inline = ( - symbol <= UINT8_MAX && - !has_external_tokens && - ts_subtree_can_inline(padding, size, lookahead_bytes) - ); - - if (is_inline) { - return (Subtree) {{ - .parse_state = parse_state, - .symbol = symbol, - .padding_bytes = padding.bytes, - .padding_rows = padding.extent.row, - .padding_columns = padding.extent.column, - .size_bytes = size.bytes, - .lookahead_bytes = lookahead_bytes, - .visible = metadata.visible, - .named = metadata.named, - .extra = extra, - .has_changes = false, - .is_missing = false, - .is_keyword = is_keyword, - .is_inline = true, - }}; - } else { - SubtreeHeapData *data = 
ts_subtree_pool_allocate(pool); - *data = (SubtreeHeapData) { - .ref_count = 1, - .padding = padding, - .size = size, - .lookahead_bytes = lookahead_bytes, - .error_cost = 0, - .child_count = 0, - .symbol = symbol, - .parse_state = parse_state, - .visible = metadata.visible, - .named = metadata.named, - .extra = extra, - .fragile_left = false, - .fragile_right = false, - .has_changes = false, - .has_external_tokens = has_external_tokens, - .is_missing = false, - .is_keyword = is_keyword, - {{.first_leaf = {.symbol = 0, .parse_state = 0}}} - }; - return (Subtree) {.ptr = data}; - } -} - -void ts_subtree_set_symbol( - MutableSubtree *self, - TSSymbol symbol, - const TSLanguage *language -) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); - if (self->data.is_inline) { - assert(symbol < UINT8_MAX); - self->data.symbol = symbol; - self->data.named = metadata.named; - self->data.visible = metadata.visible; - } else { - self->ptr->symbol = symbol; - self->ptr->named = metadata.named; - self->ptr->visible = metadata.visible; - } -} - -Subtree ts_subtree_new_error( - SubtreePool *pool, int32_t lookahead_char, Length padding, Length size, - uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language -) { - Subtree result = ts_subtree_new_leaf( - pool, ts_builtin_sym_error, padding, size, bytes_scanned, - parse_state, false, false, language - ); - SubtreeHeapData *data = (SubtreeHeapData *)result.ptr; - data->fragile_left = true; - data->fragile_right = true; - data->lookahead_char = lookahead_char; - return result; -} - -MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) { - if (self.data.is_inline) return (MutableSubtree) {self.data}; - if (self.ptr->ref_count == 1) return ts_subtree_to_mut_unsafe(self); - - SubtreeHeapData *result = ts_subtree_pool_allocate(pool); - memcpy(result, self.ptr, sizeof(SubtreeHeapData)); - if (result->child_count > 0) { - result->children = ts_calloc(self.ptr->child_count, 
sizeof(Subtree)); - memcpy(result->children, self.ptr->children, result->child_count * sizeof(Subtree)); - for (uint32_t i = 0; i < result->child_count; i++) { - ts_subtree_retain(result->children[i]); - } - } else if (result->has_external_tokens) { - result->external_scanner_state = ts_external_scanner_state_copy(&self.ptr->external_scanner_state); - } - result->ref_count = 1; - ts_subtree_release(pool, self); - return (MutableSubtree) {.ptr = result}; -} - -static void ts_subtree__compress(MutableSubtree self, unsigned count, const TSLanguage *language, - MutableSubtreeArray *stack) { - unsigned initial_stack_size = stack->size; - - MutableSubtree tree = self; - TSSymbol symbol = tree.ptr->symbol; - for (unsigned i = 0; i < count; i++) { - if (tree.ptr->ref_count > 1 || tree.ptr->child_count < 2) break; - - MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); - if ( - child.data.is_inline || - child.ptr->child_count < 2 || - child.ptr->ref_count > 1 || - child.ptr->symbol != symbol - ) break; - - MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[0]); - if ( - grandchild.data.is_inline || - grandchild.ptr->child_count < 2 || - grandchild.ptr->ref_count > 1 || - grandchild.ptr->symbol != symbol - ) break; - - tree.ptr->children[0] = ts_subtree_from_mut(grandchild); - child.ptr->children[0] = grandchild.ptr->children[grandchild.ptr->child_count - 1]; - grandchild.ptr->children[grandchild.ptr->child_count - 1] = ts_subtree_from_mut(child); - array_push(stack, tree); - tree = grandchild; - } - - while (stack->size > initial_stack_size) { - tree = array_pop(stack); - MutableSubtree child = ts_subtree_to_mut_unsafe(tree.ptr->children[0]); - MutableSubtree grandchild = ts_subtree_to_mut_unsafe(child.ptr->children[child.ptr->child_count - 1]); - ts_subtree_set_children(grandchild, grandchild.ptr->children, grandchild.ptr->child_count, language); - ts_subtree_set_children(child, child.ptr->children, child.ptr->child_count, 
language); - ts_subtree_set_children(tree, tree.ptr->children, tree.ptr->child_count, language); - } -} - -void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *language) { - array_clear(&pool->tree_stack); - - if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); - } - - while (pool->tree_stack.size > 0) { - MutableSubtree tree = array_pop(&pool->tree_stack); - - if (tree.ptr->repeat_depth > 0) { - Subtree child1 = tree.ptr->children[0]; - Subtree child2 = tree.ptr->children[tree.ptr->child_count - 1]; - long repeat_delta = (long)ts_subtree_repeat_depth(child1) - (long)ts_subtree_repeat_depth(child2); - if (repeat_delta > 0) { - unsigned n = repeat_delta; - for (unsigned i = n / 2; i > 0; i /= 2) { - ts_subtree__compress(tree, i, language, &pool->tree_stack); - n -= i; - } - } - } - - for (uint32_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = tree.ptr->children[i]; - if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); - } - } - } -} - -void ts_subtree_set_children( - MutableSubtree self, Subtree *children, uint32_t child_count, const TSLanguage *language -) { - assert(!self.data.is_inline); - - if (self.ptr->child_count > 0 && children != self.ptr->children) { - ts_free(self.ptr->children); - } - - self.ptr->child_count = child_count; - self.ptr->children = children; - self.ptr->named_child_count = 0; - self.ptr->visible_child_count = 0; - self.ptr->error_cost = 0; - self.ptr->repeat_depth = 0; - self.ptr->node_count = 1; - self.ptr->has_external_tokens = false; - self.ptr->dynamic_precedence = 0; - - uint32_t non_extra_index = 0; - const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id); - uint32_t lookahead_end_byte = 0; - - for (uint32_t i = 0; i < self.ptr->child_count; i++) { - Subtree child = self.ptr->children[i]; - - if (i 
== 0) { - self.ptr->padding = ts_subtree_padding(child); - self.ptr->size = ts_subtree_size(child); - } else { - self.ptr->size = length_add(self.ptr->size, ts_subtree_total_size(child)); - } - - uint32_t child_lookahead_end_byte = - self.ptr->padding.bytes + - self.ptr->size.bytes + - ts_subtree_lookahead_bytes(child); - if (child_lookahead_end_byte > lookahead_end_byte) lookahead_end_byte = child_lookahead_end_byte; - - if (ts_subtree_symbol(child) != ts_builtin_sym_error_repeat) { - self.ptr->error_cost += ts_subtree_error_cost(child); - } - - self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child); - self.ptr->node_count += ts_subtree_node_count(child); - - if (alias_sequence && alias_sequence[non_extra_index] != 0 && !ts_subtree_extra(child)) { - self.ptr->visible_child_count++; - if (ts_language_symbol_metadata(language, alias_sequence[non_extra_index]).named) { - self.ptr->named_child_count++; - } - } else if (ts_subtree_visible(child)) { - self.ptr->visible_child_count++; - if (ts_subtree_named(child)) self.ptr->named_child_count++; - } else if (ts_subtree_child_count(child) > 0) { - self.ptr->visible_child_count += child.ptr->visible_child_count; - self.ptr->named_child_count += child.ptr->named_child_count; - } - - if (ts_subtree_has_external_tokens(child)) self.ptr->has_external_tokens = true; - - if (ts_subtree_is_error(child)) { - self.ptr->fragile_left = self.ptr->fragile_right = true; - self.ptr->parse_state = TS_TREE_STATE_NONE; - } - - if (!ts_subtree_extra(child)) non_extra_index++; - } - - self.ptr->lookahead_bytes = lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes; - - if (self.ptr->symbol == ts_builtin_sym_error || self.ptr->symbol == ts_builtin_sym_error_repeat) { - self.ptr->error_cost += - ERROR_COST_PER_RECOVERY + - ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes + - ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row; - for (uint32_t i = 0; i < self.ptr->child_count; i++) { - Subtree child = 
self.ptr->children[i]; - uint32_t grandchild_count = ts_subtree_child_count(child); - if (ts_subtree_extra(child)) continue; - if (ts_subtree_is_error(child) && grandchild_count == 0) continue; - if (ts_subtree_visible(child)) { - self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE; - } else if (grandchild_count > 0) { - self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * child.ptr->visible_child_count; - } - } - } - - if (self.ptr->child_count > 0) { - Subtree first_child = self.ptr->children[0]; - Subtree last_child = self.ptr->children[self.ptr->child_count - 1]; - - self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child); - self.ptr->first_leaf.parse_state = ts_subtree_leaf_parse_state(first_child); - - if (ts_subtree_fragile_left(first_child)) self.ptr->fragile_left = true; - if (ts_subtree_fragile_right(last_child)) self.ptr->fragile_right = true; - - if ( - self.ptr->child_count >= 2 && - !self.ptr->visible && - !self.ptr->named && - ts_subtree_symbol(first_child) == self.ptr->symbol - ) { - if (ts_subtree_repeat_depth(first_child) > ts_subtree_repeat_depth(last_child)) { - self.ptr->repeat_depth = ts_subtree_repeat_depth(first_child) + 1; - } else { - self.ptr->repeat_depth = ts_subtree_repeat_depth(last_child) + 1; - } - } - } -} - -MutableSubtree ts_subtree_new_node(SubtreePool *pool, TSSymbol symbol, - SubtreeArray *children, unsigned production_id, - const TSLanguage *language) { - TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); - bool fragile = symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat; - SubtreeHeapData *data = ts_subtree_pool_allocate(pool); - *data = (SubtreeHeapData) { - .ref_count = 1, - .symbol = symbol, - .visible = metadata.visible, - .named = metadata.named, - .has_changes = false, - .fragile_left = fragile, - .fragile_right = fragile, - .is_keyword = false, - {{ - .node_count = 0, - .production_id = production_id, - .first_leaf = {.symbol = 0, .parse_state = 0}, - }} - }; - 
MutableSubtree result = {.ptr = data}; - ts_subtree_set_children(result, children->contents, children->size, language); - return result; -} - -Subtree ts_subtree_new_error_node(SubtreePool *pool, SubtreeArray *children, - bool extra, const TSLanguage *language) { - MutableSubtree result = ts_subtree_new_node( - pool, ts_builtin_sym_error, children, 0, language - ); - result.ptr->extra = extra; - return ts_subtree_from_mut(result); -} - -Subtree ts_subtree_new_missing_leaf(SubtreePool *pool, TSSymbol symbol, Length padding, - const TSLanguage *language) { - Subtree result = ts_subtree_new_leaf( - pool, symbol, padding, length_zero(), 0, - 0, false, false, language - ); - - if (result.data.is_inline) { - result.data.is_missing = true; - } else { - ((SubtreeHeapData *)result.ptr)->is_missing = true; - } - - return result; -} - -void ts_subtree_retain(Subtree self) { - if (self.data.is_inline) return; - assert(self.ptr->ref_count > 0); - atomic_inc((volatile uint32_t *)&self.ptr->ref_count); - assert(self.ptr->ref_count != 0); -} - -void ts_subtree_release(SubtreePool *pool, Subtree self) { - if (self.data.is_inline) return; - array_clear(&pool->tree_stack); - - assert(self.ptr->ref_count > 0); - if (atomic_dec((volatile uint32_t *)&self.ptr->ref_count) == 0) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); - } - - while (pool->tree_stack.size > 0) { - MutableSubtree tree = array_pop(&pool->tree_stack); - if (tree.ptr->child_count > 0) { - for (uint32_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = tree.ptr->children[i]; - if (child.data.is_inline) continue; - assert(child.ptr->ref_count > 0); - if (atomic_dec((volatile uint32_t *)&child.ptr->ref_count) == 0) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); - } - } - ts_free(tree.ptr->children); - } else if (tree.ptr->has_external_tokens) { - ts_external_scanner_state_delete(&tree.ptr->external_scanner_state); - } - ts_subtree_pool_free(pool, tree.ptr); - } -} - -bool 
ts_subtree_eq(Subtree self, Subtree other) { - if (self.data.is_inline || other.data.is_inline) { - return memcmp(&self, &other, sizeof(SubtreeInlineData)) == 0; - } - - if (self.ptr) { - if (!other.ptr) return false; - } else { - return !other.ptr; - } - - if (self.ptr->symbol != other.ptr->symbol) return false; - if (self.ptr->visible != other.ptr->visible) return false; - if (self.ptr->named != other.ptr->named) return false; - if (self.ptr->padding.bytes != other.ptr->padding.bytes) return false; - if (self.ptr->size.bytes != other.ptr->size.bytes) return false; - if (self.ptr->symbol == ts_builtin_sym_error) return self.ptr->lookahead_char == other.ptr->lookahead_char; - if (self.ptr->child_count != other.ptr->child_count) return false; - if (self.ptr->child_count > 0) { - if (self.ptr->visible_child_count != other.ptr->visible_child_count) return false; - if (self.ptr->named_child_count != other.ptr->named_child_count) return false; - - for (uint32_t i = 0; i < self.ptr->child_count; i++) { - if (!ts_subtree_eq(self.ptr->children[i], other.ptr->children[i])) { - return false; - } - } - } - return true; -} - -int ts_subtree_compare(Subtree left, Subtree right) { - if (ts_subtree_symbol(left) < ts_subtree_symbol(right)) return -1; - if (ts_subtree_symbol(right) < ts_subtree_symbol(left)) return 1; - if (ts_subtree_child_count(left) < ts_subtree_child_count(right)) return -1; - if (ts_subtree_child_count(right) < ts_subtree_child_count(left)) return 1; - for (uint32_t i = 0, n = ts_subtree_child_count(left); i < n; i++) { - Subtree left_child = left.ptr->children[i]; - Subtree right_child = right.ptr->children[i]; - switch (ts_subtree_compare(left_child, right_child)) { - case -1: return -1; - case 1: return 1; - default: break; - } - } - return 0; -} - -static inline void ts_subtree_set_has_changes(MutableSubtree *self) { - if (self->data.is_inline) { - self->data.has_changes = true; - } else { - self->ptr->has_changes = true; - } -} - -Subtree 
ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool) { - typedef struct { - Subtree *tree; - Edit edit; - } StackEntry; - - Array(StackEntry) stack = array_new(); - array_push(&stack, ((StackEntry) { - .tree = &self, - .edit = (Edit) { - .start = {edit->start_byte, edit->start_point}, - .old_end = {edit->old_end_byte, edit->old_end_point}, - .new_end = {edit->new_end_byte, edit->new_end_point}, - }, - })); - - while (stack.size) { - StackEntry entry = array_pop(&stack); - Edit edit = entry.edit; - bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes; - bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes; - - Length size = ts_subtree_size(*entry.tree); - Length padding = ts_subtree_padding(*entry.tree); - uint32_t lookahead_bytes = ts_subtree_lookahead_bytes(*entry.tree); - uint32_t end_byte = padding.bytes + size.bytes + lookahead_bytes; - if (edit.start.bytes > end_byte || (is_noop && edit.start.bytes == end_byte)) continue; - - // If the edit is entirely within the space before this subtree, then shift this - // subtree over according to the edit without changing its size. - if (edit.old_end.bytes <= padding.bytes) { - padding = length_add(edit.new_end, length_sub(padding, edit.old_end)); - } - - // If the edit starts in the space before this subtree and extends into this subtree, - // shrink the subtree's content to compensate for the change in the space before it. - else if (edit.start.bytes < padding.bytes) { - size = length_sub(size, length_sub(edit.old_end, padding)); - padding = edit.new_end; - } - - // If the edit is a pure insertion right at the start of the subtree, - // shift the subtree over according to the insertion. - else if (edit.start.bytes == padding.bytes && is_pure_insertion) { - padding = edit.new_end; - } - - // If the edit is within this subtree, resize the subtree to reflect the edit. 
- else { - uint32_t total_bytes = padding.bytes + size.bytes; - if (edit.start.bytes < total_bytes || - (edit.start.bytes == total_bytes && is_pure_insertion)) { - size = length_add( - length_sub(edit.new_end, padding), - length_sub(size, length_sub(edit.old_end, padding)) - ); - } - } - - MutableSubtree result = ts_subtree_make_mut(pool, *entry.tree); - - if (result.data.is_inline) { - if (ts_subtree_can_inline(padding, size, lookahead_bytes)) { - result.data.padding_bytes = padding.bytes; - result.data.padding_rows = padding.extent.row; - result.data.padding_columns = padding.extent.column; - result.data.size_bytes = size.bytes; - } else { - SubtreeHeapData *data = ts_subtree_pool_allocate(pool); - data->ref_count = 1; - data->padding = padding; - data->size = size; - data->lookahead_bytes = lookahead_bytes; - data->error_cost = 0; - data->child_count = 0; - data->symbol = result.data.symbol; - data->parse_state = result.data.parse_state; - data->visible = result.data.visible; - data->named = result.data.named; - data->extra = result.data.extra; - data->fragile_left = false; - data->fragile_right = false; - data->has_changes = false; - data->has_external_tokens = false; - data->is_missing = result.data.is_missing; - data->is_keyword = result.data.is_keyword; - result.ptr = data; - } - } else { - result.ptr->padding = padding; - result.ptr->size = size; - } - - ts_subtree_set_has_changes(&result); - *entry.tree = ts_subtree_from_mut(result); - - Length child_left, child_right = length_zero(); - for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; i++) { - Subtree *child = &result.ptr->children[i]; - Length child_size = ts_subtree_total_size(*child); - child_left = child_right; - child_right = length_add(child_left, child_size); - - // If this child ends before the edit, it is not affected. 
- if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < edit.start.bytes) continue; - - // If this child starts after the edit, then we're done processing children. - if (child_left.bytes > edit.old_end.bytes || - (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0)) break; - - // Transform edit into the child's coordinate space. - Edit child_edit = { - .start = length_sub(edit.start, child_left), - .old_end = length_sub(edit.old_end, child_left), - .new_end = length_sub(edit.new_end, child_left), - }; - - // Clamp child_edit to the child's bounds. - if (edit.start.bytes < child_left.bytes) child_edit.start = length_zero(); - if (edit.old_end.bytes < child_left.bytes) child_edit.old_end = length_zero(); - if (edit.new_end.bytes < child_left.bytes) child_edit.new_end = length_zero(); - if (edit.old_end.bytes > child_right.bytes) child_edit.old_end = child_size; - - // Interpret all inserted text as applying to the *first* child that touches the edit. - // Subsequent children are only never have any text inserted into them; they are only - // shrunk to compensate for the edit. - if (child_right.bytes > edit.start.bytes || - (child_right.bytes == edit.start.bytes && is_pure_insertion)) { - edit.new_end = edit.start; - } - - // Children that occur before the edit are not reshaped by the edit. - else { - child_edit.old_end = child_edit.start; - child_edit.new_end = child_edit.start; - } - - // Queue processing of this child's subtree. 
- array_push(&stack, ((StackEntry) { - .tree = child, - .edit = child_edit, - })); - } - } - - array_delete(&stack); - return self; -} - -Subtree ts_subtree_last_external_token(Subtree tree) { - if (!ts_subtree_has_external_tokens(tree)) return NULL_SUBTREE; - while (tree.ptr->child_count > 0) { - for (uint32_t i = tree.ptr->child_count - 1; i + 1 > 0; i--) { - Subtree child = tree.ptr->children[i]; - if (ts_subtree_has_external_tokens(child)) { - tree = child; - break; - } - } - } - return tree; -} - -static size_t ts_subtree__write_char_to_string(char *s, size_t n, int32_t c) { - if (c == -1) - return snprintf(s, n, "INVALID"); - else if (c == '\0') - return snprintf(s, n, "'\\0'"); - else if (c == '\n') - return snprintf(s, n, "'\\n'"); - else if (c == '\t') - return snprintf(s, n, "'\\t'"); - else if (c == '\r') - return snprintf(s, n, "'\\r'"); - else if (0 < c && c < 128 && isprint(c)) - return snprintf(s, n, "'%c'", c); - else - return snprintf(s, n, "%d", c); -} - -static void ts_subtree__write_dot_string(FILE *f, const char *string) { - for (const char *c = string; *c; c++) { - if (*c == '"') { - fputs("\\\"", f); - } else if (*c == '\n') { - fputs("\\n", f); - } else { - fputc(*c, f); - } - } -} - -static const char *ROOT_FIELD = "__ROOT__"; - -static size_t ts_subtree__write_to_string( - Subtree self, char *string, size_t limit, - const TSLanguage *language, bool include_all, - TSSymbol alias_symbol, bool alias_is_named, const char *field_name -) { - if (!self.ptr) return snprintf(string, limit, "(NULL)"); - - char *cursor = string; - char **writer = (limit > 0) ? &cursor : &string; - bool is_root = field_name == ROOT_FIELD; - bool is_visible = - include_all || - ts_subtree_missing(self) || - ( - alias_symbol - ? 
alias_is_named - : ts_subtree_visible(self) && ts_subtree_named(self) - ); - - if (is_visible) { - if (!is_root) { - cursor += snprintf(*writer, limit, " "); - if (field_name) { - cursor += snprintf(*writer, limit, "%s: ", field_name); - } - } - - if (ts_subtree_is_error(self) && ts_subtree_child_count(self) == 0 && self.ptr->size.bytes > 0) { - cursor += snprintf(*writer, limit, "(UNEXPECTED "); - cursor += ts_subtree__write_char_to_string(*writer, limit, self.ptr->lookahead_char); - } else { - TSSymbol symbol = alias_symbol ? alias_symbol : ts_subtree_symbol(self); - const char *symbol_name = ts_language_symbol_name(language, symbol); - if (ts_subtree_missing(self)) { - cursor += snprintf(*writer, limit, "(MISSING "); - if (alias_is_named || ts_subtree_named(self)) { - cursor += snprintf(*writer, limit, "%s", symbol_name); - } else { - cursor += snprintf(*writer, limit, "\"%s\"", symbol_name); - } - } else { - cursor += snprintf(*writer, limit, "(%s", symbol_name); - } - } - } else if (is_root) { - TSSymbol symbol = ts_subtree_symbol(self); - const char *symbol_name = ts_language_symbol_name(language, symbol); - cursor += snprintf(*writer, limit, "(\"%s\")", symbol_name); - } - - if (ts_subtree_child_count(self)) { - const TSSymbol *alias_sequence = ts_language_alias_sequence(language, self.ptr->production_id); - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map( - language, - self.ptr->production_id, - &field_map, - &field_map_end - ); - - uint32_t structural_child_index = 0; - for (uint32_t i = 0; i < self.ptr->child_count; i++) { - Subtree child = self.ptr->children[i]; - if (ts_subtree_extra(child)) { - cursor += ts_subtree__write_to_string( - child, *writer, limit, - language, include_all, - 0, false, NULL - ); - } else { - TSSymbol alias_symbol = alias_sequence - ? alias_sequence[structural_child_index] - : 0; - bool alias_is_named = alias_symbol - ? 
ts_language_symbol_metadata(language, alias_symbol).named - : false; - - const char *child_field_name = is_visible ? NULL : field_name; - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (!i->inherited && i->child_index == structural_child_index) { - child_field_name = language->field_names[i->field_id]; - break; - } - } - - cursor += ts_subtree__write_to_string( - child, *writer, limit, - language, include_all, - alias_symbol, alias_is_named, child_field_name - ); - structural_child_index++; - } - } - } - - if (is_visible) cursor += snprintf(*writer, limit, ")"); - - return cursor - string; -} - -char *ts_subtree_string( - Subtree self, - const TSLanguage *language, - bool include_all -) { - char scratch_string[1]; - size_t size = ts_subtree__write_to_string( - self, scratch_string, 0, - language, include_all, - 0, false, ROOT_FIELD - ) + 1; - char *result = malloc(size * sizeof(char)); - ts_subtree__write_to_string( - self, result, size, - language, include_all, - 0, false, ROOT_FIELD - ); - return result; -} - -void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, - const TSLanguage *language, TSSymbol alias_symbol, - FILE *f) { - TSSymbol subtree_symbol = ts_subtree_symbol(*self); - TSSymbol symbol = alias_symbol ? 
alias_symbol : subtree_symbol; - uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self); - fprintf(f, "tree_%p [label=\"", self); - ts_subtree__write_dot_string(f, ts_language_symbol_name(language, symbol)); - fprintf(f, "\""); - - if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext"); - if (ts_subtree_extra(*self)) fprintf(f, ", fontcolor=gray"); - - fprintf(f, ", tooltip=\"" - "range: %u - %u\n" - "state: %d\n" - "error-cost: %u\n" - "has-changes: %u\n" - "repeat-depth: %u\n" - "lookahead-bytes: %u", - start_offset, end_offset, - ts_subtree_parse_state(*self), - ts_subtree_error_cost(*self), - ts_subtree_has_changes(*self), - ts_subtree_repeat_depth(*self), - ts_subtree_lookahead_bytes(*self) - ); - - if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0) { - fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char); - } - - fprintf(f, "\"]\n"); - - uint32_t child_start_offset = start_offset; - uint32_t child_info_offset = - language->max_alias_sequence_length * - ts_subtree_production_id(*self); - for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) { - const Subtree *child = &self->ptr->children[i]; - TSSymbol alias_symbol = 0; - if (!ts_subtree_extra(*child) && child_info_offset) { - alias_symbol = language->alias_sequences[child_info_offset]; - child_info_offset++; - } - ts_subtree__print_dot_graph(child, child_start_offset, language, alias_symbol, f); - fprintf(f, "tree_%p -> tree_%p [tooltip=%u]\n", self, child, i); - child_start_offset += ts_subtree_total_bytes(*child); - } -} - -void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f) { - fprintf(f, "digraph tree {\n"); - fprintf(f, "edge [arrowhead=none]\n"); - ts_subtree__print_dot_graph(&self, 0, language, 0, f); - fprintf(f, "}\n"); -} - -bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) { - const ExternalScannerState *state1 = &empty_state; - const ExternalScannerState *state2 = &empty_state; - 
if (self.ptr && ts_subtree_has_external_tokens(self) && !self.ptr->child_count) { - state1 = &self.ptr->external_scanner_state; - } - if (other.ptr && ts_subtree_has_external_tokens(other) && !other.ptr->child_count) { - state2 = &other.ptr->external_scanner_state; - } - return ts_external_scanner_state_eq(state1, state2); -} diff --git a/src/tree_sitter/subtree.h b/src/tree_sitter/subtree.h deleted file mode 100644 index 18c48dcbd0..0000000000 --- a/src/tree_sitter/subtree.h +++ /dev/null @@ -1,285 +0,0 @@ -#ifndef TREE_SITTER_SUBTREE_H_ -#define TREE_SITTER_SUBTREE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <limits.h> -#include <stdbool.h> -#include <stdio.h> -#include "./length.h" -#include "./array.h" -#include "./error_costs.h" -#include "tree_sitter/api.h" -#include "tree_sitter/parser.h" - -static const TSStateId TS_TREE_STATE_NONE = USHRT_MAX; -#define NULL_SUBTREE ((Subtree) {.ptr = NULL}) - -typedef union Subtree Subtree; -typedef union MutableSubtree MutableSubtree; - -typedef struct { - union { - char *long_data; - char short_data[24]; - }; - uint32_t length; -} ExternalScannerState; - -typedef struct { - bool is_inline : 1; - bool visible : 1; - bool named : 1; - bool extra : 1; - bool has_changes : 1; - bool is_missing : 1; - bool is_keyword : 1; - uint8_t symbol; - uint8_t padding_bytes; - uint8_t size_bytes; - uint8_t padding_columns; - uint8_t padding_rows : 4; - uint8_t lookahead_bytes : 4; - uint16_t parse_state; -} SubtreeInlineData; - -typedef struct { - volatile uint32_t ref_count; - Length padding; - Length size; - uint32_t lookahead_bytes; - uint32_t error_cost; - uint32_t child_count; - TSSymbol symbol; - TSStateId parse_state; - - bool visible : 1; - bool named : 1; - bool extra : 1; - bool fragile_left : 1; - bool fragile_right : 1; - bool has_changes : 1; - bool has_external_tokens : 1; - bool is_missing : 1; - bool is_keyword : 1; - - union { - // Non-terminal subtrees (`child_count > 0`) - struct { - Subtree *children; 
- uint32_t visible_child_count; - uint32_t named_child_count; - uint32_t node_count; - uint32_t repeat_depth; - int32_t dynamic_precedence; - uint16_t production_id; - struct { - TSSymbol symbol; - TSStateId parse_state; - } first_leaf; - }; - - // External terminal subtrees (`child_count == 0 && has_external_tokens`) - ExternalScannerState external_scanner_state; - - // Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`) - int32_t lookahead_char; - }; -} SubtreeHeapData; - -union Subtree { - SubtreeInlineData data; - const SubtreeHeapData *ptr; -}; - -union MutableSubtree { - SubtreeInlineData data; - SubtreeHeapData *ptr; -}; - -typedef Array(Subtree) SubtreeArray; -typedef Array(MutableSubtree) MutableSubtreeArray; - -typedef struct { - MutableSubtreeArray free_trees; - MutableSubtreeArray tree_stack; -} SubtreePool; - -void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned); -const char *ts_external_scanner_state_data(const ExternalScannerState *); - -void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); -void ts_subtree_array_delete(SubtreePool *, SubtreeArray *); -SubtreeArray ts_subtree_array_remove_trailing_extras(SubtreeArray *); -void ts_subtree_array_reverse(SubtreeArray *); - -SubtreePool ts_subtree_pool_new(uint32_t capacity); -void ts_subtree_pool_delete(SubtreePool *); - -Subtree ts_subtree_new_leaf( - SubtreePool *, TSSymbol, Length, Length, uint32_t, - TSStateId, bool, bool, const TSLanguage * -); -Subtree ts_subtree_new_error( - SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage * -); -MutableSubtree ts_subtree_new_node(SubtreePool *, TSSymbol, SubtreeArray *, unsigned, const TSLanguage *); -Subtree ts_subtree_new_error_node(SubtreePool *, SubtreeArray *, bool, const TSLanguage *); -Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, const TSLanguage *); -MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree); -void 
ts_subtree_retain(Subtree); -void ts_subtree_release(SubtreePool *, Subtree); -bool ts_subtree_eq(Subtree, Subtree); -int ts_subtree_compare(Subtree, Subtree); -void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); -void ts_subtree_set_children(MutableSubtree, Subtree *, uint32_t, const TSLanguage *); -void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); -Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); -char *ts_subtree_string(Subtree, const TSLanguage *, bool include_all); -void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *); -Subtree ts_subtree_last_external_token(Subtree); -bool ts_subtree_external_scanner_state_eq(Subtree, Subtree); - -#define SUBTREE_GET(self, name) (self.data.is_inline ? self.data.name : self.ptr->name) - -static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); } -static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); } -static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); } -static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); } -static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); } -static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); } -static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); } -static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); } -static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); } - -#undef SUBTREE_GET - -static inline void ts_subtree_set_extra(MutableSubtree *self) { - if (self->data.is_inline) { - self->data.extra = true; - } else { - self->ptr->extra = true; - } -} - -static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) { - if (self.data.is_inline) return self.data.symbol; 
- if (self.ptr->child_count == 0) return self.ptr->symbol; - return self.ptr->first_leaf.symbol; -} - -static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) { - if (self.data.is_inline) return self.data.parse_state; - if (self.ptr->child_count == 0) return self.ptr->parse_state; - return self.ptr->first_leaf.parse_state; -} - -static inline Length ts_subtree_padding(Subtree self) { - if (self.data.is_inline) { - Length result = {self.data.padding_bytes, {self.data.padding_rows, self.data.padding_columns}}; - return result; - } else { - return self.ptr->padding; - } -} - -static inline Length ts_subtree_size(Subtree self) { - if (self.data.is_inline) { - Length result = {self.data.size_bytes, {0, self.data.size_bytes}}; - return result; - } else { - return self.ptr->size; - } -} - -static inline Length ts_subtree_total_size(Subtree self) { - return length_add(ts_subtree_padding(self), ts_subtree_size(self)); -} - -static inline uint32_t ts_subtree_total_bytes(Subtree self) { - return ts_subtree_total_size(self).bytes; -} - -static inline uint32_t ts_subtree_child_count(Subtree self) { - return self.data.is_inline ? 0 : self.ptr->child_count; -} - -static inline uint32_t ts_subtree_repeat_depth(Subtree self) { - return self.data.is_inline ? 0 : self.ptr->repeat_depth; -} - -static inline uint32_t ts_subtree_node_count(Subtree self) { - return (self.data.is_inline || self.ptr->child_count == 0) ? 1 : self.ptr->node_count; -} - -static inline uint32_t ts_subtree_visible_child_count(Subtree self) { - if (ts_subtree_child_count(self) > 0) { - return self.ptr->visible_child_count; - } else { - return 0; - } -} - -static inline uint32_t ts_subtree_error_cost(Subtree self) { - if (ts_subtree_missing(self)) { - return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY; - } else { - return self.data.is_inline ? 
0 : self.ptr->error_cost; - } -} - -static inline int32_t ts_subtree_dynamic_precedence(Subtree self) { - return (self.data.is_inline || self.ptr->child_count == 0) ? 0 : self.ptr->dynamic_precedence; -} - -static inline uint16_t ts_subtree_production_id(Subtree self) { - if (ts_subtree_child_count(self) > 0) { - return self.ptr->production_id; - } else { - return 0; - } -} - -static inline bool ts_subtree_fragile_left(Subtree self) { - return self.data.is_inline ? false : self.ptr->fragile_left; -} - -static inline bool ts_subtree_fragile_right(Subtree self) { - return self.data.is_inline ? false : self.ptr->fragile_right; -} - -static inline bool ts_subtree_has_external_tokens(Subtree self) { - return self.data.is_inline ? false : self.ptr->has_external_tokens; -} - -static inline bool ts_subtree_is_fragile(Subtree self) { - return self.data.is_inline ? false : (self.ptr->fragile_left || self.ptr->fragile_right); -} - -static inline bool ts_subtree_is_error(Subtree self) { - return ts_subtree_symbol(self) == ts_builtin_sym_error; -} - -static inline bool ts_subtree_is_eof(Subtree self) { - return ts_subtree_symbol(self) == ts_builtin_sym_end; -} - -static inline Subtree ts_subtree_from_mut(MutableSubtree self) { - Subtree result; - result.data = self.data; - return result; -} - -static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) { - MutableSubtree result; - result.data = self.data; - return result; -} - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_SUBTREE_H_ diff --git a/src/tree_sitter/tree.c b/src/tree_sitter/tree.c deleted file mode 100644 index 391fa7f592..0000000000 --- a/src/tree_sitter/tree.c +++ /dev/null @@ -1,148 +0,0 @@ -#include "tree_sitter/api.h" -#include "./array.h" -#include "./get_changed_ranges.h" -#include "./subtree.h" -#include "./tree_cursor.h" -#include "./tree.h" - -static const unsigned PARENT_CACHE_CAPACITY = 32; - -TSTree *ts_tree_new( - Subtree root, const TSLanguage *language, - const TSRange 
*included_ranges, unsigned included_range_count -) { - TSTree *result = ts_malloc(sizeof(TSTree)); - result->root = root; - result->language = language; - result->parent_cache = NULL; - result->parent_cache_start = 0; - result->parent_cache_size = 0; - result->included_ranges = ts_calloc(included_range_count, sizeof(TSRange)); - memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange)); - result->included_range_count = included_range_count; - return result; -} - -TSTree *ts_tree_copy(const TSTree *self) { - ts_subtree_retain(self->root); - return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count); -} - -void ts_tree_delete(TSTree *self) { - if (!self) return; - - SubtreePool pool = ts_subtree_pool_new(0); - ts_subtree_release(&pool, self->root); - ts_subtree_pool_delete(&pool); - ts_free(self->included_ranges); - if (self->parent_cache) ts_free(self->parent_cache); - ts_free(self); -} - -TSNode ts_tree_root_node(const TSTree *self) { - return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0); -} - -const TSLanguage *ts_tree_language(const TSTree *self) { - return self->language; -} - -void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { - for (unsigned i = 0; i < self->included_range_count; i++) { - TSRange *range = &self->included_ranges[i]; - if (range->end_byte >= edit->old_end_byte) { - if (range->end_byte != UINT32_MAX) { - range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte); - range->end_point = point_add( - edit->new_end_point, - point_sub(range->end_point, edit->old_end_point) - ); - if (range->end_byte < edit->new_end_byte) { - range->end_byte = UINT32_MAX; - range->end_point = POINT_MAX; - } - } - if (range->start_byte >= edit->old_end_byte) { - range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); - range->start_point = point_add( - edit->new_end_point, - point_sub(range->start_point, edit->old_end_point) - 
); - if (range->start_byte < edit->new_end_byte) { - range->start_byte = UINT32_MAX; - range->start_point = POINT_MAX; - } - } - } - } - - SubtreePool pool = ts_subtree_pool_new(0); - self->root = ts_subtree_edit(self->root, edit, &pool); - self->parent_cache_start = 0; - self->parent_cache_size = 0; - ts_subtree_pool_delete(&pool); -} - -TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) { - TreeCursor cursor1 = {NULL, array_new()}; - TreeCursor cursor2 = {NULL, array_new()}; - ts_tree_cursor_init(&cursor1, ts_tree_root_node(self)); - ts_tree_cursor_init(&cursor2, ts_tree_root_node(other)); - - TSRangeArray included_range_differences = array_new(); - ts_range_array_get_changed_ranges( - self->included_ranges, self->included_range_count, - other->included_ranges, other->included_range_count, - &included_range_differences - ); - - TSRange *result; - *count = ts_subtree_get_changed_ranges( - &self->root, &other->root, &cursor1, &cursor2, - self->language, &included_range_differences, &result - ); - - array_delete(&included_range_differences); - array_delete(&cursor1.stack); - array_delete(&cursor2.stack); - return result; -} - -void ts_tree_print_dot_graph(const TSTree *self, FILE *file) { - ts_subtree_print_dot_graph(self->root, self->language, file); -} - -TSNode ts_tree_get_cached_parent(const TSTree *self, const TSNode *node) { - for (uint32_t i = 0; i < self->parent_cache_size; i++) { - uint32_t index = (self->parent_cache_start + i) % PARENT_CACHE_CAPACITY; - ParentCacheEntry *entry = &self->parent_cache[index]; - if (entry->child == node->id) { - return ts_node_new(self, entry->parent, entry->position, entry->alias_symbol); - } - } - return ts_node_new(NULL, NULL, length_zero(), 0); -} - -void ts_tree_set_cached_parent(const TSTree *_self, const TSNode *node, const TSNode *parent) { - TSTree *self = (TSTree *)_self; - if (!self->parent_cache) { - self->parent_cache = ts_calloc(PARENT_CACHE_CAPACITY, 
sizeof(ParentCacheEntry)); - } - - uint32_t index = (self->parent_cache_start + self->parent_cache_size) % PARENT_CACHE_CAPACITY; - self->parent_cache[index] = (ParentCacheEntry) { - .child = node->id, - .parent = (const Subtree *)parent->id, - .position = { - parent->context[0], - {parent->context[1], parent->context[2]} - }, - .alias_symbol = parent->context[3], - }; - - if (self->parent_cache_size == PARENT_CACHE_CAPACITY) { - self->parent_cache_start++; - } else { - self->parent_cache_size++; - } -} diff --git a/src/tree_sitter/tree.h b/src/tree_sitter/tree.h deleted file mode 100644 index 92a7e64179..0000000000 --- a/src/tree_sitter/tree.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef TREE_SITTER_TREE_H_ -#define TREE_SITTER_TREE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - const Subtree *child; - const Subtree *parent; - Length position; - TSSymbol alias_symbol; -} ParentCacheEntry; - -struct TSTree { - Subtree root; - const TSLanguage *language; - ParentCacheEntry *parent_cache; - uint32_t parent_cache_start; - uint32_t parent_cache_size; - TSRange *included_ranges; - unsigned included_range_count; -}; - -TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned); -TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol); -TSNode ts_tree_get_cached_parent(const TSTree *, const TSNode *); -void ts_tree_set_cached_parent(const TSTree *, const TSNode *, const TSNode *); - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_TREE_H_ diff --git a/src/tree_sitter/tree_cursor.c b/src/tree_sitter/tree_cursor.c deleted file mode 100644 index 00b9679d73..0000000000 --- a/src/tree_sitter/tree_cursor.c +++ /dev/null @@ -1,367 +0,0 @@ -#include "tree_sitter/api.h" -#include "./alloc.h" -#include "./tree_cursor.h" -#include "./language.h" -#include "./tree.h" - -typedef struct { - Subtree parent; - const TSTree *tree; - Length position; - uint32_t child_index; - uint32_t structural_child_index; - const 
TSSymbol *alias_sequence; -} CursorChildIterator; - -// CursorChildIterator - -static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) { - TreeCursorEntry *last_entry = array_back(&self->stack); - if (ts_subtree_child_count(*last_entry->subtree) == 0) { - return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, NULL}; - } - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - last_entry->subtree->ptr->production_id - ); - return (CursorChildIterator) { - .tree = self->tree, - .parent = *last_entry->subtree, - .position = last_entry->position, - .child_index = 0, - .structural_child_index = 0, - .alias_sequence = alias_sequence, - }; -} - -static inline bool ts_tree_cursor_child_iterator_next(CursorChildIterator *self, - TreeCursorEntry *result, - bool *visible) { - if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false; - const Subtree *child = &self->parent.ptr->children[self->child_index]; - *result = (TreeCursorEntry) { - .subtree = child, - .position = self->position, - .child_index = self->child_index, - .structural_child_index = self->structural_child_index, - }; - *visible = ts_subtree_visible(*child); - bool extra = ts_subtree_extra(*child); - if (!extra && self->alias_sequence) { - *visible |= self->alias_sequence[self->structural_child_index]; - self->structural_child_index++; - } - - self->position = length_add(self->position, ts_subtree_size(*child)); - self->child_index++; - - if (self->child_index < self->parent.ptr->child_count) { - Subtree next_child = self->parent.ptr->children[self->child_index]; - self->position = length_add(self->position, ts_subtree_padding(next_child)); - } - - return true; -} - -// TSTreeCursor - lifecycle - -TSTreeCursor ts_tree_cursor_new(TSNode node) { - TSTreeCursor self = {NULL, NULL, {0, 0}}; - ts_tree_cursor_init((TreeCursor *)&self, node); - return self; -} - -void 
ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) { - ts_tree_cursor_init((TreeCursor *)_self, node); -} - -void ts_tree_cursor_init(TreeCursor *self, TSNode node) { - self->tree = node.tree; - array_clear(&self->stack); - array_push(&self->stack, ((TreeCursorEntry) { - .subtree = (const Subtree *)node.id, - .position = { - ts_node_start_byte(node), - ts_node_start_point(node) - }, - .child_index = 0, - .structural_child_index = 0, - })); -} - -void ts_tree_cursor_delete(TSTreeCursor *_self) { - TreeCursor *self = (TreeCursor *)_self; - array_delete(&self->stack); -} - -// TSTreeCursor - walking the tree - -bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) { - TreeCursor *self = (TreeCursor *)_self; - - bool did_descend; - do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - if (visible) { - array_push(&self->stack, entry); - return true; - } - - if (ts_subtree_visible_child_count(*entry.subtree) > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } - } - } while (did_descend); - - return false; -} - -int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) { - TreeCursor *self = (TreeCursor *)_self; - uint32_t initial_size = self->stack.size; - uint32_t visible_child_index = 0; - - bool did_descend; - do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes; - bool at_goal = end_byte > goal_byte; - uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); - - if (at_goal) { - if (visible) { - array_push(&self->stack, entry); - return visible_child_index; - } - - if 
(visible_child_count > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } - } else if (visible) { - visible_child_index++; - } else { - visible_child_index += visible_child_count; - } - } - } while (did_descend); - - if (self->stack.size > initial_size && - ts_tree_cursor_goto_next_sibling((TSTreeCursor *)self)) { - return visible_child_index; - } - - self->stack.size = initial_size; - return -1; -} - -bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { - TreeCursor *self = (TreeCursor *)_self; - uint32_t initial_size = self->stack.size; - - while (self->stack.size > 1) { - TreeCursorEntry entry = array_pop(&self->stack); - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - iterator.child_index = entry.child_index; - iterator.structural_child_index = entry.structural_child_index; - iterator.position = entry.position; - - bool visible = false; - ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible); - if (visible && self->stack.size + 1 < initial_size) break; - - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - if (visible) { - array_push(&self->stack, entry); - return true; - } - - if (ts_subtree_visible_child_count(*entry.subtree)) { - array_push(&self->stack, entry); - ts_tree_cursor_goto_first_child(_self); - return true; - } - } - } - - self->stack.size = initial_size; - return false; -} - -bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { - TreeCursor *self = (TreeCursor *)_self; - for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; - bool is_aliased = false; - if (i > 0) { - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - is_aliased = alias_sequence && alias_sequence[entry->structural_child_index]; - } - if (ts_subtree_visible(*entry->subtree) || is_aliased) 
{ - self->stack.size = i + 1; - return true; - } - } - return false; -} - -TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { - const TreeCursor *self = (const TreeCursor *)_self; - TreeCursorEntry *last_entry = array_back(&self->stack); - TSSymbol alias_symbol = 0; - if (self->stack.size > 1) { - TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2]; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - if (alias_sequence && !ts_subtree_extra(*last_entry->subtree)) { - alias_symbol = alias_sequence[last_entry->structural_child_index]; - } - } - return ts_node_new( - self->tree, - last_entry->subtree, - last_entry->position, - alias_symbol - ); -} - -TSFieldId ts_tree_cursor_current_status( - const TSTreeCursor *_self, - bool *can_have_later_siblings, - bool *can_have_later_siblings_with_this_field -) { - const TreeCursor *self = (const TreeCursor *)_self; - TSFieldId result = 0; - *can_have_later_siblings = false; - *can_have_later_siblings_with_this_field = false; - - // Walk up the tree, visiting the current node and its invisible ancestors, - // because fields can refer to nodes through invisible *wrapper* nodes, - for (unsigned i = self->stack.size - 1; i > 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; - - // Stop walking up when a visible ancestor is found. 
- if (i != self->stack.size - 1) { - if (ts_subtree_visible(*entry->subtree)) break; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - if (alias_sequence && alias_sequence[entry->structural_child_index]) { - break; - } - } - - if (ts_subtree_child_count(*parent_entry->subtree) > entry->child_index + 1) { - *can_have_later_siblings = true; - } - - if (ts_subtree_extra(*entry->subtree)) break; - - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map( - self->tree->language, - parent_entry->subtree->ptr->production_id, - &field_map, &field_map_end - ); - - // Look for a field name associated with the current node. - if (!result) { - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (!i->inherited && i->child_index == entry->structural_child_index) { - result = i->field_id; - *can_have_later_siblings_with_this_field = false; - break; - } - } - } - - // Determine if there other later siblings with the same field name. - if (result) { - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (i->field_id == result && i->child_index > entry->structural_child_index) { - *can_have_later_siblings_with_this_field = true; - break; - } - } - } - } - - return result; -} - -TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { - const TreeCursor *self = (const TreeCursor *)_self; - - // Walk up the tree, visiting the current node and its invisible ancestors. - for (unsigned i = self->stack.size - 1; i > 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; - - // Stop walking up when another visible node is found. 
- if (i != self->stack.size - 1) { - if (ts_subtree_visible(*entry->subtree)) break; - const TSSymbol *alias_sequence = ts_language_alias_sequence( - self->tree->language, - parent_entry->subtree->ptr->production_id - ); - if (alias_sequence && alias_sequence[entry->structural_child_index]) { - break; - } - } - - if (ts_subtree_extra(*entry->subtree)) break; - - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map( - self->tree->language, - parent_entry->subtree->ptr->production_id, - &field_map, &field_map_end - ); - for (const TSFieldMapEntry *i = field_map; i < field_map_end; i++) { - if (!i->inherited && i->child_index == entry->structural_child_index) { - return i->field_id; - } - } - } - return 0; -} - -const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) { - TSFieldId id = ts_tree_cursor_current_field_id(_self); - if (id) { - const TreeCursor *self = (const TreeCursor *)_self; - return self->tree->language->field_names[id]; - } else { - return NULL; - } -} - -TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) { - const TreeCursor *cursor = (const TreeCursor *)_cursor; - TSTreeCursor res = {NULL, NULL, {0, 0}}; - TreeCursor *copy = (TreeCursor *)&res; - copy->tree = cursor->tree; - array_push_all(&copy->stack, &cursor->stack); - return res; -} diff --git a/src/tree_sitter/tree_cursor.h b/src/tree_sitter/tree_cursor.h deleted file mode 100644 index 5a39dd278c..0000000000 --- a/src/tree_sitter/tree_cursor.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef TREE_SITTER_TREE_CURSOR_H_ -#define TREE_SITTER_TREE_CURSOR_H_ - -#include "./subtree.h" - -typedef struct { - const Subtree *subtree; - Length position; - uint32_t child_index; - uint32_t structural_child_index; -} TreeCursorEntry; - -typedef struct { - const TSTree *tree; - Array(TreeCursorEntry) stack; -} TreeCursor; - -void ts_tree_cursor_init(TreeCursor *, TSNode); -TSFieldId ts_tree_cursor_current_status(const TSTreeCursor *, bool *, bool *); - -#endif //
TREE_SITTER_TREE_CURSOR_H_ diff --git a/src/tree_sitter/treesitter_commit_hash.txt b/src/tree_sitter/treesitter_commit_hash.txt deleted file mode 100644 index 322cdd24a6..0000000000 --- a/src/tree_sitter/treesitter_commit_hash.txt +++ /dev/null @@ -1 +0,0 @@ -87df53a99b51bce0d1e901cd6838f24e1c7a4073 diff --git a/src/tree_sitter/unicode.h b/src/tree_sitter/unicode.h deleted file mode 100644 index 2ab51c2a3a..0000000000 --- a/src/tree_sitter/unicode.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef TREE_SITTER_UNICODE_H_ -#define TREE_SITTER_UNICODE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <limits.h> -#include <stdint.h> - -#define U_EXPORT -#define U_EXPORT2 -#include "./unicode/utf8.h" -#include "./unicode/utf16.h" - -static const int32_t TS_DECODE_ERROR = U_SENTINEL; - -// These functions read one unicode code point from the given string, -// returning the number of bytes consumed. -typedef uint32_t (*UnicodeDecodeFunction)( - const uint8_t *string, - uint32_t length, - int32_t *code_point -); - -static inline uint32_t ts_decode_utf8( - const uint8_t *string, - uint32_t length, - int32_t *code_point -) { - uint32_t i = 0; - U8_NEXT(string, i, length, *code_point); - return i; -} - -static inline uint32_t ts_decode_utf16( - const uint8_t *string, - uint32_t length, - int32_t *code_point -) { - uint32_t i = 0; - U16_NEXT(((uint16_t *)string), i, length, *code_point); - return i * 2; -} - -#ifdef __cplusplus -} -#endif - -#endif // TREE_SITTER_UNICODE_H_ diff --git a/src/tree_sitter/unicode/ICU_SHA b/src/tree_sitter/unicode/ICU_SHA deleted file mode 100644 index 3622283ba3..0000000000 --- a/src/tree_sitter/unicode/ICU_SHA +++ /dev/null @@ -1 +0,0 @@ -552b01f61127d30d6589aa4bf99468224979b661 diff --git a/src/tree_sitter/unicode/LICENSE b/src/tree_sitter/unicode/LICENSE deleted file mode 100644 index 2e01e36876..0000000000 --- a/src/tree_sitter/unicode/LICENSE +++ /dev/null @@ -1,414 +0,0 @@ -COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) - -Copyright © 
1991-2019 Unicode, Inc. All rights reserved. -Distributed under the Terms of Use in https://www.unicode.org/copyright.html. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of the Unicode data files and any associated documentation -(the "Data Files") or Unicode software and any associated documentation -(the "Software") to deal in the Data Files or Software -without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, and/or sell copies of -the Data Files or Software, and to permit persons to whom the Data Files -or Software are furnished to do so, provided that either -(a) this copyright and permission notice appear with all copies -of the Data Files or Software, or -(b) this copyright and permission notice appear in associated -Documentation. - -THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT OF THIRD PARTY RIGHTS. -IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS -NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL -DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, -DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THE DATA FILES OR SOFTWARE. - -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, -use or other dealings in these Data Files or Software without prior -written authorization of the copyright holder. - ---------------------- - -Third-Party Software Licenses - -This section contains third-party software notices and/or additional -terms for licensed third-party software components included within ICU -libraries. - -1. 
ICU License - ICU 1.8.1 to ICU 57.1 - -COPYRIGHT AND PERMISSION NOTICE - -Copyright (c) 1995-2016 International Business Machines Corporation and others -All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, and/or sell copies of the Software, and to permit persons -to whom the Software is furnished to do so, provided that the above -copyright notice(s) and this permission notice appear in all copies of -the Software and that both the above copyright notice(s) and this -permission notice appear in supporting documentation. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT -OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY -SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER -RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF -CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, use -or other dealings in this Software without prior written authorization -of the copyright holder. - -All trademarks and registered trademarks mentioned herein are the -property of their respective owners. - -2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt) - - # The Google Chrome software developed by Google is licensed under - # the BSD license. Other software included in this distribution is - # provided under other licenses, as set forth below. 
- # - # The BSD License - # http://opensource.org/licenses/bsd-license.php - # Copyright (C) 2006-2008, Google Inc. - # - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions are met: - # - # Redistributions of source code must retain the above copyright notice, - # this list of conditions and the following disclaimer. - # Redistributions in binary form must reproduce the above - # copyright notice, this list of conditions and the following - # disclaimer in the documentation and/or other materials provided with - # the distribution. - # Neither the name of Google Inc. nor the names of its - # contributors may be used to endorse or promote products derived from - # this software without specific prior written permission. - # - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, - # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # - # - # The word list in cjdict.txt are generated by combining three word lists - # listed below with further processing for compound word breaking. The - # frequency is generated with an iterative training against Google web - # corpora. 
- # - # * Libtabe (Chinese) - # - https://sourceforge.net/project/?group_id=1519 - # - Its license terms and conditions are shown below. - # - # * IPADIC (Japanese) - # - http://chasen.aist-nara.ac.jp/chasen/distribution.html - # - Its license terms and conditions are shown below. - # - # ---------COPYING.libtabe ---- BEGIN-------------------- - # - # /* - # * Copyright (c) 1999 TaBE Project. - # * Copyright (c) 1999 Pai-Hsiang Hsiao. - # * All rights reserved. - # * - # * Redistribution and use in source and binary forms, with or without - # * modification, are permitted provided that the following conditions - # * are met: - # * - # * . Redistributions of source code must retain the above copyright - # * notice, this list of conditions and the following disclaimer. - # * . Redistributions in binary form must reproduce the above copyright - # * notice, this list of conditions and the following disclaimer in - # * the documentation and/or other materials provided with the - # * distribution. - # * . Neither the name of the TaBE Project nor the names of its - # * contributors may be used to endorse or promote products derived - # * from this software without specific prior written permission. - # * - # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # * OF THE POSSIBILITY OF SUCH DAMAGE. - # */ - # - # /* - # * Copyright (c) 1999 Computer Systems and Communication Lab, - # * Institute of Information Science, Academia - # * Sinica. All rights reserved. - # * - # * Redistribution and use in source and binary forms, with or without - # * modification, are permitted provided that the following conditions - # * are met: - # * - # * . Redistributions of source code must retain the above copyright - # * notice, this list of conditions and the following disclaimer. - # * . Redistributions in binary form must reproduce the above copyright - # * notice, this list of conditions and the following disclaimer in - # * the documentation and/or other materials provided with the - # * distribution. - # * . Neither the name of the Computer Systems and Communication Lab - # * nor the names of its contributors may be used to endorse or - # * promote products derived from this software without specific - # * prior written permission. - # * - # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # * OF THE POSSIBILITY OF SUCH DAMAGE. - # */ - # - # Copyright 1996 Chih-Hao Tsai @ Beckman Institute, - # University of Illinois - # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4 - # - # ---------------COPYING.libtabe-----END-------------------------------- - # - # - # ---------------COPYING.ipadic-----BEGIN------------------------------- - # - # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science - # and Technology. All Rights Reserved. - # - # Use, reproduction, and distribution of this software is permitted. - # Any copy of this software, whether in its original form or modified, - # must include both the above copyright notice and the following - # paragraphs. - # - # Nara Institute of Science and Technology (NAIST), - # the copyright holders, disclaims all warranties with regard to this - # software, including all implied warranties of merchantability and - # fitness, in no event shall NAIST be liable for - # any special, indirect or consequential damages or any damages - # whatsoever resulting from loss of use, data or profits, whether in an - # action of contract, negligence or other tortuous action, arising out - # of or in connection with the use or performance of this software. - # - # A large portion of the dictionary entries - # originate from ICOT Free Software. The following conditions for ICOT - # Free Software applies to the current dictionary as well. 
- # - # Each User may also freely distribute the Program, whether in its - # original form or modified, to any third party or parties, PROVIDED - # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear - # on, or be attached to, the Program, which is distributed substantially - # in the same form as set out herein and that such intended - # distribution, if actually made, will neither violate or otherwise - # contravene any of the laws and regulations of the countries having - # jurisdiction over the User or the intended distribution itself. - # - # NO WARRANTY - # - # The program was produced on an experimental basis in the course of the - # research and development conducted during the project and is provided - # to users as so produced on an experimental basis. Accordingly, the - # program is provided without any warranty whatsoever, whether express, - # implied, statutory or otherwise. The term "warranty" used herein - # includes, but is not limited to, any warranty of the quality, - # performance, merchantability and fitness for a particular purpose of - # the program and the nonexistence of any infringement or violation of - # any right of any third party. - # - # Each user of the program will agree and understand, and be deemed to - # have agreed and understood, that there is no warranty whatsoever for - # the program and, accordingly, the entire risk arising from or - # otherwise connected with the program is assumed by the user. 
- # - # Therefore, neither ICOT, the copyright holder, or any other - # organization that participated in or was otherwise related to the - # development of the program and their respective officials, directors, - # officers and other employees shall be held liable for any and all - # damages, including, without limitation, general, special, incidental - # and consequential damages, arising out of or otherwise in connection - # with the use or inability to use the program or any product, material - # or result produced or otherwise obtained by using the program, - # regardless of whether they have been advised of, or otherwise had - # knowledge of, the possibility of such damages at any time during the - # project or thereafter. Each user will be deemed to have agreed to the - # foregoing by his or her commencement of use of the program. The term - # "use" as used herein includes, but is not limited to, the use, - # modification, copying and distribution of the program and the - # production of secondary products from the program. - # - # In the case where the program, whether in its original form or - # modified, was distributed or delivered to or received by a user from - # any person, organization or entity other than ICOT, unless it makes or - # grants independently of ICOT any specific warranty to the user in - # writing, such person, organization or entity, will also be exempted - # from and not be held liable to the user for any such damages as noted - # above as far as the program is concerned. - # - # ---------------COPYING.ipadic-----END---------------------------------- - -3. Lao Word Break Dictionary Data (laodict.txt) - - # Copyright (c) 2013 International Business Machines Corporation - # and others. All Rights Reserved. 
- # - # Project: http://code.google.com/p/lao-dictionary/ - # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt - # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt - # (copied below) - # - # This file is derived from the above dictionary, with slight - # modifications. - # ---------------------------------------------------------------------- - # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell. - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, - # are permitted provided that the following conditions are met: - # - # - # Redistributions of source code must retain the above copyright notice, this - # list of conditions and the following disclaimer. Redistributions in - # binary form must reproduce the above copyright notice, this list of - # conditions and the following disclaimer in the documentation and/or - # other materials provided with the distribution. - # - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - # OF THE POSSIBILITY OF SUCH DAMAGE. - # -------------------------------------------------------------------------- - -4. 
Burmese Word Break Dictionary Data (burmesedict.txt) - - # Copyright (c) 2014 International Business Machines Corporation - # and others. All Rights Reserved. - # - # This list is part of a project hosted at: - # github.com/kanyawtech/myanmar-karen-word-lists - # - # -------------------------------------------------------------------------- - # Copyright (c) 2013, LeRoy Benjamin Sharon - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: Redistributions of source code must retain the above - # copyright notice, this list of conditions and the following - # disclaimer. Redistributions in binary form must reproduce the - # above copyright notice, this list of conditions and the following - # disclaimer in the documentation and/or other materials provided - # with the distribution. - # - # Neither the name Myanmar Karen Word Lists, nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND - # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, - # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS - # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR - # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF - # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - # SUCH DAMAGE. 
- # -------------------------------------------------------------------------- - -5. Time Zone Database - - ICU uses the public domain data and code derived from Time Zone -Database for its time zone support. The ownership of the TZ database -is explained in BCP 175: Procedure for Maintaining the Time Zone -Database section 7. - - # 7. Database Ownership - # - # The TZ database itself is not an IETF Contribution or an IETF - # document. Rather it is a pre-existing and regularly updated work - # that is in the public domain, and is intended to remain in the - # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do - # not apply to the TZ Database or contributions that individuals make - # to it. Should any claims be made and substantiated against the TZ - # Database, the organization that is providing the IANA - # Considerations defined in this RFC, under the memorandum of - # understanding with the IETF, currently ICANN, may act in accordance - # with all competent court orders. No ownership claims will be made - # by ICANN or the IETF Trust on the database or the code. Any person - # making a contribution to the database or code waives all rights to - # future claims in that contribution or in the TZ Database. - -6. Google double-conversion - -Copyright 2006-2011, the V8 project authors. All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of Google Inc. 
nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/tree_sitter/unicode/README.md b/src/tree_sitter/unicode/README.md deleted file mode 100644 index 623b8e3843..0000000000 --- a/src/tree_sitter/unicode/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# ICU Parts - -This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu). - -### License - -The license for these files is contained in the `LICENSE` file within this directory. - -### Contents - -* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory: - * `utf8.h` - * `utf16.h` - * `umachine.h` -* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed: - * `ptypes.h` - * `urename.h` - * `utf.h` -* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained. 
-* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository. -* `README.md` - This text file. - -### Updating ICU - -To incorporate changes from the upstream `icu` repository: - -* Update `ICU_SHA` with the new Git SHA. -* Update `LICENSE` with the license text from the directory mentioned above. -* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository. diff --git a/src/tree_sitter/unicode/ptypes.h b/src/tree_sitter/unicode/ptypes.h deleted file mode 100644 index ac79ad0f98..0000000000 --- a/src/tree_sitter/unicode/ptypes.h +++ /dev/null @@ -1 +0,0 @@ -// This file must exist in order for `utf8.h` and `utf16.h` to be used. diff --git a/src/tree_sitter/unicode/umachine.h b/src/tree_sitter/unicode/umachine.h deleted file mode 100644 index bbf6ef9c8b..0000000000 --- a/src/tree_sitter/unicode/umachine.h +++ /dev/null @@ -1,448 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 1999-2015, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: umachine.h -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999sep13 -* created by: Markus W. Scherer -* -* This file defines basic types and constants for ICU to be -* platform-independent. umachine.h and utf.h are included into -* utypes.h to provide all the general definitions for ICU. -* All of these definitions used to be in utypes.h before -* the UTF-handling macros made this unmaintainable. 
-*/ - -#ifndef __UMACHINE_H__ -#define __UMACHINE_H__ - - -/** - * \file - * \brief Basic types and constants for UTF - * - * <h2> Basic types and constants for UTF </h2> - * This file defines basic types and constants for utf.h to be - * platform-independent. umachine.h and utf.h are included into - * utypes.h to provide all the general definitions for ICU. - * All of these definitions used to be in utypes.h before - * the UTF-handling macros made this unmaintainable. - * - */ -/*==========================================================================*/ -/* Include platform-dependent definitions */ -/* which are contained in the platform-specific file platform.h */ -/*==========================================================================*/ - -#include "./ptypes.h" /* platform.h is included in ptypes.h */ - -/* - * ANSI C headers: - * stddef.h defines wchar_t - */ -#include <stddef.h> - -/*==========================================================================*/ -/* For C wrappers, we use the symbol U_STABLE. */ -/* This works properly if the includer is C or C++. */ -/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */ -/*==========================================================================*/ - -/** - * \def U_CFUNC - * This is used in a declaration of a library private ICU C function. - * @stable ICU 2.4 - */ - -/** - * \def U_CDECL_BEGIN - * This is used to begin a declaration of a library private ICU C API. 
- * @stable ICU 2.4 - */ - -/** - * \def U_CDECL_END - * This is used to end a declaration of a library private ICU C API - * @stable ICU 2.4 - */ - -#ifdef __cplusplus -# define U_CFUNC extern "C" -# define U_CDECL_BEGIN extern "C" { -# define U_CDECL_END } -#else -# define U_CFUNC extern -# define U_CDECL_BEGIN -# define U_CDECL_END -#endif - -#ifndef U_ATTRIBUTE_DEPRECATED -/** - * \def U_ATTRIBUTE_DEPRECATED - * This is used for GCC specific attributes - * @internal - */ -#if U_GCC_MAJOR_MINOR >= 302 -# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated)) -/** - * \def U_ATTRIBUTE_DEPRECATED - * This is used for Visual C++ specific attributes - * @internal - */ -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) -# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated) -#else -# define U_ATTRIBUTE_DEPRECATED -#endif -#endif - -/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/ -#define U_CAPI U_CFUNC U_EXPORT -/** This is used to declare a function as a stable public ICU C API*/ -#define U_STABLE U_CAPI -/** This is used to declare a function as a draft public ICU C API */ -#define U_DRAFT U_CAPI -/** This is used to declare a function as a deprecated public ICU C API */ -#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED -/** This is used to declare a function as an obsolete public ICU C API */ -#define U_OBSOLETE U_CAPI -/** This is used to declare a function as an internal ICU C API */ -#define U_INTERNAL U_CAPI - -/** - * \def U_OVERRIDE - * Defined to the C++11 "override" keyword if available. - * Denotes a class or member which is an override of the base class. - * May result in an error if it applied to something not an override. - * @internal - */ -#ifndef U_OVERRIDE -#define U_OVERRIDE override -#endif - -/** - * \def U_FINAL - * Defined to the C++11 "final" keyword if available. - * Denotes a class or member which may not be overridden in subclasses. - * May result in an error if subclasses attempt to override. 
- * @internal - */ -#if !defined(U_FINAL) || defined(U_IN_DOXYGEN) -#define U_FINAL final -#endif - -// Before ICU 65, function-like, multi-statement ICU macros were just defined as -// series of statements wrapped in { } blocks and the caller could choose to -// either treat them as if they were actual functions and end the invocation -// with a trailing ; creating an empty statement after the block or else omit -// this trailing ; using the knowledge that the macro would expand to { }. -// -// But doing so doesn't work well with macros that look like functions and -// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore -// switches to the standard solution of wrapping such macros in do { } while. -// -// This will however break existing code that depends on being able to invoke -// these macros without a trailing ; so to be able to remain compatible with -// such code the wrapper is itself defined as macros so that it's possible to -// build ICU 65 and later with the old macro behaviour, like this: -// -// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""' -// runConfigureICU ... - -/** - * \def UPRV_BLOCK_MACRO_BEGIN - * Defined as the "do" keyword by default. - * @internal - */ -#ifndef UPRV_BLOCK_MACRO_BEGIN -#define UPRV_BLOCK_MACRO_BEGIN do -#endif - -/** - * \def UPRV_BLOCK_MACRO_END - * Defined as "while (FALSE)" by default. 
- * @internal - */ -#ifndef UPRV_BLOCK_MACRO_END -#define UPRV_BLOCK_MACRO_END while (FALSE) -#endif - -/*==========================================================================*/ -/* limits for int32_t etc., like in POSIX inttypes.h */ -/*==========================================================================*/ - -#ifndef INT8_MIN -/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */ -# define INT8_MIN ((int8_t)(-128)) -#endif -#ifndef INT16_MIN -/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */ -# define INT16_MIN ((int16_t)(-32767-1)) -#endif -#ifndef INT32_MIN -/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */ -# define INT32_MIN ((int32_t)(-2147483647-1)) -#endif - -#ifndef INT8_MAX -/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */ -# define INT8_MAX ((int8_t)(127)) -#endif -#ifndef INT16_MAX -/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */ -# define INT16_MAX ((int16_t)(32767)) -#endif -#ifndef INT32_MAX -/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */ -# define INT32_MAX ((int32_t)(2147483647)) -#endif - -#ifndef UINT8_MAX -/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */ -# define UINT8_MAX ((uint8_t)(255U)) -#endif -#ifndef UINT16_MAX -/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */ -# define UINT16_MAX ((uint16_t)(65535U)) -#endif -#ifndef UINT32_MAX -/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */ -# define UINT32_MAX ((uint32_t)(4294967295U)) -#endif - -#if defined(U_INT64_T_UNAVAILABLE) -# error int64_t is required for decimal format and rule-based number format. -#else -# ifndef INT64_C -/** - * Provides a platform independent way to specify a signed 64-bit integer constant. 
- * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C - * @stable ICU 2.8 - */ -# define INT64_C(c) c ## LL -# endif -# ifndef UINT64_C -/** - * Provides a platform independent way to specify an unsigned 64-bit integer constant. - * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C - * @stable ICU 2.8 - */ -# define UINT64_C(c) c ## ULL -# endif -# ifndef U_INT64_MIN -/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */ -# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1)) -# endif -# ifndef U_INT64_MAX -/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */ -# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807))) -# endif -# ifndef U_UINT64_MAX -/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */ -# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615))) -# endif -#endif - -/*==========================================================================*/ -/* Boolean data type */ -/*==========================================================================*/ - -/** The ICU boolean type @stable ICU 2.0 */ -typedef int8_t UBool; - -#ifndef TRUE -/** The TRUE value of a UBool @stable ICU 2.0 */ -# define TRUE 1 -#endif -#ifndef FALSE -/** The FALSE value of a UBool @stable ICU 2.0 */ -# define FALSE 0 -#endif - - -/*==========================================================================*/ -/* Unicode data types */ -/*==========================================================================*/ - -/* wchar_t-related definitions -------------------------------------------- */ - -/* - * \def U_WCHAR_IS_UTF16 - * Defined if wchar_t uses UTF-16. - * - * @stable ICU 2.0 - */ -/* - * \def U_WCHAR_IS_UTF32 - * Defined if wchar_t uses UTF-32. 
- * - * @stable ICU 2.0 - */ -#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) -# ifdef __STDC_ISO_10646__ -# if (U_SIZEOF_WCHAR_T==2) -# define U_WCHAR_IS_UTF16 -# elif (U_SIZEOF_WCHAR_T==4) -# define U_WCHAR_IS_UTF32 -# endif -# elif defined __UCS2__ -# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2) -# define U_WCHAR_IS_UTF16 -# endif -# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__)) -# if (U_SIZEOF_WCHAR_T==4) -# define U_WCHAR_IS_UTF32 -# endif -# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED) -# define U_WCHAR_IS_UTF32 -# elif U_PLATFORM_HAS_WIN32_API -# define U_WCHAR_IS_UTF16 -# endif -#endif - -/* UChar and UChar32 definitions -------------------------------------------- */ - -/** Number of bytes in a UChar. @stable ICU 2.0 */ -#define U_SIZEOF_UCHAR 2 - -/** - * \def U_CHAR16_IS_TYPEDEF - * If 1, then char16_t is a typedef and not a real type (yet) - * @internal - */ -#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11) -// for AIX, uchar.h needs to be included -# include <uchar.h> -# define U_CHAR16_IS_TYPEDEF 1 -#elif defined(_MSC_VER) && (_MSC_VER < 1900) -// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type, -// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx -# define U_CHAR16_IS_TYPEDEF 1 -#else -# define U_CHAR16_IS_TYPEDEF 0 -#endif - - -/** - * \var UChar - * - * The base type for UTF-16 code units and pointers. - * Unsigned 16-bit integer. - * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar. - * - * UChar is configurable by defining the macro UCHAR_TYPE - * on the preprocessor or compiler command line: - * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc. - * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.) 
- * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16. - * - * The default is UChar=char16_t. - * - * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type. - * - * In C, char16_t is a simple typedef of uint_least16_t. - * ICU requires uint_least16_t=uint16_t for data memory mapping. - * On macOS, char16_t is not available because the uchar.h standard header is missing. - * - * @stable ICU 4.4 - */ - -#if 1 - // #if 1 is normal. UChar defaults to char16_t in C++. - // For configuration testing of UChar=uint16_t temporarily change this to #if 0. - // The intltest Makefile #defines UCHAR_TYPE=char16_t, - // so we only #define it to uint16_t if it is undefined so far. -#elif !defined(UCHAR_TYPE) -# define UCHAR_TYPE uint16_t -#endif - -#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \ - defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) - // Inside the ICU library code, never configurable. - typedef char16_t UChar; -#elif defined(UCHAR_TYPE) - typedef UCHAR_TYPE UChar; -#elif defined(__cplusplus) - typedef char16_t UChar; -#else - typedef uint16_t UChar; -#endif - -/** - * \var OldUChar - * Default ICU 58 definition of UChar. - * A base type for UTF-16 code units and pointers. - * Unsigned 16-bit integer. - * - * Define OldUChar to be wchar_t if that is 16 bits wide. - * If wchar_t is not 16 bits wide, then define UChar to be uint16_t. - * - * This makes the definition of OldUChar platform-dependent - * but allows direct string type compatibility with platforms with - * 16-bit wchar_t types. - * - * This is how UChar was defined in ICU 58, for transition convenience. - * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined. - * The current UChar responds to UCHAR_TYPE but OldUChar does not. 
- * - * @stable ICU 59 - */ -#if U_SIZEOF_WCHAR_T==2 - typedef wchar_t OldUChar; -#elif defined(__CHAR16_TYPE__) - typedef __CHAR16_TYPE__ OldUChar; -#else - typedef uint16_t OldUChar; -#endif - -/** - * Define UChar32 as a type for single Unicode code points. - * UChar32 is a signed 32-bit integer (same as int32_t). - * - * The Unicode code point range is 0..0x10ffff. - * All other values (negative or >=0x110000) are illegal as Unicode code points. - * They may be used as sentinel values to indicate "done", "error" - * or similar non-code point conditions. - * - * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined - * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) - * or else to be uint32_t. - * That is, the definition of UChar32 was platform-dependent. - * - * @see U_SENTINEL - * @stable ICU 2.4 - */ -typedef int32_t UChar32; - -/** - * This value is intended for sentinel values for APIs that - * (take or) return single code points (UChar32). - * It is outside of the Unicode code point range 0..0x10ffff. - * - * For example, a "done" or "error" value in a new API - * could be indicated with U_SENTINEL. - * - * ICU APIs designed before ICU 2.4 usually define service-specific "done" - * values, mostly 0xffff. - * Those may need to be distinguished from - * actual U+ffff text contents by calling functions like - * CharacterIterator::hasNext() or UnicodeString::length(). - * - * @return -1 - * @see UChar32 - * @stable ICU 2.4 - */ -#define U_SENTINEL (-1) - -#include "./urename.h" - -#endif diff --git a/src/tree_sitter/unicode/urename.h b/src/tree_sitter/unicode/urename.h deleted file mode 100644 index ac79ad0f98..0000000000 --- a/src/tree_sitter/unicode/urename.h +++ /dev/null @@ -1 +0,0 @@ -// This file must exist in order for `utf8.h` and `utf16.h` to be used. 
diff --git a/src/tree_sitter/unicode/utf.h b/src/tree_sitter/unicode/utf.h deleted file mode 100644 index ac79ad0f98..0000000000 --- a/src/tree_sitter/unicode/utf.h +++ /dev/null @@ -1 +0,0 @@ -// This file must exist in order for `utf8.h` and `utf16.h` to be used. diff --git a/src/tree_sitter/unicode/utf16.h b/src/tree_sitter/unicode/utf16.h deleted file mode 100644 index b547922441..0000000000 --- a/src/tree_sitter/unicode/utf16.h +++ /dev/null @@ -1,733 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* -* Copyright (C) 1999-2012, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: utf16.h -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999sep09 -* created by: Markus W. Scherer -*/ - -/** - * \file - * \brief C API: 16-bit Unicode handling macros - * - * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings. - * - * For more information see utf.h and the ICU User Guide Strings chapter - * (http://userguide.icu-project.org/strings). - * - * <em>Usage:</em> - * ICU coding guidelines for if() statements should be followed when using these macros. - * Compound statements (curly braces {}) must be used for if-else-while... - * bodies and all macro statements should be terminated with semicolon. - */ - -#ifndef __UTF16_H__ -#define __UTF16_H__ - -#include "./umachine.h" -#ifndef __UTF_H__ -# include "./utf.h" -#endif - -/* single-code point definitions -------------------------------------------- */ - -/** - * Does this code unit alone encode a code point (BMP, not a surrogate)? 
- * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) - -/** - * Is this code unit a lead surrogate (U+d800..U+dbff)? - * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) - -/** - * Is this code unit a trail surrogate (U+dc00..U+dfff)? - * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) - -/** - * Is this code unit a surrogate (U+d800..U+dfff)? - * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c) - -/** - * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), - * is it a lead surrogate? - * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) - -/** - * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), - * is it a trail surrogate? - * @param c 16-bit code unit - * @return TRUE or FALSE - * @stable ICU 4.2 - */ -#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0) - -/** - * Helper constant for U16_GET_SUPPLEMENTARY. - * @internal - */ -#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) - -/** - * Get a supplementary code point value (U+10000..U+10ffff) - * from its lead and trail surrogates. - * The result is undefined if the input values are not - * lead and trail surrogates. - * - * @param lead lead surrogate (U+d800..U+dbff) - * @param trail trail surrogate (U+dc00..U+dfff) - * @return supplementary code point (U+10000..U+10ffff) - * @stable ICU 2.4 - */ -#define U16_GET_SUPPLEMENTARY(lead, trail) \ - (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) - - -/** - * Get the lead surrogate (0xd800..0xdbff) for a - * supplementary code point (0x10000..0x10ffff). 
- * @param supplementary 32-bit code point (U+10000..U+10ffff) - * @return lead surrogate (U+d800..U+dbff) for supplementary - * @stable ICU 2.4 - */ -#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) - -/** - * Get the trail surrogate (0xdc00..0xdfff) for a - * supplementary code point (0x10000..0x10ffff). - * @param supplementary 32-bit code point (U+10000..U+10ffff) - * @return trail surrogate (U+dc00..U+dfff) for supplementary - * @stable ICU 2.4 - */ -#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) - -/** - * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) - * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). - * @param c 32-bit code point - * @return 1 or 2 - * @stable ICU 2.4 - */ -#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) - -/** - * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). - * @return 2 - * @stable ICU 2.4 - */ -#define U16_MAX_LENGTH 2 - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. - * "Unsafe" macro, assumes well-formed UTF-16. - * - * The offset may point to either the lead or trail surrogate unit - * for a supplementary code point, in which case the macro will read - * the adjacent matching surrogate as well. - * The result is undefined if the offset points to a single, unpaired surrogate. - * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. 
- * - * @param s const UChar * string - * @param i string offset - * @param c output UChar32 variable - * @see U16_GET - * @stable ICU 2.4 - */ -#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[i]; \ - if(U16_IS_SURROGATE(c)) { \ - if(U16_IS_SURROGATE_LEAD(c)) { \ - (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \ - } else { \ - (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The offset may point to either the lead or trail surrogate unit - * for a supplementary code point, in which case the macro will read - * the adjacent matching surrogate as well. - * - * The length can be negative for a NUL-terminated string. - * - * If the offset points to a single, unpaired surrogate, then - * c is set to that unpaired surrogate. - * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. - * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<=i<length - * @param length string length - * @param c output UChar32 variable - * @see U16_GET_UNSAFE - * @stable ICU 2.4 - */ -#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[i]; \ - if(U16_IS_SURROGATE(c)) { \ - uint16_t __c2; \ - if(U16_IS_SURROGATE_LEAD(c)) { \ - if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \ - (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ - } \ - } else { \ - if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ - (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ - } \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 
- * - * The offset may point to either the lead or trail surrogate unit - * for a supplementary code point, in which case the macro will read - * the adjacent matching surrogate as well. - * - * The length can be negative for a NUL-terminated string. - * - * If the offset points to a single, unpaired surrogate, then - * c is set to U+FFFD. - * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD. - * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<=i<length - * @param length string length - * @param c output UChar32 variable - * @see U16_GET_UNSAFE - * @stable ICU 60 - */ -#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[i]; \ - if(U16_IS_SURROGATE(c)) { \ - uint16_t __c2; \ - if(U16_IS_SURROGATE_LEAD(c)) { \ - if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \ - (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ - } else { \ - (c)=0xfffd; \ - } \ - } else { \ - if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ - (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ - } else { \ - (c)=0xfffd; \ - } \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/* definitions with forward iteration --------------------------------------- */ - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Unsafe" macro, assumes well-formed UTF-16. - * - * The offset may point to the lead surrogate unit - * for a supplementary code point, in which case the macro will read - * the following trail surrogate as well. - * If the offset points to a trail surrogate, then that itself - * will be returned as the code point. - * The result is undefined if the offset points to a single, unpaired lead surrogate. 
- * - * @param s const UChar * string - * @param i string offset - * @param c output UChar32 variable - * @see U16_NEXT - * @stable ICU 2.4 - */ -#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[(i)++]; \ - if(U16_IS_LEAD(c)) { \ - (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * The offset may point to the lead surrogate unit - * for a supplementary code point, in which case the macro will read - * the following trail surrogate as well. - * If the offset points to a trail surrogate or - * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. - * - * @param s const UChar * string - * @param i string offset, must be i<length - * @param length string length - * @param c output UChar32 variable - * @see U16_NEXT_UNSAFE - * @stable ICU 2.4 - */ -#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[(i)++]; \ - if(U16_IS_LEAD(c)) { \ - uint16_t __c2; \ - if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ - ++(i); \ - (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * The offset may point to the lead surrogate unit - * for a supplementary code point, in which case the macro will read - * the following trail surrogate as well. 
- * If the offset points to a trail surrogate or - * to a single, unpaired lead surrogate, then c is set to U+FFFD. - * - * @param s const UChar * string - * @param i string offset, must be i<length - * @param length string length - * @param c output UChar32 variable - * @see U16_NEXT_UNSAFE - * @stable ICU 60 - */ -#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[(i)++]; \ - if(U16_IS_SURROGATE(c)) { \ - uint16_t __c2; \ - if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ - ++(i); \ - (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ - } else { \ - (c)=0xfffd; \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Append a code point to a string, overwriting 1 or 2 code units. - * The offset points to the current end of the string contents - * and is advanced (post-increment). - * "Unsafe" macro, assumes a valid code point and sufficient space in the string. - * Otherwise, the result is undefined. - * - * @param s const UChar * string buffer - * @param i string offset - * @param c code point to append - * @see U16_APPEND - * @stable ICU 2.4 - */ -#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - if((uint32_t)(c)<=0xffff) { \ - (s)[(i)++]=(uint16_t)(c); \ - } else { \ - (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ - (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Append a code point to a string, overwriting 1 or 2 code units. - * The offset points to the current end of the string contents - * and is advanced (post-increment). - * "Safe" macro, checks for a valid code point. - * If a surrogate pair is written, checks for sufficient space in the string. - * If the code point is not valid or a trail surrogate does not fit, - * then isError is set to TRUE. 
- * - * @param s const UChar * string buffer - * @param i string offset, must be i<capacity - * @param capacity size of the string buffer - * @param c code point to append - * @param isError output UBool set to TRUE if an error occurs, otherwise not modified - * @see U16_APPEND_UNSAFE - * @stable ICU 2.4 - */ -#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \ - if((uint32_t)(c)<=0xffff) { \ - (s)[(i)++]=(uint16_t)(c); \ - } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \ - (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ - (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ - } else /* c>0x10ffff or not enough space */ { \ - (isError)=TRUE; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the next. - * (Post-incrementing iteration.) - * "Unsafe" macro, assumes well-formed UTF-16. - * - * @param s const UChar * string - * @param i string offset - * @see U16_FWD_1 - * @stable ICU 2.4 - */ -#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_LEAD((s)[(i)++])) { \ - ++(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the next. - * (Post-incrementing iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * @param s const UChar * string - * @param i string offset, must be i<length - * @param length string length - * @see U16_FWD_1_UNSAFE - * @stable ICU 2.4 - */ -#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \ - ++(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the n-th next one, - * i.e., move forward by n code points. - * (Post-incrementing iteration.) - * "Unsafe" macro, assumes well-formed UTF-16. 
- * - * @param s const UChar * string - * @param i string offset - * @param n number of code points to skip - * @see U16_FWD_N - * @stable ICU 2.4 - */ -#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0) { \ - U16_FWD_1_UNSAFE(s, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the n-th next one, - * i.e., move forward by n code points. - * (Post-incrementing iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * @param s const UChar * string - * @param i int32_t string offset, must be i<length - * @param length int32_t string length - * @param n number of code points to skip - * @see U16_FWD_N_UNSAFE - * @stable ICU 2.4 - */ -#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ - U16_FWD_1(s, i, length); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary - * at the start of a code point. - * If the offset points to the trail surrogate of a surrogate pair, - * then the offset is decremented. - * Otherwise, it is not modified. - * "Unsafe" macro, assumes well-formed UTF-16. - * - * @param s const UChar * string - * @param i string offset - * @see U16_SET_CP_START - * @stable ICU 2.4 - */ -#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_TRAIL((s)[i])) { \ - --(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary - * at the start of a code point. - * If the offset points to the trail surrogate of a surrogate pair, - * then the offset is decremented. - * Otherwise, it is not modified. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 
- * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<=i - * @see U16_SET_CP_START_UNSAFE - * @stable ICU 2.4 - */ -#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ - --(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/* definitions with backward iteration -------------------------------------- */ - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Unsafe" macro, assumes well-formed UTF-16. - * - * The input offset may be the same as the string length. - * If the offset is behind a trail surrogate unit - * for a supplementary code point, then the macro will read - * the preceding lead surrogate as well. - * If the offset is behind a lead surrogate, then that itself - * will be returned as the code point. - * The result is undefined if the offset is behind a single, unpaired trail surrogate. - * - * @param s const UChar * string - * @param i string offset - * @param c output UChar32 variable - * @see U16_PREV - * @stable ICU 2.4 - */ -#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[--(i)]; \ - if(U16_IS_TRAIL(c)) { \ - (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The input offset may be the same as the string length. - * If the offset is behind a trail surrogate unit - * for a supplementary code point, then the macro will read - * the preceding lead surrogate as well. 
- * If the offset is behind a lead surrogate or behind a single, unpaired - * trail surrogate, then c is set to that unpaired surrogate. - * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<i - * @param c output UChar32 variable - * @see U16_PREV_UNSAFE - * @stable ICU 2.4 - */ -#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[--(i)]; \ - if(U16_IS_TRAIL(c)) { \ - uint16_t __c2; \ - if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ - --(i); \ - (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The input offset may be the same as the string length. - * If the offset is behind a trail surrogate unit - * for a supplementary code point, then the macro will read - * the preceding lead surrogate as well. - * If the offset is behind a lead surrogate or behind a single, unpaired - * trail surrogate, then c is set to U+FFFD. - * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<i - * @param c output UChar32 variable - * @see U16_PREV_UNSAFE - * @stable ICU 60 - */ -#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(s)[--(i)]; \ - if(U16_IS_SURROGATE(c)) { \ - uint16_t __c2; \ - if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ - --(i); \ - (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ - } else { \ - (c)=0xfffd; \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. 
- * "Unsafe" macro, assumes well-formed UTF-16. - * - * @param s const UChar * string - * @param i string offset - * @see U16_BACK_1 - * @stable ICU 2.4 - */ -#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_TRAIL((s)[--(i)])) { \ - --(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * @param s const UChar * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<i - * @see U16_BACK_1_UNSAFE - * @stable ICU 2.4 - */ -#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ - --(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the n-th one before it, - * i.e., move backward by n code points. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Unsafe" macro, assumes well-formed UTF-16. - * - * @param s const UChar * string - * @param i string offset - * @param n number of code points to skip - * @see U16_BACK_N - * @stable ICU 2.4 - */ -#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0) { \ - U16_BACK_1_UNSAFE(s, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the n-th one before it, - * i.e., move backward by n code points. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 
- * - * @param s const UChar * string - * @param start start of string - * @param i string offset, must be start<i - * @param n number of code points to skip - * @see U16_BACK_N_UNSAFE - * @stable ICU 2.4 - */ -#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0 && (i)>(start)) { \ - U16_BACK_1(s, start, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary after a code point. - * If the offset is behind the lead surrogate of a surrogate pair, - * then the offset is incremented. - * Otherwise, it is not modified. - * The input offset may be the same as the string length. - * "Unsafe" macro, assumes well-formed UTF-16. - * - * @param s const UChar * string - * @param i string offset - * @see U16_SET_CP_LIMIT - * @stable ICU 2.4 - */ -#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U16_IS_LEAD((s)[(i)-1])) { \ - ++(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary after a code point. - * If the offset is behind the lead surrogate of a surrogate pair, - * then the offset is incremented. - * Otherwise, it is not modified. - * The input offset may be the same as the string length. - * "Safe" macro, handles unpaired surrogates and checks for string boundaries. - * - * The length can be negative for a NUL-terminated string. 
- * - * @param s const UChar * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, start<=i<=length - * @param length int32_t string length - * @see U16_SET_CP_LIMIT_UNSAFE - * @stable ICU 2.4 - */ -#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ - if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \ - ++(i); \ - } \ -} UPRV_BLOCK_MACRO_END - -#endif diff --git a/src/tree_sitter/unicode/utf8.h b/src/tree_sitter/unicode/utf8.h deleted file mode 100644 index 3b37873e37..0000000000 --- a/src/tree_sitter/unicode/utf8.h +++ /dev/null @@ -1,881 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* -* Copyright (C) 1999-2015, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: utf8.h -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999sep13 -* created by: Markus W. Scherer -*/ - -/** - * \file - * \brief C API: 8-bit Unicode handling macros - * - * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. - * - * For more information see utf.h and the ICU User Guide Strings chapter - * (http://userguide.icu-project.org/strings). - * - * <em>Usage:</em> - * ICU coding guidelines for if() statements should be followed when using these macros. - * Compound statements (curly braces {}) must be used for if-else-while... - * bodies and all macro statements should be terminated with semicolon. 
- */ - -#ifndef __UTF8_H__ -#define __UTF8_H__ - -#include "./umachine.h" -#ifndef __UTF_H__ -# include "./utf.h" -#endif - -/* internal definitions ----------------------------------------------------- */ - -/** - * Counts the trail bytes for a UTF-8 lead byte. - * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. - * leadByte might be evaluated multiple times. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is called by public macros in this file and thus must remain stable. - * - * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. - * @internal - */ -#define U8_COUNT_TRAIL_BYTES(leadByte) \ - (U8_IS_LEAD(leadByte) ? \ - ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) - -/** - * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. - * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. - * leadByte might be evaluated multiple times. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is called by public macros in this file and thus must remain stable. - * - * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. - * @internal - */ -#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ - (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) - -/** - * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is called by public macros in this file and thus must remain stable. - * @internal - */ -#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) - -/** - * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. - * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. 
- * Lead byte E0..EF bits 3..0 are used as byte index, - * first trail byte bits 7..5 are used as bit index into that byte. - * @see U8_IS_VALID_LEAD3_AND_T1 - * @internal - */ -#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" - -/** - * Internal 3-byte UTF-8 validity check. - * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. - * @internal - */ -#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) - -/** - * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. - * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. - * First trail byte bits 7..4 are used as byte index, - * lead byte F0..F4 bits 2..0 are used as bit index into that byte. - * @see U8_IS_VALID_LEAD4_AND_T1 - * @internal - */ -#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" - -/** - * Internal 4-byte UTF-8 validity check. - * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. - * @internal - */ -#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) - -/** - * Function for handling "next code point" with error-checking. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this - * file and thus must remain stable, and should not be hidden when other internal - * functions are hidden (otherwise public macros would fail to compile). - * @internal - */ -U_STABLE UChar32 U_EXPORT2 -utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); - -/** - * Function for handling "append code point" with error-checking. 
- * - * This is internal since it is not meant to be called directly by external clients; - * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this - * file and thus must remain stable, and should not be hidden when other internal - * functions are hidden (otherwise public macros would fail to compile). - * @internal - */ -U_STABLE int32_t U_EXPORT2 -utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); - -/** - * Function for handling "previous code point" with error-checking. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this - * file and thus must remain stable, and should not be hidden when other internal - * functions are hidden (otherwise public macros would fail to compile). - * @internal - */ -U_STABLE UChar32 U_EXPORT2 -utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); - -/** - * Function for handling "skip backward one code point" with error-checking. - * - * This is internal since it is not meant to be called directly by external clients; - * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this - * file and thus must remain stable, and should not be hidden when other internal - * functions are hidden (otherwise public macros would fail to compile). - * @internal - */ -U_STABLE int32_t U_EXPORT2 -utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); - -/* single-code point definitions -------------------------------------------- */ - -/** - * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? - * @param c 8-bit code unit (byte) - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U8_IS_SINGLE(c) (((c)&0x80)==0) - -/** - * Is this code unit (byte) a UTF-8 lead byte? 
(0xC2..0xF4) - * @param c 8-bit code unit (byte) - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) -// 0x32=0xf4-0xc2 - -/** - * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) - * @param c 8-bit code unit (byte) - * @return TRUE or FALSE - * @stable ICU 2.4 - */ -#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) - -/** - * How many code units (bytes) are used for the UTF-8 encoding - * of this Unicode code point? - * @param c 32-bit code point - * @return 1..4, or 0 if c is a surrogate or not a Unicode code point - * @stable ICU 2.4 - */ -#define U8_LENGTH(c) \ - ((uint32_t)(c)<=0x7f ? 1 : \ - ((uint32_t)(c)<=0x7ff ? 2 : \ - ((uint32_t)(c)<=0xd7ff ? 3 : \ - ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ - ((uint32_t)(c)<=0xffff ? 3 : 4)\ - ) \ - ) \ - ) \ - ) - -/** - * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). - * @return 4 - * @stable ICU 2.4 - */ -#define U8_MAX_LENGTH 4 - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. - * The offset may point to either the lead byte or one of the trail bytes - * for a code point, in which case the macro will read all of the bytes - * for the code point. - * The result is undefined if the offset points to an illegal UTF-8 - * byte sequence. - * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. - * - * @param s const uint8_t * string - * @param i string offset - * @param c output UChar32 variable - * @see U8_GET - * @stable ICU 2.4 - */ -#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t _u8_get_unsafe_index=(int32_t)(i); \ - U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ - U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. 
- * The offset may point to either the lead byte or one of the trail bytes - * for a code point, in which case the macro will read all of the bytes - * for the code point. - * - * The length can be negative for a NUL-terminated string. - * - * If the offset points to an illegal UTF-8 byte sequence, then - * c is set to a negative value. - * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. - * - * @param s const uint8_t * string - * @param start int32_t starting string offset - * @param i int32_t string offset, must be start<=i<length - * @param length int32_t string length - * @param c output UChar32 variable, set to <0 in case of an error - * @see U8_GET_UNSAFE - * @stable ICU 2.4 - */ -#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t _u8_get_index=(i); \ - U8_SET_CP_START(s, start, _u8_get_index); \ - U8_NEXT(s, _u8_get_index, length, c); \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a random-access offset, - * without changing the offset. - * The offset may point to either the lead byte or one of the trail bytes - * for a code point, in which case the macro will read all of the bytes - * for the code point. - * - * The length can be negative for a NUL-terminated string. - * - * If the offset points to an illegal UTF-8 byte sequence, then - * c is set to U+FFFD. - * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. - * - * This macro does not distinguish between a real U+FFFD in the text - * and U+FFFD returned for an ill-formed sequence. - * Use U8_GET() if that distinction is important. 
- * - * @param s const uint8_t * string - * @param start int32_t starting string offset - * @param i int32_t string offset, must be start<=i<length - * @param length int32_t string length - * @param c output UChar32 variable, set to U+FFFD in case of an error - * @see U8_GET - * @stable ICU 51 - */ -#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t _u8_get_index=(i); \ - U8_SET_CP_START(s, start, _u8_get_index); \ - U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ -} UPRV_BLOCK_MACRO_END - -/* definitions with forward iteration --------------------------------------- */ - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Unsafe" macro, assumes well-formed UTF-8. - * - * The offset may point to the lead byte of a multi-byte sequence, - * in which case the macro will read the whole sequence. - * The result is undefined if the offset points to a trail byte - * or an illegal UTF-8 sequence. - * - * @param s const uint8_t * string - * @param i string offset - * @param c output UChar32 variable - * @see U8_NEXT - * @stable ICU 2.4 - */ -#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(uint8_t)(s)[(i)++]; \ - if(!U8_IS_SINGLE(c)) { \ - if((c)<0xe0) { \ - (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ - } else if((c)<0xf0) { \ - /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ - (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ - (i)+=2; \ - } else { \ - (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ - (i)+=3; \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. 
- * - * The length can be negative for a NUL-terminated string. - * - * The offset may point to the lead byte of a multi-byte sequence, - * in which case the macro will read the whole sequence. - * If the offset points to a trail byte or an illegal UTF-8 sequence, then - * c is set to a negative value. - * - * @param s const uint8_t * string - * @param i int32_t string offset, must be i<length - * @param length int32_t string length - * @param c output UChar32 variable, set to <0 in case of an error - * @see U8_NEXT_UNSAFE - * @stable ICU 2.4 - */ -#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL) - -/** - * Get a code point from a string at a code point boundary offset, - * and advance the offset to the next code point boundary. - * (Post-incrementing forward iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * The offset may point to the lead byte of a multi-byte sequence, - * in which case the macro will read the whole sequence. - * If the offset points to a trail byte or an illegal UTF-8 sequence, then - * c is set to U+FFFD. - * - * This macro does not distinguish between a real U+FFFD in the text - * and U+FFFD returned for an ill-formed sequence. - * Use U8_NEXT() if that distinction is important. - * - * @param s const uint8_t * string - * @param i int32_t string offset, must be i<length - * @param length int32_t string length - * @param c output UChar32 variable, set to U+FFFD in case of an error - * @see U8_NEXT - * @stable ICU 51 - */ -#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd) - -/** @internal */ -#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(uint8_t)(s)[(i)++]; \ - if(!U8_IS_SINGLE(c)) { \ - uint8_t __t = 0; \ - if((i)!=(length) && \ - /* fetch/validate/assemble all but last trail byte */ \ - ((c)>=0xe0 ? \ - ((c)<0xf0 ? 
/* U+0800..U+FFFF except surrogates */ \ - U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ - (__t&=0x3f, 1) \ - : /* U+10000..U+10FFFF */ \ - ((c)-=0xf0)<=4 && \ - U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ - ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ - (__t=(s)[i]-0x80)<=0x3f) && \ - /* valid second-to-last trail byte */ \ - ((c)=((c)<<6)|__t, ++(i)!=(length)) \ - : /* U+0080..U+07FF */ \ - (c)>=0xc2 && ((c)&=0x1f, 1)) && \ - /* last trail byte */ \ - (__t=(s)[i]-0x80)<=0x3f && \ - ((c)=((c)<<6)|__t, ++(i), 1)) { \ - } else { \ - (c)=(sub); /* ill-formed*/ \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Append a code point to a string, overwriting 1 to 4 bytes. - * The offset points to the current end of the string contents - * and is advanced (post-increment). - * "Unsafe" macro, assumes a valid code point and sufficient space in the string. - * Otherwise, the result is undefined. - * - * @param s const uint8_t * string buffer - * @param i string offset - * @param c code point to append - * @see U8_APPEND - * @stable ICU 2.4 - */ -#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - uint32_t __uc=(c); \ - if(__uc<=0x7f) { \ - (s)[(i)++]=(uint8_t)__uc; \ - } else { \ - if(__uc<=0x7ff) { \ - (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ - } else { \ - if(__uc<=0xffff) { \ - (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ - } else { \ - (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ - (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ - } \ - (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ - } \ - (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Append a code point to a string, overwriting 1 to 4 bytes. - * The offset points to the current end of the string contents - * and is advanced (post-increment). - * "Safe" macro, checks for a valid code point. - * If a non-ASCII code point is written, checks for sufficient space in the string. - * If the code point is not valid or trail bytes do not fit, - * then isError is set to TRUE. 
- * - * @param s const uint8_t * string buffer - * @param i int32_t string offset, must be i<capacity - * @param capacity int32_t size of the string buffer - * @param c UChar32 code point to append - * @param isError output UBool set to TRUE if an error occurs, otherwise not modified - * @see U8_APPEND_UNSAFE - * @stable ICU 2.4 - */ -#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \ - uint32_t __uc=(c); \ - if(__uc<=0x7f) { \ - (s)[(i)++]=(uint8_t)__uc; \ - } else if(__uc<=0x7ff && (i)+1<(capacity)) { \ - (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ - (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ - } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \ - (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ - (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ - (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ - } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \ - (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ - (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ - (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ - (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ - } else { \ - (isError)=TRUE; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the next. - * (Post-incrementing iteration.) - * "Unsafe" macro, assumes well-formed UTF-8. - * - * @param s const uint8_t * string - * @param i string offset - * @see U8_FWD_1 - * @stable ICU 2.4 - */ -#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the next. - * (Post-incrementing iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The length can be negative for a NUL-terminated string. 
- * - * @param s const uint8_t * string - * @param i int32_t string offset, must be i<length - * @param length int32_t string length - * @see U8_FWD_1_UNSAFE - * @stable ICU 2.4 - */ -#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \ - uint8_t __b=(s)[(i)++]; \ - if(U8_IS_LEAD(__b) && (i)!=(length)) { \ - uint8_t __t1=(s)[i]; \ - if((0xe0<=__b && __b<0xf0)) { \ - if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \ - ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ - ++(i); \ - } \ - } else if(__b<0xe0) { \ - if(U8_IS_TRAIL(__t1)) { \ - ++(i); \ - } \ - } else /* c>=0xf0 */ { \ - if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ - ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ - ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ - ++(i); \ - } \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the n-th next one, - * i.e., move forward by n code points. - * (Post-incrementing iteration.) - * "Unsafe" macro, assumes well-formed UTF-8. - * - * @param s const uint8_t * string - * @param i string offset - * @param n number of code points to skip - * @see U8_FWD_N - * @stable ICU 2.4 - */ -#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0) { \ - U8_FWD_1_UNSAFE(s, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Advance the string offset from one code point boundary to the n-th next one, - * i.e., move forward by n code points. - * (Post-incrementing iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The length can be negative for a NUL-terminated string. 
- * - * @param s const uint8_t * string - * @param i int32_t string offset, must be i<length - * @param length int32_t string length - * @param n number of code points to skip - * @see U8_FWD_N_UNSAFE - * @stable ICU 2.4 - */ -#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ - U8_FWD_1(s, i, length); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary - * at the start of a code point. - * If the offset points to a UTF-8 trail byte, - * then the offset is moved backward to the corresponding lead byte. - * Otherwise, it is not modified. - * "Unsafe" macro, assumes well-formed UTF-8. - * - * @param s const uint8_t * string - * @param i string offset - * @see U8_SET_CP_START - * @stable ICU 2.4 - */ -#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - while(U8_IS_TRAIL((s)[i])) { --(i); } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary - * at the start of a code point. - * If the offset points to a UTF-8 trail byte, - * then the offset is moved backward to the corresponding lead byte. - * Otherwise, it is not modified. - * - * "Safe" macro, checks for illegal sequences and for string boundaries. - * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. 
- * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, must be start<=i - * @see U8_SET_CP_START_UNSAFE - * @see U8_TRUNCATE_IF_INCOMPLETE - * @stable ICU 2.4 - */ -#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U8_IS_TRAIL((s)[(i)])) { \ - (i)=utf8_back1SafeBody(s, start, (i)); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * If the string ends with a UTF-8 byte sequence that is valid so far - * but incomplete, then reduce the length of the string to end before - * the lead byte of that incomplete sequence. - * For example, if the string ends with E1 80, the length is reduced by 2. - * - * In all other cases (the string ends with a complete sequence, or it is not - * possible for any further trail byte to extend the trailing sequence) - * the length remains unchanged. - * - * Useful for processing text split across multiple buffers - * (save the incomplete sequence for later) - * and for optimizing iteration - * (check for string length only once per character). - * - * "Safe" macro, checks for illegal sequences and for string boundaries. - * Unlike U8_SET_CP_START(), this macro never reads s[length]. - * - * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) - * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param length int32_t string length (usually start<=length) - * @see U8_SET_CP_START - * @stable ICU 61 - */ -#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \ - if((length)>(start)) { \ - uint8_t __b1=s[(length)-1]; \ - if(U8_IS_SINGLE(__b1)) { \ - /* common ASCII character */ \ - } else if(U8_IS_LEAD(__b1)) { \ - --(length); \ - } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ - uint8_t __b2=s[(length)-2]; \ - if(0xe0<=__b2 && __b2<=0xf4) { \ - if(__b2<0xf0 ? 
U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ - U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ - (length)-=2; \ - } \ - } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ - uint8_t __b3=s[(length)-3]; \ - if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ - (length)-=3; \ - } \ - } \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/* definitions with backward iteration -------------------------------------- */ - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Unsafe" macro, assumes well-formed UTF-8. - * - * The input offset may be the same as the string length. - * If the offset is behind a multi-byte sequence, then the macro will read - * the whole sequence. - * If the offset is behind a lead byte, then that itself - * will be returned as the code point. - * The result is undefined if the offset is behind an illegal UTF-8 sequence. - * - * @param s const uint8_t * string - * @param i string offset - * @param c output UChar32 variable - * @see U8_PREV - * @stable ICU 2.4 - */ -#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(uint8_t)(s)[--(i)]; \ - if(U8_IS_TRAIL(c)) { \ - uint8_t __b, __count=1, __shift=6; \ -\ - /* c is a trail byte */ \ - (c)&=0x3f; \ - for(;;) { \ - __b=(s)[--(i)]; \ - if(__b>=0xc0) { \ - U8_MASK_LEAD_BYTE(__b, __count); \ - (c)|=(UChar32)__b<<__shift; \ - break; \ - } else { \ - (c)|=(UChar32)(__b&0x3f)<<__shift; \ - ++__count; \ - __shift+=6; \ - } \ - } \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The input offset may be the same as the string length. - * If the offset is behind a multi-byte sequence, then the macro will read - * the whole sequence. 
- * If the offset is behind a lead byte, then that itself - * will be returned as the code point. - * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. - * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, must be start<i - * @param c output UChar32 variable, set to <0 in case of an error - * @see U8_PREV_UNSAFE - * @stable ICU 2.4 - */ -#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(uint8_t)(s)[--(i)]; \ - if(!U8_IS_SINGLE(c)) { \ - (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one - * and get the code point between them. - * (Pre-decrementing backward iteration.) - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The input offset may be the same as the string length. - * If the offset is behind a multi-byte sequence, then the macro will read - * the whole sequence. - * If the offset is behind a lead byte, then that itself - * will be returned as the code point. - * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. - * - * This macro does not distinguish between a real U+FFFD in the text - * and U+FFFD returned for an ill-formed sequence. - * Use U8_PREV() if that distinction is important. 
- * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, must be start<i - * @param c output UChar32 variable, set to U+FFFD in case of an error - * @see U8_PREV - * @stable ICU 51 - */ -#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ - (c)=(uint8_t)(s)[--(i)]; \ - if(!U8_IS_SINGLE(c)) { \ - (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Unsafe" macro, assumes well-formed UTF-8. - * - * @param s const uint8_t * string - * @param i string offset - * @see U8_BACK_1 - * @stable ICU 2.4 - */ -#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - while(U8_IS_TRAIL((s)[--(i)])) {} \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the previous one. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, must be start<i - * @see U8_BACK_1_UNSAFE - * @stable ICU 2.4 - */ -#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ - if(U8_IS_TRAIL((s)[--(i)])) { \ - (i)=utf8_back1SafeBody(s, start, (i)); \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the n-th one before it, - * i.e., move backward by n code points. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Unsafe" macro, assumes well-formed UTF-8. 
- * - * @param s const uint8_t * string - * @param i string offset - * @param n number of code points to skip - * @see U8_BACK_N - * @stable ICU 2.4 - */ -#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0) { \ - U8_BACK_1_UNSAFE(s, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Move the string offset from one code point boundary to the n-th one before it, - * i.e., move backward by n code points. - * (Pre-decrementing backward iteration.) - * The input offset may be the same as the string length. - * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * @param s const uint8_t * string - * @param start int32_t index of the start of the string - * @param i int32_t string offset, must be start<i - * @param n number of code points to skip - * @see U8_BACK_N_UNSAFE - * @stable ICU 2.4 - */ -#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \ - int32_t __N=(n); \ - while(__N>0 && (i)>(start)) { \ - U8_BACK_1(s, start, i); \ - --__N; \ - } \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary after a code point. - * If the offset is behind a partial multi-byte sequence, - * then the offset is incremented to behind the whole sequence. - * Otherwise, it is not modified. - * The input offset may be the same as the string length. - * "Unsafe" macro, assumes well-formed UTF-8. - * - * @param s const uint8_t * string - * @param i string offset - * @see U8_SET_CP_LIMIT - * @stable ICU 2.4 - */ -#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ - U8_BACK_1_UNSAFE(s, i); \ - U8_FWD_1_UNSAFE(s, i); \ -} UPRV_BLOCK_MACRO_END - -/** - * Adjust a random-access offset to a code point boundary after a code point. - * If the offset is behind a partial multi-byte sequence, - * then the offset is incremented to behind the whole sequence. - * Otherwise, it is not modified. - * The input offset may be the same as the string length. 
- * "Safe" macro, checks for illegal sequences and for string boundaries. - * - * The length can be negative for a NUL-terminated string. - * - * @param s const uint8_t * string - * @param start int32_t starting string offset (usually 0) - * @param i int32_t string offset, must be start<=i<=length - * @param length int32_t string length - * @see U8_SET_CP_LIMIT_UNSAFE - * @stable ICU 2.4 - */ -#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ - if((start)<(i) && ((i)<(length) || (length)<0)) { \ - U8_BACK_1(s, start, i); \ - U8_FWD_1(s, i, length); \ - } \ -} UPRV_BLOCK_MACRO_END - -#endif diff --git a/third-party/CMakeLists.txt b/third-party/CMakeLists.txt index 493d7aacdd..27fa05660a 100644 --- a/third-party/CMakeLists.txt +++ b/third-party/CMakeLists.txt @@ -42,6 +42,7 @@ option(USE_BUNDLED_LUV "Use the bundled version of luv." ${USE_BUNDLED}) # build it unless explicitly requested option(USE_BUNDLED_LUA "Use the bundled version of lua." OFF) option(USE_BUNDLED_TS_PARSERS "Use the bundled treesitter parsers." ${USE_BUNDLED}) +option(USE_BUNDLED_TS "Use the bundled treesitter runtime." ${USE_BUNDLED}) if(USE_BUNDLED AND MSVC) option(USE_BUNDLED_GETTEXT "Use the bundled version of gettext." 
ON) @@ -198,6 +199,9 @@ set(LIBICONV_SHA256 ccf536620a45458d26ba83887a983b96827001e92a13847b45e4925cc891 set(TREESITTER_C_URL https://github.com/tree-sitter/tree-sitter-c/archive/6002fcd.tar.gz) set(TREESITTER_C_SHA256 46f8d44fa886d9ddb92571bb6fa8b175992c8758eca749cb1217464e512b6e97) +set(TREESITTER_URL https://github.com/tree-sitter/tree-sitter/archive/0.16.9.zip) +set(TREESITTER_SHA256 63ef1f0cfde0f37f4f15803e9412863a397c5276dbc680e8fc917c9f6851ea9b) + if(USE_BUNDLED_UNIBILIUM) include(BuildUnibilium) endif() @@ -253,6 +257,10 @@ if(USE_BUNDLED_TS_PARSERS) include(BuildTreesitterParsers) endif() +if(USE_BUNDLED_TS) + include(BuildTreesitter) +endif() + if(WIN32) include(GetBinaryDeps) diff --git a/third-party/cmake/BuildTreesitter.cmake b/third-party/cmake/BuildTreesitter.cmake new file mode 100644 index 0000000000..3212d6ea08 --- /dev/null +++ b/third-party/cmake/BuildTreesitter.cmake @@ -0,0 +1,22 @@ +set(TS_CFLAGS "-O3 -Wall -Wextra") + +ExternalProject_Add(tree-sitter + PREFIX ${DEPS_BUILD_DIR} + URL ${TREESITTER_URL} + DOWNLOAD_DIR ${DEPS_DOWNLOAD_DIR}/tree-sitter + INSTALL_DIR ${DEPS_INSTALL_DIR} + DOWNLOAD_COMMAND ${CMAKE_COMMAND} + -DPREFIX=${DEPS_BUILD_DIR} + -DDOWNLOAD_DIR=${DEPS_DOWNLOAD_DIR}/tree-sitter + -DURL=${TREESITTER_URL} + -DEXPECTED_SHA256=${TREESITTER_SHA256} + -DTARGET=tree-sitter + -DUSE_EXISTING_SRC_DIR=${USE_EXISTING_SRC_DIR} + -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/DownloadAndExtractFile.cmake + BUILD_IN_SOURCE 1 + PATCH_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${MAKE_PRG} CC=${DEPS_C_COMPILER} CFLAGS=${TS_CFLAGS} + INSTALL_COMMAND ${MAKE_PRG} CC=${DEPS_C_COMPILER} PREFIX=${DEPS_INSTALL_DIR} install) + +list(APPEND THIRD_PARTY_DEPS tree-sitter) |