From 79bd8d2ab6cae1c0be3233a9a7551d0b7bcc5944 Mon Sep 17 00:00:00 2001 From: Björn Linse Date: Sat, 28 Sep 2019 18:41:49 +0200 Subject: tree-sitter: update vendored tree-sitter runtime tree-sitter/tree-sitter commit edb569310005c66838b7d69fa60850acac6abeee Included files are: lib/include/tree-sitter/*.h lib/src/*.[ch] lib/src/unicode/* LICENSE --- src/tree_sitter/query.c | 1450 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1450 insertions(+) create mode 100644 src/tree_sitter/query.c (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c new file mode 100644 index 0000000000..a5ce86032c --- /dev/null +++ b/src/tree_sitter/query.c @@ -0,0 +1,1450 @@ +#include "tree_sitter/api.h" +#include "./alloc.h" +#include "./array.h" +#include "./bits.h" +#include "./language.h" +#include "./point.h" +#include "./tree_cursor.h" +#include "./unicode.h" +#include + +/* + * Stream - A sequence of unicode characters derived from a UTF8 string. + * This struct is used in parsing queries from S-expressions. + */ +typedef struct { + const char *input; + const char *end; + int32_t next; + uint8_t next_size; +} Stream; + +/* + * QueryStep - A step in the process of matching a query. Each node within + * a query S-expression maps to one of these steps. An entire pattern is + * represented as a sequence of these steps. Fields: + * + * - `symbol` - The grammar symbol to match. A zero value represents the + * wildcard symbol, '*'. + * - `field` - The field name to match. A zero value means that a field name + * was not specified. + * - `capture_id` - An integer representing the name of the capture associated + * with this node in the pattern. A `NONE` value means this node is not + * captured in this pattern. + * - `depth` - The depth where this node occurs in the pattern. The root node + * of the pattern has depth zero. + */ +typedef struct { + TSSymbol symbol; + TSFieldId field; + uint16_t capture_id; + uint16_t depth: 15; + bool contains_captures: 1; +} QueryStep; + +/* + * Slice - A slice of an external array. Within a query, capture names, + * literal string values, and predicate step informations are stored in three + * contiguous arrays. Individual captures, string values, and predicates are + * represented as slices of these three arrays. + */ +typedef struct { + uint32_t offset; + uint32_t length; +} Slice; + +/* + * SymbolTable - a two-way mapping of strings to ids. + */ +typedef struct { + Array(char) characters; + Array(Slice) slices; +} SymbolTable; + +/* + * PatternEntry - The set of steps needed to match a particular pattern, + * represented as a slice of a shared array. These entries are stored in a + * 'pattern map' - a sorted array that makes it possible to efficiently lookup + * patterns based on the symbol for their first step. + */ +typedef struct { + uint16_t step_index; + uint16_t pattern_index; +} PatternEntry; + +/* + * QueryState - The state of an in-progress match of a particular pattern + * in a query. While executing, a `TSQueryCursor` must keep track of a number + * of possible in-progress matches. Each of those possible matches is + * represented as one of these states. + */ +typedef struct { + uint16_t start_depth; + uint16_t pattern_index; + uint16_t step_index; + uint16_t capture_count; + uint16_t capture_list_id; + uint16_t consumed_capture_count; + uint32_t id; +} QueryState; + +/* + * CaptureListPool - A collection of *lists* of captures. Each QueryState + * needs to maintain its own list of captures. They are all represented as + * slices of one shared array. The CaptureListPool keeps track of which + * parts of the shared array are currently in use by a QueryState. + */ +typedef struct { + Array(TSQueryCapture) list; + uint32_t usage_map; +} CaptureListPool; + +/* + * TSQuery - A tree query, compiled from a string of S-expressions. The query + * itself is immutable. The mutable state used in the process of executing the + * query is stored in a `TSQueryCursor`. + */ +struct TSQuery { + SymbolTable captures; + SymbolTable predicate_values; + Array(QueryStep) steps; + Array(PatternEntry) pattern_map; + Array(TSQueryPredicateStep) predicate_steps; + Array(Slice) predicates_by_pattern; + Array(uint32_t) start_bytes_by_pattern; + const TSLanguage *language; + uint16_t max_capture_count; + uint16_t wildcard_root_pattern_count; + TSSymbol *symbol_map; +}; + +/* + * TSQueryCursor - A stateful struct used to execute a query on a tree. + */ +struct TSQueryCursor { + const TSQuery *query; + TSTreeCursor cursor; + Array(QueryState) states; + Array(QueryState) finished_states; + CaptureListPool capture_list_pool; + uint32_t depth; + uint32_t start_byte; + uint32_t end_byte; + uint32_t next_state_id; + TSPoint start_point; + TSPoint end_point; + bool ascending; +}; + +static const TSQueryError PARENT_DONE = -1; +static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; +static const uint16_t NONE = UINT16_MAX; +static const TSSymbol WILDCARD_SYMBOL = 0; +static const uint16_t MAX_STATE_COUNT = 32; + +// #define LOG(...) fprintf(stderr, __VA_ARGS__) +#define LOG(...) + +/********** + * Stream + **********/ + +// Advance to the next unicode code point in the stream. +static bool stream_advance(Stream *self) { + self->input += self->next_size; + if (self->input < self->end) { + uint32_t size = ts_decode_utf8( + (const uint8_t *)self->input, + self->end - self->input, + &self->next + ); + if (size > 0) { + self->next_size = size; + return true; + } + } else { + self->next_size = 0; + self->next = '\0'; + } + return false; +} + +// Reset the stream to the given input position, represented as a pointer +// into the input string. +static void stream_reset(Stream *self, const char *input) { + self->input = input; + self->next_size = 0; + stream_advance(self); +} + +static Stream stream_new(const char *string, uint32_t length) { + Stream self = { + .next = 0, + .input = string, + .end = string + length, + }; + stream_advance(&self); + return self; +} + +static void stream_skip_whitespace(Stream *stream) { + for (;;) { + if (iswspace(stream->next)) { + stream_advance(stream); + } else if (stream->next == ';') { + // skip over comments + stream_advance(stream); + while (stream->next && stream->next != '\n') { + if (!stream_advance(stream)) break; + } + } else { + break; + } + } +} + +static bool stream_is_ident_start(Stream *stream) { + return iswalnum(stream->next) || stream->next == '_' || stream->next == '-'; +} + +static void stream_scan_identifier(Stream *stream) { + do { + stream_advance(stream); + } while ( + iswalnum(stream->next) || + stream->next == '_' || + stream->next == '-' || + stream->next == '.' || + stream->next == '?' || + stream->next == '!' + ); +} + +/****************** + * CaptureListPool + ******************/ + +static CaptureListPool capture_list_pool_new() { + return (CaptureListPool) { + .list = array_new(), + .usage_map = UINT32_MAX, + }; +} + +static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { + self->usage_map = UINT32_MAX; + uint32_t total_size = MAX_STATE_COUNT * list_size; + array_reserve(&self->list, total_size); + self->list.size = total_size; +} + +static void capture_list_pool_delete(CaptureListPool *self) { + array_delete(&self->list); +} + +static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { + return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; +} + +static bool capture_list_pool_is_empty(const CaptureListPool *self) { + return self->usage_map == 0; +} + +static uint16_t capture_list_pool_acquire(CaptureListPool *self) { + // In the usage_map bitmask, ones represent free lists, and zeros represent + // lists that are in use. A free list id can quickly be found by counting + // the leading zeros in the usage map. An id of zero corresponds to the + // highest-order bit in the bitmask. + uint16_t id = count_leading_zeros(self->usage_map); + if (id == 32) return NONE; + self->usage_map &= ~bitmask_for_index(id); + return id; +} + +static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + self->usage_map |= bitmask_for_index(id); +} + +/************** + * SymbolTable + **************/ + +static SymbolTable symbol_table_new() { + return (SymbolTable) { + .characters = array_new(), + .slices = array_new(), + }; +} + +static void symbol_table_delete(SymbolTable *self) { + array_delete(&self->characters); + array_delete(&self->slices); +} + +static int symbol_table_id_for_name( + const SymbolTable *self, + const char *name, + uint32_t length +) { + for (unsigned i = 0; i < self->slices.size; i++) { + Slice slice = self->slices.contents[i]; + if ( + slice.length == length && + !strncmp(&self->characters.contents[slice.offset], name, length) + ) return i; + } + return -1; +} + +static const char *symbol_table_name_for_id( + const SymbolTable *self, + uint16_t id, + uint32_t *length +) { + Slice slice = self->slices.contents[id]; + *length = slice.length; + return &self->characters.contents[slice.offset]; +} + +static uint16_t symbol_table_insert_name( + SymbolTable *self, + const char *name, + uint32_t length +) { + int id = symbol_table_id_for_name(self, name, length); + if (id >= 0) return (uint16_t)id; + Slice slice = { + .offset = self->characters.size, + .length = length, + }; + array_grow_by(&self->characters, length + 1); + memcpy(&self->characters.contents[slice.offset], name, length); + self->characters.contents[self->characters.size - 1] = 0; + array_push(&self->slices, slice); + return self->slices.size - 1; +} + +/********* + * Query + *********/ + +// The `pattern_map` contains a mapping from TSSymbol values to indices in the +// `steps` array. For a given syntax node, the `pattern_map` makes it possible +// to quickly find the starting steps of all of the patterns whose root matches +// that node. Each entry has two fields: a `pattern_index`, which identifies one +// of the patterns in the query, and a `step_index`, which indicates the start +// offset of that pattern's steps pattern within the `steps` array. +// +// The entries are sorted by the patterns' root symbols, and lookups use a +// binary search. This ensures that the cost of this initial lookup step +// scales logarithmically with the number of patterns in the query. +// +// This returns `true` if the symbol is present and `false` otherwise. +// If the symbol is not present `*result` is set to the index where the +// symbol should be inserted. +static inline bool ts_query__pattern_map_search( + const TSQuery *self, + TSSymbol needle, + uint32_t *result +) { + uint32_t base_index = self->wildcard_root_pattern_count; + uint32_t size = self->pattern_map.size - base_index; + if (size == 0) { + *result = base_index; + return false; + } + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = base_index + half_size; + TSSymbol mid_symbol = self->steps.contents[ + self->pattern_map.contents[mid_index].step_index + ].symbol; + if (needle > mid_symbol) base_index = mid_index; + size -= half_size; + } + + TSSymbol symbol = self->steps.contents[ + self->pattern_map.contents[base_index].step_index + ].symbol; + + if (needle > symbol) { + base_index++; + if (base_index < self->pattern_map.size) { + symbol = self->steps.contents[ + self->pattern_map.contents[base_index].step_index + ].symbol; + } + } + + *result = base_index; + return needle == symbol; +} + +// Insert a new pattern's start index into the pattern map, maintaining +// the pattern map's ordering invariant. +static inline void ts_query__pattern_map_insert( + TSQuery *self, + TSSymbol symbol, + uint32_t start_step_index +) { + uint32_t index; + ts_query__pattern_map_search(self, symbol, &index); + array_insert(&self->pattern_map, index, ((PatternEntry) { + .step_index = start_step_index, + .pattern_index = self->pattern_map.size, + })); +} + +static void ts_query__finalize_steps(TSQuery *self) { + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + uint32_t depth = step->depth; + if (step->capture_id != NONE) { + step->contains_captures = true; + } else { + step->contains_captures = false; + for (unsigned j = i + 1; j < self->steps.size; j++) { + QueryStep *s = &self->steps.contents[j]; + if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; + if (s->capture_id != NONE) step->contains_captures = true; + } + } + } +} + +// Parse a single predicate associated with a pattern, adding it to the +// query's internal `predicate_steps` array. Predicates are arbitrary +// S-expressions associated with a pattern which are meant to be handled at +// a higher level of abstraction, such as the Rust/JavaScript bindings. They +// can contain '@'-prefixed capture names, double-quoted strings, and bare +// symbols, which also represent strings. +static TSQueryError ts_query__parse_predicate( + TSQuery *self, + Stream *stream +) { + if (stream->next == ')') return PARENT_DONE; + if (stream->next != '(') return TSQueryErrorSyntax; + stream_advance(stream); + stream_skip_whitespace(stream); + + unsigned step_count = 0; + for (;;) { + if (stream->next == ')') { + stream_advance(stream); + stream_skip_whitespace(stream); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeDone, + .value_id = 0, + })); + break; + } + + // Parse an '@'-prefixed capture name + else if (stream->next == '@') { + stream_advance(stream); + + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + int capture_id = symbol_table_id_for_name( + &self->captures, + capture_name, + length + ); + if (capture_id == -1) { + stream_reset(stream, capture_name); + return TSQueryErrorCapture; + } + + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeCapture, + .value_id = capture_id, + })); + } + + // Parse a string literal + else if (stream->next == '"') { + stream_advance(stream); + + // Parse the string content + const char *string_content = stream->input; + while (stream->next != '"') { + if (stream->next == '\n' || !stream_advance(stream)) { + stream_reset(stream, string_content - 1); + return TSQueryErrorSyntax; + } + } + uint32_t length = stream->input - string_content; + + // Add a step for the node + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + string_content, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); + + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + } + + // Parse a bare symbol + else if (stream_is_ident_start(stream)) { + const char *symbol_start = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - symbol_start; + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + symbol_start, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); + } + + else { + return TSQueryErrorSyntax; + } + + step_count++; + stream_skip_whitespace(stream); + } + + return 0; +} + +// Read one S-expression pattern from the stream, and incorporate it into +// the query's internal state machine representation. For nested patterns, +// this function calls itself recursively. +static TSQueryError ts_query__parse_pattern( + TSQuery *self, + Stream *stream, + uint32_t depth, + uint32_t *capture_count +) { + uint16_t starting_step_index = self->steps.size; + + if (stream->next == 0) return TSQueryErrorSyntax; + + // Finish the parent S-expression + if (stream->next == ')') { + return PARENT_DONE; + } + + // Parse a parenthesized node expression + else if (stream->next == '(') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse a nested list, which represents a pattern followed by + // zero-or-more predicates. + if (stream->next == '(' && depth == 0) { + TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count); + if (e) return e; + + // Parse the predicates. + stream_skip_whitespace(stream); + for (;;) { + TSQueryError e = ts_query__parse_predicate(self, stream); + if (e == PARENT_DONE) { + stream_advance(stream); + stream_skip_whitespace(stream); + return 0; + } else if (e) { + return e; + } + } + } + + TSSymbol symbol; + + // Parse the wildcard symbol + if (stream->next == '*') { + symbol = WILDCARD_SYMBOL; + stream_advance(stream); + } + + // Parse a normal node name + else if (stream_is_ident_start(stream)) { + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + } else { + return TSQueryErrorSyntax; + } + + // Add a step for the node. + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = symbol, + .field = 0, + .capture_id = NONE, + .contains_captures = false, + })); + + // Parse the child patterns + stream_skip_whitespace(stream); + for (;;) { + TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count); + if (e == PARENT_DONE) { + stream_advance(stream); + break; + } else if (e) { + return e; + } + } + } + + // Parse a double-quoted anonymous leaf node expression + else if (stream->next == '"') { + stream_advance(stream); + + // Parse the string content + const char *string_content = stream->input; + while (stream->next != '"') { + if (!stream_advance(stream)) { + stream_reset(stream, string_content - 1); + return TSQueryErrorSyntax; + } + } + uint32_t length = stream->input - string_content; + + // Add a step for the node + TSSymbol symbol = ts_language_symbol_for_name( + self->language, + string_content, + length, + false + ); + if (!symbol) { + stream_reset(stream, string_content); + return TSQueryErrorNodeType; + } + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = symbol, + .field = 0, + .capture_id = NONE, + .contains_captures = false, + })); + + if (stream->next != '"') return TSQueryErrorSyntax; + stream_advance(stream); + } + + // Parse a field-prefixed pattern + else if (stream_is_ident_start(stream)) { + // Parse the field name + const char *field_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - field_name; + stream_skip_whitespace(stream); + + if (stream->next != ':') { + stream_reset(stream, field_name); + return TSQueryErrorSyntax; + } + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse the pattern + uint32_t step_index = self->steps.size; + TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count); + if (e == PARENT_DONE) return TSQueryErrorSyntax; + if (e) return e; + + // Add the field name to the first step of the pattern + TSFieldId field_id = ts_language_field_id_for_name( + self->language, + field_name, + length + ); + if (!field_id) { + stream->input = field_name; + return TSQueryErrorField; + } + self->steps.contents[step_index].field = field_id; + } + + // Parse a wildcard pattern + else if (stream->next == '*') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Add a step that matches any kind of node + array_push(&self->steps, ((QueryStep) { + .depth = depth, + .symbol = WILDCARD_SYMBOL, + .field = 0, + .contains_captures = false, + })); + } + + else { + return TSQueryErrorSyntax; + } + + stream_skip_whitespace(stream); + + // Parse an '@'-prefixed capture pattern + if (stream->next == '@') { + stream_advance(stream); + + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + uint16_t capture_id = symbol_table_insert_name( + &self->captures, + capture_name, + length + ); + self->steps.contents[starting_step_index].capture_id = capture_id; + (*capture_count)++; + + stream_skip_whitespace(stream); + } + + return 0; +} + +TSQuery *ts_query_new( + const TSLanguage *language, + const char *source, + uint32_t source_len, + uint32_t *error_offset, + TSQueryError *error_type +) { + TSSymbol *symbol_map; + if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { + symbol_map = NULL; + } else { + // Work around the fact that multiple symbols can currently be + // associated with the same name, due to "simple aliases". + // In the next language ABI version, this map will be contained + // in the language's `public_symbol_map` field. + uint32_t symbol_count = ts_language_symbol_count(language); + symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count); + for (unsigned i = 0; i < symbol_count; i++) { + const char *name = ts_language_symbol_name(language, i); + const TSSymbolType symbol_type = ts_language_symbol_type(language, i); + + symbol_map[i] = i; + + for (unsigned j = 0; j < i; j++) { + if (ts_language_symbol_type(language, j) == symbol_type) { + if (!strcmp(name, ts_language_symbol_name(language, j))) { + symbol_map[i] = j; + break; + } + } + } + } + } + + TSQuery *self = ts_malloc(sizeof(TSQuery)); + *self = (TSQuery) { + .steps = array_new(), + .pattern_map = array_new(), + .captures = symbol_table_new(), + .predicate_values = symbol_table_new(), + .predicate_steps = array_new(), + .predicates_by_pattern = array_new(), + .symbol_map = symbol_map, + .wildcard_root_pattern_count = 0, + .max_capture_count = 0, + .language = language, + }; + + // Parse all of the S-expressions in the given string. + Stream stream = stream_new(source, source_len); + stream_skip_whitespace(&stream); + uint32_t start_step_index; + while (stream.input < stream.end) { + start_step_index = self->steps.size; + uint32_t capture_count = 0; + array_push(&self->start_bytes_by_pattern, stream.input - source); + array_push(&self->predicates_by_pattern, ((Slice) { + .offset = self->predicate_steps.size, + .length = 0, + })); + *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count); + array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER })); + + // If any pattern could not be parsed, then report the error information + // and terminate. + if (*error_type) { + *error_offset = stream.input - source; + ts_query_delete(self); + return NULL; + } + + // Maintain a map that can look up patterns for a given root symbol. + ts_query__pattern_map_insert( + self, + self->steps.contents[start_step_index].symbol, + start_step_index + ); + if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { + self->wildcard_root_pattern_count++; + } + + // Keep track of the maximum number of captures in pattern, because + // that numer determines how much space is needed to store each capture + // list. + if (capture_count > self->max_capture_count) { + self->max_capture_count = capture_count; + } + } + + ts_query__finalize_steps(self); + return self; +} + +void ts_query_delete(TSQuery *self) { + if (self) { + array_delete(&self->steps); + array_delete(&self->pattern_map); + array_delete(&self->predicate_steps); + array_delete(&self->predicates_by_pattern); + array_delete(&self->start_bytes_by_pattern); + symbol_table_delete(&self->captures); + symbol_table_delete(&self->predicate_values); + ts_free(self->symbol_map); + ts_free(self); + } +} + +uint32_t ts_query_pattern_count(const TSQuery *self) { + return self->predicates_by_pattern.size; +} + +uint32_t ts_query_capture_count(const TSQuery *self) { + return self->captures.slices.size; +} + +uint32_t ts_query_string_count(const TSQuery *self) { + return self->predicate_values.slices.size; +} + +const char *ts_query_capture_name_for_id( + const TSQuery *self, + uint32_t index, + uint32_t *length +) { + return symbol_table_name_for_id(&self->captures, index, length); +} + +const char *ts_query_string_value_for_id( + const TSQuery *self, + uint32_t index, + uint32_t *length +) { + return symbol_table_name_for_id(&self->predicate_values, index, length); +} + +const TSQueryPredicateStep *ts_query_predicates_for_pattern( + const TSQuery *self, + uint32_t pattern_index, + uint32_t *step_count +) { + Slice slice = self->predicates_by_pattern.contents[pattern_index]; + *step_count = slice.length; + return &self->predicate_steps.contents[slice.offset]; +} + +uint32_t ts_query_start_byte_for_pattern( + const TSQuery *self, + uint32_t pattern_index +) { + return self->start_bytes_by_pattern.contents[pattern_index]; +} + +void ts_query_disable_capture( + TSQuery *self, + const char *name, + uint32_t length +) { + int id = symbol_table_id_for_name(&self->captures, name, length); + if (id != -1) { + for (unsigned i = 0; i < self->steps.size; i++) { + QueryStep *step = &self->steps.contents[i]; + if (step->capture_id == id) { + step->capture_id = NONE; + } + } + } + ts_query__finalize_steps(self); +} + +/*************** + * QueryCursor + ***************/ + +TSQueryCursor *ts_query_cursor_new() { + TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); + *self = (TSQueryCursor) { + .ascending = false, + .states = array_new(), + .finished_states = array_new(), + .capture_list_pool = capture_list_pool_new(), + .start_byte = 0, + .end_byte = UINT32_MAX, + .start_point = {0, 0}, + .end_point = POINT_MAX, + }; + array_reserve(&self->states, MAX_STATE_COUNT); + array_reserve(&self->finished_states, MAX_STATE_COUNT); + return self; +} + +void ts_query_cursor_delete(TSQueryCursor *self) { + array_delete(&self->states); + array_delete(&self->finished_states); + ts_tree_cursor_delete(&self->cursor); + capture_list_pool_delete(&self->capture_list_pool); + ts_free(self); +} + +void ts_query_cursor_exec( + TSQueryCursor *self, + const TSQuery *query, + TSNode node +) { + array_clear(&self->states); + array_clear(&self->finished_states); + ts_tree_cursor_reset(&self->cursor, node); + capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); + self->next_state_id = 0; + self->depth = 0; + self->ascending = false; + self->query = query; +} + +void ts_query_cursor_set_byte_range( + TSQueryCursor *self, + uint32_t start_byte, + uint32_t end_byte +) { + if (end_byte == 0) { + start_byte = 0; + end_byte = UINT32_MAX; + } + self->start_byte = start_byte; + self->end_byte = end_byte; +} + +void ts_query_cursor_set_point_range( + TSQueryCursor *self, + TSPoint start_point, + TSPoint end_point +) { + if (end_point.row == 0 && end_point.column == 0) { + start_point = POINT_ZERO; + end_point = POINT_MAX; + } + self->start_point = start_point; + self->end_point = end_point; +} + +// Search through all of the in-progress states, and find the captured +// node that occurs earliest in the document. +static bool ts_query_cursor__first_in_progress_capture( + TSQueryCursor *self, + uint32_t *state_index, + uint32_t *byte_offset, + uint32_t *pattern_index +) { + bool result = false; + for (unsigned i = 0; i < self->states.size; i++) { + const QueryState *state = &self->states.contents[i]; + if (state->capture_count > 0) { + const TSQueryCapture *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + uint32_t capture_byte = ts_node_start_byte(captures[0].node); + if ( + !result || + capture_byte < *byte_offset || + ( + capture_byte == *byte_offset && + state->pattern_index < *pattern_index + ) + ) { + result = true; + *state_index = i; + *byte_offset = capture_byte; + *pattern_index = state->pattern_index; + } + } + } + return result; +} + +static bool ts_query__cursor_add_state( + TSQueryCursor *self, + const PatternEntry *slice +) { + uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. + if (list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if (ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index + )) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + list_id = self->states.contents[state_index].capture_list_id; + array_erase(&self->states, state_index); + } else { + LOG(" too many finished states.\n"); + return false; + } + } + + LOG(" start state. pattern:%u\n", slice->pattern_index); + array_push(&self->states, ((QueryState) { + .capture_list_id = list_id, + .step_index = slice->step_index, + .pattern_index = slice->pattern_index, + .start_depth = self->depth, + .capture_count = 0, + .consumed_capture_count = 0, + })); + return true; +} + +static QueryState *ts_query__cursor_copy_state( + TSQueryCursor *self, + const QueryState *state +) { + uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); + if (new_list_id == NONE) return NULL; + array_push(&self->states, *state); + QueryState *new_state = array_back(&self->states); + new_state->capture_list_id = new_list_id; + TSQueryCapture *old_captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + TSQueryCapture *new_captures = capture_list_pool_get( + &self->capture_list_pool, + new_list_id + ); + memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); + return new_state; +} + +// Walk the tree, processing patterns until at least one pattern finishes, +// If one or more patterns finish, return `true` and store their states in the +// `finished_states` array. Multiple patterns can finish on the same node. If +// there are no more matches, return `false`. +static inline bool ts_query_cursor__advance(TSQueryCursor *self) { + do { + if (self->ascending) { + LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); + + // When leaving a node, remove any unfinished states whose next step + // needed to match something within that node. + uint32_t deleted_count = 0; + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + LOG( + " failed to match. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + deleted_count++; + } else if (deleted_count > 0) { + self->states.contents[i - deleted_count] = *state; + } + } + + self->states.size -= deleted_count; + + if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = false; + } else if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; + } else { + return self->finished_states.size > 0; + } + } else { + bool can_have_later_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = ts_tree_cursor_current_status( + &self->cursor, + &can_have_later_siblings, + &can_have_later_siblings_with_this_field + ); + TSNode node = ts_tree_cursor_current_node(&self->cursor); + TSSymbol symbol = ts_node_symbol(node); + if (symbol != ts_builtin_sym_error && self->query->symbol_map) { + symbol = self->query->symbol_map[symbol]; + } + + // If this node is before the selected range, then avoid descending + // into it. + if ( + ts_node_end_byte(node) <= self->start_byte || + point_lte(ts_node_end_point(node), self->start_point) + ) { + if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = true; + } + continue; + } + + // If this node is after the selected range, then stop walking. + if ( + self->end_byte <= ts_node_start_byte(node) || + point_lte(self->end_point, ts_node_start_point(node)) + ) return false; + + LOG( + "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n", + ts_node_type(node), + ts_language_field_name_for_id(self->query->language, field_id), + ts_node_start_point(node).row, + self->states.size, + self->finished_states.size, + can_have_later_siblings, + can_have_later_siblings_with_this_field + ); + + // Add new states for any patterns whose root node is a wildcard. + for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { + PatternEntry *slice = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[slice->step_index]; + + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if (step->field && field_id != step->field) continue; + if (!ts_query__cursor_add_state(self, slice)) break; + } + + // Add new states for any patterns whose root node matches this node. + unsigned i; + if (ts_query__pattern_map_search(self->query, symbol, &i)) { + PatternEntry *slice = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[slice->step_index]; + do { + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if (step->field && field_id != step->field) continue; + if (!ts_query__cursor_add_state(self, slice)) break; + + // Advance to the next pattern whose root node matches this node. + i++; + if (i == self->query->pattern_map.size) break; + slice = &self->query->pattern_map.contents[i]; + step = &self->query->steps.contents[slice->step_index]; + } while (step->symbol == symbol); + } + + // Update all of the in-progress states with current node. + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + // Check that the node matches all of the criteria for the next + // step of the pattern.if ( + if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; + + // Determine if this node matches this step of the pattern, and also + // if this node can have later siblings that match this step of the + // pattern. + bool node_does_match = !step->symbol || step->symbol == symbol; + bool later_sibling_can_match = can_have_later_siblings; + if (step->field) { + if (step->field == field_id) { + if (!can_have_later_siblings_with_this_field) { + later_sibling_can_match = false; + } + } else { + node_does_match = false; + } + } + + if (!node_does_match) { + if (!later_sibling_can_match) { + LOG( + " discard state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->states, i); + i--; + n--; + } + continue; + } + + // Some patterns can match their root node in multiple ways, + // capturing different children. If this pattern step could match + // later children within the same parent, then this query state + // cannot simply be updated in place. It must be split into two + // states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later + // siblings. + QueryState *next_state = state; + if ( + step->depth > 0 && + step->contains_captures && + later_sibling_can_match + ) { + QueryState *copy = ts_query__cursor_copy_state(self, state); + if (copy) { + LOG( + " split state. pattern:%u, step:%u\n", + copy->pattern_index, + copy->step_index + ); + next_state = copy; + } else { + LOG(" canot split state.\n"); + } + } + + LOG( + " advance state. pattern:%u, step:%u\n", + next_state->pattern_index, + next_state->step_index + ); + + // If the current node is captured in this pattern, add it to the + // capture list. + if (step->capture_id != NONE) { + LOG( + " capture node. pattern:%u, capture_id:%u\n", + next_state->pattern_index, + step->capture_id + ); + TSQueryCapture *capture_list = capture_list_pool_get( + &self->capture_list_pool, + next_state->capture_list_id + ); + capture_list[next_state->capture_count++] = (TSQueryCapture) { + node, + step->capture_id + }; + } + + // If the pattern is now done, then remove it from the list of + // in-progress states, and add it to the list of finished states. + next_state->step_index++; + QueryStep *next_step = step + 1; + if (next_step->depth == PATTERN_DONE_MARKER) { + LOG(" finish pattern %u\n", next_state->pattern_index); + + next_state->id = self->next_state_id++; + array_push(&self->finished_states, *next_state); + if (next_state == state) { + array_erase(&self->states, i); + i--; + n--; + } else { + self->states.size--; + } + } + } + + // Continue descending if possible. + if (ts_tree_cursor_goto_first_child(&self->cursor)) { + self->depth++; + } else { + self->ascending = true; + } + } + } while (self->finished_states.size == 0); + + return true; +} + +bool ts_query_cursor_next_match( + TSQueryCursor *self, + TSQueryMatch *match +) { + if (self->finished_states.size == 0) { + if (!ts_query_cursor__advance(self)) { + return false; + } + } + + QueryState *state = &self->finished_states.contents[0]; + match->id = state->id; + match->pattern_index = state->pattern_index; + match->capture_count = state->capture_count; + match->captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->finished_states, 0); + return true; +} + +void ts_query_cursor_remove_match( + TSQueryCursor *self, + uint32_t match_id +) { + for (unsigned i = 0; i < self->finished_states.size; i++) { + const QueryState *state = &self->finished_states.contents[i]; + if (state->id == match_id) { + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->finished_states, i); + return; + } + } +} + +bool ts_query_cursor_next_capture( + TSQueryCursor *self, + TSQueryMatch *match, + uint32_t *capture_index +) { + for (;;) { + // The goal here is to return captures in order, even though they may not + // be discovered in order, because patterns can overlap. If there are any + // finished patterns, then try to find one that contains a capture that + // is *definitely* before any capture in an *unfinished* pattern. + if (self->finished_states.size > 0) { + // First, identify the position of the earliest capture in an unfinished + // match. For a finished capture to be returned, it must be *before* + // this position. + uint32_t first_unfinished_capture_byte = UINT32_MAX; + uint32_t first_unfinished_pattern_index = UINT32_MAX; + uint32_t first_unfinished_state_index; + ts_query_cursor__first_in_progress_capture( + self, + &first_unfinished_state_index, + &first_unfinished_capture_byte, + &first_unfinished_pattern_index + ); + + // Find the earliest capture in a finished match. + int first_finished_state_index = -1; + uint32_t first_finished_capture_byte = first_unfinished_capture_byte; + uint32_t first_finished_pattern_index = first_unfinished_pattern_index; + for (unsigned i = 0; i < self->finished_states.size; i++) { + const QueryState *state = &self->finished_states.contents[i]; + if (state->capture_count > state->consumed_capture_count) { + const TSQueryCapture *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + uint32_t capture_byte = ts_node_start_byte( + captures[state->consumed_capture_count].node + ); + if ( + capture_byte < first_finished_capture_byte || + ( + capture_byte == first_finished_capture_byte && + state->pattern_index < first_finished_pattern_index + ) + ) { + first_finished_state_index = i; + first_finished_capture_byte = capture_byte; + first_finished_pattern_index = state->pattern_index; + } + } else { + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->finished_states, i); + i--; + } + } + + // If there is finished capture that is clearly before any unfinished + // capture, then return its match, and its capture index. Internally + // record the fact that the capture has been 'consumed'. + if (first_finished_state_index != -1) { + QueryState *state = &self->finished_states.contents[ + first_finished_state_index + ]; + match->id = state->id; + match->pattern_index = state->pattern_index; + match->capture_count = state->capture_count; + match->captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + *capture_index = state->consumed_capture_count; + state->consumed_capture_count++; + return true; + } + + if (capture_list_pool_is_empty(&self->capture_list_pool)) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + first_unfinished_state_index, + first_unfinished_pattern_index, + first_unfinished_capture_byte + ); + capture_list_pool_release( + &self->capture_list_pool, + self->states.contents[first_unfinished_state_index].capture_list_id + ); + array_erase(&self->states, first_unfinished_state_index); + } + } + + // If there are no finished matches that are ready to be returned, then + // continue finding more matches. + if (!ts_query_cursor__advance(self)) return false; + } +} + +#undef LOG -- cgit From c21511b2f48685461bf2655b28eff4434c91d449 Mon Sep 17 00:00:00 2001 From: Björn Linse Date: Sat, 21 Dec 2019 09:57:33 +0100 Subject: tree-sitter: fix prototypes (to be upstreamed) --- src/tree_sitter/query.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index a5ce86032c..2563325248 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -224,7 +224,7 @@ static void stream_scan_identifier(Stream *stream) { * CaptureListPool ******************/ -static CaptureListPool capture_list_pool_new() { +static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { .list = array_new(), .usage_map = UINT32_MAX, @@ -269,7 +269,7 @@ static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { * SymbolTable **************/ -static SymbolTable symbol_table_new() { +static SymbolTable symbol_table_new(void) { return (SymbolTable) { .characters = array_new(), .slices = array_new(), -- cgit From 3ce9b05653a71efe2eeaeca6b6132d1227c367b7 Mon Sep 17 00:00:00 2001 From: Björn Linse Date: Tue, 25 Feb 2020 13:03:40 +0100 Subject: treesitter: update vendored tree-sitter runtime tree-sitter/tree-sitter commit 6cb8d24de2d99c4c50c9a0fd1e719ca5b3abc87f Included files are: lib/include/tree-sitter/*.h lib/src/*.[ch] lib/src/unicode/* LICENSE --- src/tree_sitter/query.c | 218 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 154 insertions(+), 64 deletions(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index 2563325248..053882cf68 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -19,6 +19,8 @@ typedef struct { uint8_t next_size; } Stream; +#define MAX_STEP_CAPTURE_COUNT 4 + /* * QueryStep - A step in the process of matching a query. Each node within * a query S-expression maps to one of these steps. An entire pattern is @@ -37,9 +39,11 @@ typedef struct { typedef struct { TSSymbol symbol; TSFieldId field; - uint16_t capture_id; - uint16_t depth: 15; + uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; + uint16_t depth: 13; bool contains_captures: 1; + bool is_immediate: 1; + bool is_last: 1; } QueryStep; /* @@ -62,10 +66,12 @@ typedef struct { } SymbolTable; /* - * PatternEntry - The set of steps needed to match a particular pattern, - * represented as a slice of a shared array. These entries are stored in a - * 'pattern map' - a sorted array that makes it possible to efficiently lookup - * patterns based on the symbol for their first step. + * PatternEntry - Information about the starting point for matching a + * particular pattern, consisting of the index of the pattern within the query, + * and the index of the patter's first step in the shared `steps` array. These + * entries are stored in a 'pattern map' - a sorted array that makes it + * possible to efficiently lookup patterns based on the symbol for their first + * step. */ typedef struct { uint16_t step_index; @@ -140,6 +146,7 @@ static const TSQueryError PARENT_DONE = -1; static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; +static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1; static const uint16_t MAX_STATE_COUNT = 32; // #define LOG(...) fprintf(stderr, __VA_ARGS__) @@ -324,6 +331,49 @@ static uint16_t symbol_table_insert_name( return self->slices.size - 1; } +/************ + * QueryStep + ************/ + +static QueryStep query_step__new( + TSSymbol symbol, + uint16_t depth, + bool is_immediate +) { + return (QueryStep) { + .symbol = symbol, + .depth = depth, + .field = 0, + .capture_ids = {NONE, NONE, NONE, NONE}, + .contains_captures = false, + .is_immediate = is_immediate, + }; +} + +static void query_step__add_capture(QueryStep *self, uint16_t capture_id) { + for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { + if (self->capture_ids[i] == NONE) { + self->capture_ids[i] = capture_id; + break; + } + } +} + +static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) { + for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { + if (self->capture_ids[i] == capture_id) { + self->capture_ids[i] = NONE; + while (i + 1 < MAX_STEP_CAPTURE_COUNT) { + if (self->capture_ids[i + 1] == NONE) break; + self->capture_ids[i] = self->capture_ids[i + 1]; + self->capture_ids[i + 1] = NONE; + i++; + } + break; + } + } +} + /********* * Query *********/ @@ -333,7 +383,7 @@ static uint16_t symbol_table_insert_name( // to quickly find the starting steps of all of the patterns whose root matches // that node. Each entry has two fields: a `pattern_index`, which identifies one // of the patterns in the query, and a `step_index`, which indicates the start -// offset of that pattern's steps pattern within the `steps` array. +// offset of that pattern's steps within the `steps` array. // // The entries are sorted by the patterns' root symbols, and lookups use a // binary search. This ensures that the cost of this initial lookup step @@ -399,14 +449,14 @@ static void ts_query__finalize_steps(TSQuery *self) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; uint32_t depth = step->depth; - if (step->capture_id != NONE) { + if (step->capture_ids[0] != NONE) { step->contains_captures = true; } else { step->contains_captures = false; for (unsigned j = i + 1; j < self->steps.size; j++) { QueryStep *s = &self->steps.contents[j]; if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; - if (s->capture_id != NONE) step->contains_captures = true; + if (s->capture_ids[0] != NONE) step->contains_captures = true; } } } @@ -533,7 +583,8 @@ static TSQueryError ts_query__parse_pattern( TSQuery *self, Stream *stream, uint32_t depth, - uint32_t *capture_count + uint32_t *capture_count, + bool is_immediate ) { uint16_t starting_step_index = self->steps.size; @@ -552,7 +603,7 @@ static TSQueryError ts_query__parse_pattern( // Parse a nested list, which represents a pattern followed by // zero-or-more predicates. if (stream->next == '(' && depth == 0) { - TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count); + TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count, is_immediate); if (e) return e; // Parse the predicates. @@ -573,7 +624,7 @@ static TSQueryError ts_query__parse_pattern( // Parse the wildcard symbol if (stream->next == '*') { - symbol = WILDCARD_SYMBOL; + symbol = NAMED_WILDCARD_SYMBOL; stream_advance(stream); } @@ -597,24 +648,37 @@ static TSQueryError ts_query__parse_pattern( } // Add a step for the node. - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = symbol, - .field = 0, - .capture_id = NONE, - .contains_captures = false, - })); + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); // Parse the child patterns stream_skip_whitespace(stream); + bool child_is_immediate = false; + uint16_t child_start_step_index = self->steps.size; for (;;) { - TSQueryError e = ts_query__parse_pattern(self, stream, depth + 1, capture_count); + if (stream->next == '.') { + child_is_immediate = true; + stream_advance(stream); + stream_skip_whitespace(stream); + } + + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth + 1, + capture_count, + child_is_immediate + ); if (e == PARENT_DONE) { + if (child_is_immediate) { + self->steps.contents[child_start_step_index].is_last = true; + } stream_advance(stream); break; } else if (e) { return e; } + + child_is_immediate = false; } } @@ -643,13 +707,7 @@ static TSQueryError ts_query__parse_pattern( stream_reset(stream, string_content); return TSQueryErrorNodeType; } - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = symbol, - .field = 0, - .capture_id = NONE, - .contains_captures = false, - })); + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); if (stream->next != '"') return TSQueryErrorSyntax; stream_advance(stream); @@ -672,7 +730,13 @@ static TSQueryError ts_query__parse_pattern( // Parse the pattern uint32_t step_index = self->steps.size; - TSQueryError e = ts_query__parse_pattern(self, stream, depth, capture_count); + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + capture_count, + is_immediate + ); if (e == PARENT_DONE) return TSQueryErrorSyntax; if (e) return e; @@ -695,12 +759,7 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); // Add a step that matches any kind of node - array_push(&self->steps, ((QueryStep) { - .depth = depth, - .symbol = WILDCARD_SYMBOL, - .field = 0, - .contains_captures = false, - })); + array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); } else { @@ -710,7 +769,7 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); // Parse an '@'-prefixed capture pattern - if (stream->next == '@') { + while (stream->next == '@') { stream_advance(stream); // Parse the capture name @@ -725,7 +784,8 @@ static TSQueryError ts_query__parse_pattern( capture_name, length ); - self->steps.contents[starting_step_index].capture_id = capture_id; + QueryStep *step = &self->steps.contents[starting_step_index]; + query_step__add_capture(step, capture_id); (*capture_count)++; stream_skip_whitespace(stream); @@ -794,8 +854,8 @@ TSQuery *ts_query_new( .offset = self->predicate_steps.size, .length = 0, })); - *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count); - array_push(&self->steps, ((QueryStep) { .depth = PATTERN_DONE_MARKER })); + *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count, false); + array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false)); // If any pattern could not be parsed, then report the error information // and terminate. @@ -891,16 +951,31 @@ void ts_query_disable_capture( const char *name, uint32_t length ) { + // Remove capture information for any pattern step that previously + // captured with the given name. int id = symbol_table_id_for_name(&self->captures, name, length); if (id != -1) { for (unsigned i = 0; i < self->steps.size; i++) { QueryStep *step = &self->steps.contents[i]; - if (step->capture_id == id) { - step->capture_id = NONE; - } + query_step__remove_capture(step, id); + } + ts_query__finalize_steps(self); + } +} + +void ts_query_disable_pattern( + TSQuery *self, + uint32_t pattern_index +) { + // Remove the given pattern from the pattern map. Its steps will still + // be in the `steps` array, but they will never be read. + for (unsigned i = 0; i < self->pattern_map.size; i++) { + PatternEntry *pattern = &self->pattern_map.contents[i]; + if (pattern->pattern_index == pattern_index) { + array_erase(&self->pattern_map, i); + i--; } } - ts_query__finalize_steps(self); } /*************** @@ -1010,7 +1085,7 @@ static bool ts_query_cursor__first_in_progress_capture( static bool ts_query__cursor_add_state( TSQueryCursor *self, - const PatternEntry *slice + const PatternEntry *pattern ) { uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); @@ -1037,11 +1112,11 @@ static bool ts_query__cursor_add_state( } } - LOG(" start state. pattern:%u\n", slice->pattern_index); + LOG(" start state. pattern:%u\n", pattern->pattern_index); array_push(&self->states, ((QueryState) { .capture_list_id = list_id, - .step_index = slice->step_index, - .pattern_index = slice->pattern_index, + .step_index = pattern->step_index, + .pattern_index = pattern->pattern_index, .start_depth = self->depth, .capture_count = 0, .consumed_capture_count = 0, @@ -1113,15 +1188,16 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { return self->finished_states.size > 0; } } else { - bool can_have_later_siblings; + bool has_later_siblings; bool can_have_later_siblings_with_this_field; TSFieldId field_id = ts_tree_cursor_current_status( &self->cursor, - &can_have_later_siblings, + &has_later_siblings, &can_have_later_siblings_with_this_field ); TSNode node = ts_tree_cursor_current_node(&self->cursor); TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); if (symbol != ts_builtin_sym_error && self->query->symbol_map) { symbol = self->query->symbol_map[symbol]; } @@ -1145,43 +1221,46 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ) return false; LOG( - "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u, can_have_later_siblings:%d, can_have_later_siblings_with_this_field:%d\n", + "enter node. " + "type:%s, field:%s, row:%u state_count:%u, " + "finished_state_count:%u, has_later_siblings:%d, " + "can_have_later_siblings_with_this_field:%d\n", ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, self->finished_states.size, - can_have_later_siblings, + has_later_siblings, can_have_later_siblings_with_this_field ); // Add new states for any patterns whose root node is a wildcard. for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *slice = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[slice->step_index]; + PatternEntry *pattern = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[pattern->step_index]; // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, slice)) break; + if (!ts_query__cursor_add_state(self, pattern)) break; } // Add new states for any patterns whose root node matches this node. unsigned i; if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *slice = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[slice->step_index]; + PatternEntry *pattern = &self->query->pattern_map.contents[i]; + QueryStep *step = &self->query->steps.contents[pattern->step_index]; do { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, slice)) break; + if (!ts_query__cursor_add_state(self, pattern)) break; // Advance to the next pattern whose root node matches this node. i++; if (i == self->query->pattern_map.size) break; - slice = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[slice->step_index]; + pattern = &self->query->pattern_map.contents[i]; + step = &self->query->steps.contents[pattern->step_index]; } while (step->symbol == symbol); } @@ -1191,14 +1270,23 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { QueryStep *step = &self->query->steps.contents[state->step_index]; // Check that the node matches all of the criteria for the next - // step of the pattern.if ( + // step of the pattern. if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; // Determine if this node matches this step of the pattern, and also // if this node can have later siblings that match this step of the // pattern. - bool node_does_match = !step->symbol || step->symbol == symbol; - bool later_sibling_can_match = can_have_later_siblings; + bool node_does_match = + step->symbol == symbol || + step->symbol == WILDCARD_SYMBOL || + (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); + bool later_sibling_can_match = has_later_siblings; + if (step->is_immediate && is_named) { + later_sibling_can_match = false; + } + if (step->is_last && has_later_siblings) { + node_does_match = false; + } if (step->field) { if (step->field == field_id) { if (!can_have_later_siblings_with_this_field) { @@ -1261,11 +1349,13 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If the current node is captured in this pattern, add it to the // capture list. - if (step->capture_id != NONE) { + for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { + uint16_t capture_id = step->capture_ids[j]; + if (step->capture_ids[j] == NONE) break; LOG( " capture node. pattern:%u, capture_id:%u\n", next_state->pattern_index, - step->capture_id + capture_id ); TSQueryCapture *capture_list = capture_list_pool_get( &self->capture_list_pool, @@ -1273,7 +1363,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); capture_list[next_state->capture_count++] = (TSQueryCapture) { node, - step->capture_id + capture_id }; } -- cgit From 727040c9530c6bca1b2d9ce70a5c968bef576469 Mon Sep 17 00:00:00 2001 From: Thomas Vigouroux Date: Wed, 15 Apr 2020 16:48:10 +0200 Subject: treesitter: update vendor code Update treesitter vendor code to commit 35f82ce301951315e08de3b7e44a18c9170b28b8 --- src/tree_sitter/query.c | 386 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 283 insertions(+), 103 deletions(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index 053882cf68..92a8006179 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -35,15 +35,21 @@ typedef struct { * captured in this pattern. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. + * - `repeat_step_index` - If this step is part of a repetition, the index of + * the beginning of the repetition. A `NONE` value means this step is not + * part of a repetition. */ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t depth: 13; + uint16_t repeat_step_index; + uint16_t depth: 11; bool contains_captures: 1; + bool is_pattern_start: 1; bool is_immediate: 1; bool is_last: 1; + bool is_repeated: 1; } QueryStep; /* @@ -85,23 +91,27 @@ typedef struct { * represented as one of these states. */ typedef struct { + uint32_t id; uint16_t start_depth; uint16_t pattern_index; uint16_t step_index; - uint16_t capture_count; - uint16_t capture_list_id; uint16_t consumed_capture_count; - uint32_t id; + uint16_t repeat_match_count; + uint16_t step_index_on_failure; + uint8_t capture_list_id; + bool seeking_non_match; } QueryState; +typedef Array(TSQueryCapture) CaptureList; + /* * CaptureListPool - A collection of *lists* of captures. Each QueryState - * needs to maintain its own list of captures. They are all represented as - * slices of one shared array. The CaptureListPool keeps track of which - * parts of the shared array are currently in use by a QueryState. + * needs to maintain its own list of captures. To avoid repeated allocations, + * the reuses a fixed set of capture lists, and keeps track of which ones + * are currently in use. */ typedef struct { - Array(TSQueryCapture) list; + CaptureList list[32]; uint32_t usage_map; } CaptureListPool; @@ -119,7 +129,6 @@ struct TSQuery { Array(Slice) predicates_by_pattern; Array(uint32_t) start_bytes_by_pattern; const TSLanguage *language; - uint16_t max_capture_count; uint16_t wildcard_root_pattern_count; TSSymbol *symbol_map; }; @@ -233,24 +242,25 @@ static void stream_scan_identifier(Stream *stream) { static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { - .list = array_new(), .usage_map = UINT32_MAX, }; } -static void capture_list_pool_reset(CaptureListPool *self, uint16_t list_size) { +static void capture_list_pool_reset(CaptureListPool *self) { self->usage_map = UINT32_MAX; - uint32_t total_size = MAX_STATE_COUNT * list_size; - array_reserve(&self->list, total_size); - self->list.size = total_size; + for (unsigned i = 0; i < 32; i++) { + array_clear(&self->list[i]); + } } static void capture_list_pool_delete(CaptureListPool *self) { - array_delete(&self->list); + for (unsigned i = 0; i < 32; i++) { + array_delete(&self->list[i]); + } } -static TSQueryCapture *capture_list_pool_get(CaptureListPool *self, uint16_t id) { - return &self->list.contents[id * (self->list.size / MAX_STATE_COUNT)]; +static CaptureList *capture_list_pool_get(CaptureListPool *self, uint16_t id) { + return &self->list[id]; } static bool capture_list_pool_is_empty(const CaptureListPool *self) { @@ -269,6 +279,7 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + array_clear(&self->list[id]); self->usage_map |= bitmask_for_index(id); } @@ -331,6 +342,67 @@ static uint16_t symbol_table_insert_name( return self->slices.size - 1; } +static uint16_t symbol_table_insert_name_with_escapes( + SymbolTable *self, + const char *escaped_name, + uint32_t escaped_length +) { + Slice slice = { + .offset = self->characters.size, + .length = 0, + }; + array_grow_by(&self->characters, escaped_length + 1); + + // Copy the contents of the literal into the characters buffer, processing escape + // sequences like \n and \". This needs to be done before checking if the literal + // is already present, in order to do the string comparison. + bool is_escaped = false; + for (unsigned i = 0; i < escaped_length; i++) { + const char *src = &escaped_name[i]; + char *dest = &self->characters.contents[slice.offset + slice.length]; + if (is_escaped) { + switch (*src) { + case 'n': + *dest = '\n'; + break; + case 'r': + *dest = '\r'; + break; + case 't': + *dest = '\t'; + break; + case '0': + *dest = '\0'; + break; + default: + *dest = *src; + break; + } + is_escaped = false; + slice.length++; + } else { + if (*src == '\\') { + is_escaped = true; + } else { + *dest = *src; + slice.length++; + } + } + } + + // If the string is already present, remove the redundant content from the characters + // buffer and return the existing id. + int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length); + if (id >= 0) { + self->characters.size -= (escaped_length + 1); + return id; + } + + self->characters.contents[slice.offset + slice.length] = 0; + array_push(&self->slices, slice); + return self->slices.size - 1; +} + /************ * QueryStep ************/ @@ -346,7 +418,11 @@ static QueryStep query_step__new( .field = 0, .capture_ids = {NONE, NONE, NONE, NONE}, .contains_captures = false, + .is_repeated = false, + .is_last = false, + .is_pattern_start = false, .is_immediate = is_immediate, + .repeat_step_index = NONE, }; } @@ -523,9 +599,22 @@ static TSQueryError ts_query__parse_predicate( stream_advance(stream); // Parse the string content + bool is_escaped = false; const char *string_content = stream->input; - while (stream->next != '"') { - if (stream->next == '\n' || !stream_advance(stream)) { + for (;;) { + if (is_escaped) { + is_escaped = false; + } else { + if (stream->next == '\\') { + is_escaped = true; + } else if (stream->next == '"') { + break; + } else if (stream->next == '\n') { + stream_reset(stream, string_content - 1); + return TSQueryErrorSyntax; + } + } + if (!stream_advance(stream)) { stream_reset(stream, string_content - 1); return TSQueryErrorSyntax; } @@ -533,7 +622,7 @@ static TSQueryError ts_query__parse_predicate( uint32_t length = stream->input - string_content; // Add a step for the node - uint16_t id = symbol_table_insert_name( + uint16_t id = symbol_table_insert_name_with_escapes( &self->predicate_values, string_content, length @@ -624,7 +713,7 @@ static TSQueryError ts_query__parse_pattern( // Parse the wildcard symbol if (stream->next == '*') { - symbol = NAMED_WILDCARD_SYMBOL; + symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; stream_advance(stream); } @@ -768,27 +857,43 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); - // Parse an '@'-prefixed capture pattern - while (stream->next == '@') { - stream_advance(stream); + // Parse suffixes modifiers for this pattern + for (;;) { + QueryStep *step = &self->steps.contents[starting_step_index]; - // Parse the capture name - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; + if (stream->next == '+') { + stream_advance(stream); + step->is_repeated = true; + array_back(&self->steps)->repeat_step_index = starting_step_index; + stream_skip_whitespace(stream); + } - // Add the capture id to the first step of the pattern - uint16_t capture_id = symbol_table_insert_name( - &self->captures, - capture_name, - length - ); - QueryStep *step = &self->steps.contents[starting_step_index]; - query_step__add_capture(step, capture_id); - (*capture_count)++; + // Parse an '@'-prefixed capture pattern + else if (stream->next == '@') { + stream_advance(stream); - stream_skip_whitespace(stream); + // Parse the capture name + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *capture_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - capture_name; + + // Add the capture id to the first step of the pattern + uint16_t capture_id = symbol_table_insert_name( + &self->captures, + capture_name, + length + ); + query_step__add_capture(step, capture_id); + (*capture_count)++; + + stream_skip_whitespace(stream); + } + + // No more suffix modifiers + else { + break; + } } return 0; @@ -838,16 +943,14 @@ TSQuery *ts_query_new( .predicates_by_pattern = array_new(), .symbol_map = symbol_map, .wildcard_root_pattern_count = 0, - .max_capture_count = 0, .language = language, }; // Parse all of the S-expressions in the given string. Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); - uint32_t start_step_index; while (stream.input < stream.end) { - start_step_index = self->steps.size; + uint32_t start_step_index = self->steps.size; uint32_t capture_count = 0; array_push(&self->start_bytes_by_pattern, stream.input - source); array_push(&self->predicates_by_pattern, ((Slice) { @@ -865,7 +968,19 @@ TSQuery *ts_query_new( return NULL; } + // If a pattern has a wildcard at its root, optimize the matching process + // by skipping matching the wildcard. + if ( + self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL + ) { + QueryStep *second_step = &self->steps.contents[start_step_index + 1]; + if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth != PATTERN_DONE_MARKER) { + start_step_index += 1; + } + } + // Maintain a map that can look up patterns for a given root symbol. + self->steps.contents[start_step_index].is_pattern_start = true; ts_query__pattern_map_insert( self, self->steps.contents[start_step_index].symbol, @@ -874,13 +989,6 @@ TSQuery *ts_query_new( if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; } - - // Keep track of the maximum number of captures in pattern, because - // that numer determines how much space is needed to store each capture - // list. - if (capture_count > self->max_capture_count) { - self->max_capture_count = capture_count; - } } ts_query__finalize_steps(self); @@ -1015,7 +1123,7 @@ void ts_query_cursor_exec( array_clear(&self->states); array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); - capture_list_pool_reset(&self->capture_list_pool, query->max_capture_count); + capture_list_pool_reset(&self->capture_list_pool); self->next_state_id = 0; self->depth = 0; self->ascending = false; @@ -1059,12 +1167,12 @@ static bool ts_query_cursor__first_in_progress_capture( bool result = false; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; - if (state->capture_count > 0) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - uint32_t capture_byte = ts_node_start_byte(captures[0].node); + const CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + if (captures->size > 0) { + uint32_t capture_byte = ts_node_start_byte(captures->contents[0].node); if ( !result || capture_byte < *byte_offset || @@ -1087,6 +1195,19 @@ static bool ts_query__cursor_add_state( TSQueryCursor *self, const PatternEntry *pattern ) { + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + + // If this pattern begins with a repetition, then avoid creating + // new states after already matching the repetition one or more times. + // The query should only one match for the repetition - the one that + // started the earliest. + if (step->is_repeated) { + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + if (state->step_index == pattern->step_index) return true; + } + } + uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); // If there are no capture lists left in the pool, then terminate whichever @@ -1112,14 +1233,20 @@ static bool ts_query__cursor_add_state( } } - LOG(" start state. pattern:%u\n", pattern->pattern_index); + LOG( + " start state. pattern:%u, step:%u\n", + pattern->pattern_index, + pattern->step_index + ); array_push(&self->states, ((QueryState) { .capture_list_id = list_id, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, - .start_depth = self->depth, - .capture_count = 0, + .start_depth = self->depth - step->depth, .consumed_capture_count = 0, + .repeat_match_count = 0, + .step_index_on_failure = NONE, + .seeking_non_match = false, })); return true; } @@ -1133,15 +1260,15 @@ static QueryState *ts_query__cursor_copy_state( array_push(&self->states, *state); QueryState *new_state = array_back(&self->states); new_state->capture_list_id = new_list_id; - TSQueryCapture *old_captures = capture_list_pool_get( + CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); - TSQueryCapture *new_captures = capture_list_pool_get( + CaptureList *new_captures = capture_list_pool_get( &self->capture_list_pool, new_list_id ); - memcpy(new_captures, old_captures, state->capture_count * sizeof(TSQueryCapture)); + array_push_all(new_captures, old_captures); return new_state; } @@ -1298,6 +1425,24 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } if (!node_does_match) { + // If this QueryState has processed a repeating sequence, and that repeating + // sequence has ended, move on to the *next* step of this state's pattern. + if ( + state->step_index_on_failure != NONE && + (!later_sibling_can_match || step->is_repeated) + ) { + LOG( + " finish repetition state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + state->step_index = state->step_index_on_failure; + state->step_index_on_failure = NONE; + state->repeat_match_count = 0; + i--; + continue; + } + if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", @@ -1312,9 +1457,17 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { i--; n--; } + + state->seeking_non_match = false; continue; } + // The `seeking_non_match` flag indicates that a previous QueryState + // has already begun processing this repeating sequence, so that *this* + // QueryState should not begin matching until a separate repeating sequence + // is found. + if (state->seeking_non_match) continue; + // Some patterns can match their root node in multiple ways, // capturing different children. If this pattern step could match // later children within the same parent, then this query state @@ -1324,11 +1477,20 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // siblings. QueryState *next_state = state; if ( - step->depth > 0 && + !step->is_pattern_start && step->contains_captures && - later_sibling_can_match + later_sibling_can_match && + state->repeat_match_count == 0 ) { QueryState *copy = ts_query__cursor_copy_state(self, state); + + // The QueryState that matched this node has begun matching a repeating + // sequence. The QueryState that *skipped* this node should not start + // matching later elements of the same repeating sequence. + if (step->is_repeated) { + state->seeking_non_match = true; + } + if (copy) { LOG( " split state. pattern:%u, step:%u\n", @@ -1337,55 +1499,71 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); next_state = copy; } else { - LOG(" canot split state.\n"); + LOG(" cannot split state.\n"); } } - LOG( - " advance state. pattern:%u, step:%u\n", - next_state->pattern_index, - next_state->step_index - ); - // If the current node is captured in this pattern, add it to the // capture list. for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; - LOG( - " capture node. pattern:%u, capture_id:%u\n", - next_state->pattern_index, - capture_id - ); - TSQueryCapture *capture_list = capture_list_pool_get( + CaptureList *capture_list = capture_list_pool_get( &self->capture_list_pool, next_state->capture_list_id ); - capture_list[next_state->capture_count++] = (TSQueryCapture) { + array_push(capture_list, ((TSQueryCapture) { node, capture_id - }; + })); + LOG( + " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", + next_state->pattern_index, + capture_id, + capture_list->size + ); } - // If the pattern is now done, then remove it from the list of - // in-progress states, and add it to the list of finished states. - next_state->step_index++; - QueryStep *next_step = step + 1; - if (next_step->depth == PATTERN_DONE_MARKER) { - LOG(" finish pattern %u\n", next_state->pattern_index); + // If this is the end of a repetition, then jump back to the beginning + // of that repetition. + if (step->repeat_step_index != NONE) { + next_state->step_index_on_failure = next_state->step_index + 1; + next_state->step_index = step->repeat_step_index; + next_state->repeat_match_count++; + LOG( + " continue repeat. pattern:%u, match_count:%u\n", + next_state->pattern_index, + next_state->repeat_match_count + ); + } else { + next_state->step_index++; + LOG( + " advance state. pattern:%u, step:%u\n", + next_state->pattern_index, + next_state->step_index + ); - next_state->id = self->next_state_id++; - array_push(&self->finished_states, *next_state); - if (next_state == state) { - array_erase(&self->states, i); - i--; - n--; - } else { - self->states.size--; + QueryStep *next_step = step + 1; + + // If the pattern is now done, then remove it from the list of + // in-progress states, and add it to the list of finished states. + if (next_step->depth == PATTERN_DONE_MARKER) { + LOG(" finish pattern %u\n", next_state->pattern_index); + + next_state->id = self->next_state_id++; + array_push(&self->finished_states, *next_state); + if (next_state == state) { + array_erase(&self->states, i); + i--; + n--; + } else { + self->states.size--; + } } } } + // Continue descending if possible. if (ts_tree_cursor_goto_first_child(&self->cursor)) { self->depth++; @@ -1411,11 +1589,12 @@ bool ts_query_cursor_next_match( QueryState *state = &self->finished_states.contents[0]; match->id = state->id; match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); + match->captures = captures->contents; + match->capture_count = captures->size; capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); array_erase(&self->finished_states, 0); return true; @@ -1468,13 +1647,13 @@ bool ts_query_cursor_next_capture( uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; - if (state->capture_count > state->consumed_capture_count) { - const TSQueryCapture *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); + CaptureList *captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + if (captures->size > state->consumed_capture_count) { uint32_t capture_byte = ts_node_start_byte( - captures[state->consumed_capture_count].node + captures->contents[state->consumed_capture_count].node ); if ( capture_byte < first_finished_capture_byte || @@ -1506,11 +1685,12 @@ bool ts_query_cursor_next_capture( ]; match->id = state->id; match->pattern_index = state->pattern_index; - match->capture_count = state->capture_count; - match->captures = capture_list_pool_get( + CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); + match->captures = captures->contents; + match->capture_count = captures->size; *capture_index = state->consumed_capture_count; state->consumed_capture_count++; return true; -- cgit From 8349192503450d645bad6a2b30a72c67fd97f7c8 Mon Sep 17 00:00:00 2001 From: Thomas Vigouroux Date: Fri, 15 May 2020 12:23:26 +0200 Subject: treesitter: update runtime Since tree-sitter PR 615, predicates are not parsed the same. "Old" way of writing predicates is still supported. --- src/tree_sitter/query.c | 700 +++++++++++++++++++++++++++++++----------------- 1 file changed, 449 insertions(+), 251 deletions(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index 92a8006179..d5fb90075d 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -27,7 +27,7 @@ typedef struct { * represented as a sequence of these steps. Fields: * * - `symbol` - The grammar symbol to match. A zero value represents the - * wildcard symbol, '*'. + * wildcard symbol, '_'. * - `field` - The field name to match. A zero value means that a field name * was not specified. * - `capture_id` - An integer representing the name of the capture associated @@ -35,21 +35,21 @@ typedef struct { * captured in this pattern. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. - * - `repeat_step_index` - If this step is part of a repetition, the index of - * the beginning of the repetition. A `NONE` value means this step is not - * part of a repetition. + * - `alternative_index` - The index of a different query step that serves as + * an alternative to this step. */ typedef struct { TSSymbol symbol; TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t repeat_step_index; - uint16_t depth: 11; + uint16_t alternative_index; + uint8_t depth; bool contains_captures: 1; bool is_pattern_start: 1; bool is_immediate: 1; bool is_last: 1; - bool is_repeated: 1; + bool is_placeholder: 1; + bool alternative_is_immediate: 1; } QueryStep; /* @@ -88,7 +88,25 @@ typedef struct { * QueryState - The state of an in-progress match of a particular pattern * in a query. While executing, a `TSQueryCursor` must keep track of a number * of possible in-progress matches. Each of those possible matches is - * represented as one of these states. + * represented as one of these states. Fields: + * - `id` - A numeric id that is exposed to the public API. This allows the + * caller to remove a given match, preventing any more of its captures + * from being returned. + * - `start_depth` - The depth in the tree where the first step of the state's + * pattern was matched. + * - `pattern_index` - The pattern that the state is matching. + * - `consumed_capture_count` - The number of captures from this match that + * have already been returned. + * - `capture_list_id` - A numeric id that can be used to retrieve the state's + * list of captures from the `CaptureListPool`. + * - `seeking_immediate_match` - A flag that indicates that the state's next + * step must be matched by the very next sibling. This is used when + * processing repetitions. + * - `has_in_progress_alternatives` - A flag that indicates that there is are + * other states that have the same captures as this state, but are at + * different steps in their pattern. This means that in order to obey the + * 'longest-match' rule, this state should not be returned as a match until + * it is clear that there can be no longer match. */ typedef struct { uint32_t id; @@ -96,10 +114,9 @@ typedef struct { uint16_t pattern_index; uint16_t step_index; uint16_t consumed_capture_count; - uint16_t repeat_match_count; - uint16_t step_index_on_failure; uint8_t capture_list_id; - bool seeking_non_match; + bool seeking_immediate_match: 1; + bool has_in_progress_alternatives: 1; } QueryState; typedef Array(TSQueryCapture) CaptureList; @@ -417,12 +434,13 @@ static QueryStep query_step__new( .depth = depth, .field = 0, .capture_ids = {NONE, NONE, NONE, NONE}, + .alternative_index = NONE, .contains_captures = false, - .is_repeated = false, .is_last = false, .is_pattern_start = false, + .is_placeholder = false, .is_immediate = is_immediate, - .repeat_step_index = NONE, + .alternative_is_immediate = false, }; } @@ -511,13 +529,14 @@ static inline bool ts_query__pattern_map_search( static inline void ts_query__pattern_map_insert( TSQuery *self, TSSymbol symbol, - uint32_t start_step_index + uint32_t start_step_index, + uint32_t pattern_index ) { uint32_t index; ts_query__pattern_map_search(self, symbol, &index); array_insert(&self->pattern_map, index, ((PatternEntry) { .step_index = start_step_index, - .pattern_index = self->pattern_map.size, + .pattern_index = pattern_index, })); } @@ -548,12 +567,22 @@ static TSQueryError ts_query__parse_predicate( TSQuery *self, Stream *stream ) { - if (stream->next == ')') return PARENT_DONE; - if (stream->next != '(') return TSQueryErrorSyntax; - stream_advance(stream); + if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; + const char *predicate_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - predicate_name; + uint16_t id = symbol_table_insert_name( + &self->predicate_values, + predicate_name, + length + ); + array_back(&self->predicates_by_pattern)->length++; + array_push(&self->predicate_steps, ((TSQueryPredicateStep) { + .type = TSQueryPredicateStepTypeString, + .value_id = id, + })); stream_skip_whitespace(stream); - unsigned step_count = 0; for (;;) { if (stream->next == ')') { stream_advance(stream); @@ -658,7 +687,6 @@ static TSQueryError ts_query__parse_predicate( return TSQueryErrorSyntax; } - step_count++; stream_skip_whitespace(stream); } @@ -684,93 +712,141 @@ static TSQueryError ts_query__parse_pattern( return PARENT_DONE; } - // Parse a parenthesized node expression + // An open parenthesis can be the start of three possible constructs: + // * A grouped sequence + // * A predicate + // * A named node else if (stream->next == '(') { stream_advance(stream); stream_skip_whitespace(stream); - // Parse a nested list, which represents a pattern followed by - // zero-or-more predicates. - if (stream->next == '(' && depth == 0) { - TSQueryError e = ts_query__parse_pattern(self, stream, 0, capture_count, is_immediate); - if (e) return e; - - // Parse the predicates. - stream_skip_whitespace(stream); + // If this parenthesis is followed by a node, then it represents a grouped sequence. + if (stream->next == '(' || stream->next == '"') { + bool child_is_immediate = false; for (;;) { - TSQueryError e = ts_query__parse_predicate(self, stream); - if (e == PARENT_DONE) { + if (stream->next == '.') { + child_is_immediate = true; stream_advance(stream); stream_skip_whitespace(stream); - return 0; + } + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + capture_count, + child_is_immediate + ); + if (e == PARENT_DONE) { + stream_advance(stream); + break; } else if (e) { return e; } + + child_is_immediate = false; } } - TSSymbol symbol; - - // Parse the wildcard symbol - if (stream->next == '*') { - symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; + // A pound character indicates the start of a predicate. + else if (stream->next == '#') { stream_advance(stream); + return ts_query__parse_predicate(self, stream); } - // Parse a normal node name - else if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - node_name; - symbol = ts_language_symbol_for_name( - self->language, - node_name, - length, - true - ); - if (!symbol) { - stream_reset(stream, node_name); - return TSQueryErrorNodeType; - } - } else { - return TSQueryErrorSyntax; - } + // Otherwise, this parenthesis is the start of a named node. + else { + TSSymbol symbol; - // Add a step for the node. - array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); + // Parse the wildcard symbol + if ( + stream->next == '_' || - // Parse the child patterns - stream_skip_whitespace(stream); - bool child_is_immediate = false; - uint16_t child_start_step_index = self->steps.size; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; + // TODO - remove. + // For temporary backward compatibility, handle '*' as a wildcard. + stream->next == '*' + ) { + symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; stream_advance(stream); - stream_skip_whitespace(stream); } - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth + 1, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE) { - if (child_is_immediate) { - self->steps.contents[child_start_step_index].is_last = true; + // Parse a normal node name + else if (stream_is_ident_start(stream)) { + const char *node_name = stream->input; + stream_scan_identifier(stream); + uint32_t length = stream->input - node_name; + + // TODO - remove. + // For temporary backward compatibility, handle predicates without the leading '#' sign. + if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { + stream_reset(stream, node_name); + return ts_query__parse_predicate(self, stream); } - stream_advance(stream); - break; - } else if (e) { - return e; + + symbol = ts_language_symbol_for_name( + self->language, + node_name, + length, + true + ); + if (!symbol) { + stream_reset(stream, node_name); + return TSQueryErrorNodeType; + } + } else { + return TSQueryErrorSyntax; } - child_is_immediate = false; + // Add a step for the node. + array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); + + // Parse the child patterns + stream_skip_whitespace(stream); + bool child_is_immediate = false; + uint16_t child_start_step_index = self->steps.size; + for (;;) { + if (stream->next == '.') { + child_is_immediate = true; + stream_advance(stream); + stream_skip_whitespace(stream); + } + + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth + 1, + capture_count, + child_is_immediate + ); + if (e == PARENT_DONE) { + if (child_is_immediate) { + self->steps.contents[child_start_step_index].is_last = true; + } + stream_advance(stream); + break; + } else if (e) { + return e; + } + + child_is_immediate = false; + } } } + // Parse a wildcard pattern + else if ( + stream->next == '_' || + + // TODO remove. + // For temporary backward compatibility, handle '*' as a wildcard. + stream->next == '*' + ) { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Add a step that matches any kind of node + array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); + } + // Parse a double-quoted anonymous leaf node expression else if (stream->next == '"') { stream_advance(stream); @@ -842,15 +918,6 @@ static TSQueryError ts_query__parse_pattern( self->steps.contents[step_index].field = field_id; } - // Parse a wildcard pattern - else if (stream->next == '*') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Add a step that matches any kind of node - array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); - } - else { return TSQueryErrorSyntax; } @@ -863,9 +930,29 @@ static TSQueryError ts_query__parse_pattern( if (stream->next == '+') { stream_advance(stream); - step->is_repeated = true; - array_back(&self->steps)->repeat_step_index = starting_step_index; stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_placeholder = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + } + + else if (stream->next == '?') { + stream_advance(stream); + stream_skip_whitespace(stream); + step->alternative_index = self->steps.size; + } + + else if (stream->next == '*') { + stream_advance(stream); + stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); + repeat_step.alternative_index = starting_step_index; + repeat_step.is_placeholder = true; + repeat_step.alternative_is_immediate = true; + array_push(&self->steps, repeat_step); + step->alternative_index = self->steps.size; } // Parse an '@'-prefixed capture pattern @@ -950,6 +1037,7 @@ TSQuery *ts_query_new( Stream stream = stream_new(source, source_len); stream_skip_whitespace(&stream); while (stream.input < stream.end) { + uint32_t pattern_index = self->predicates_by_pattern.size; uint32_t start_step_index = self->steps.size; uint32_t capture_count = 0; array_push(&self->start_bytes_by_pattern, stream.input - source); @@ -980,14 +1068,18 @@ TSQuery *ts_query_new( } // Maintain a map that can look up patterns for a given root symbol. - self->steps.contents[start_step_index].is_pattern_start = true; - ts_query__pattern_map_insert( - self, - self->steps.contents[start_step_index].symbol, - start_step_index - ); - if (self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL) { - self->wildcard_root_pattern_count++; + for (;;) { + QueryStep *step = &self->steps.contents[start_step_index]; + step->is_pattern_start = true; + ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); + if (step->symbol == WILDCARD_SYMBOL) { + self->wildcard_root_pattern_count++; + } + if (step->alternative_index != NONE) { + start_step_index = step->alternative_index; + } else { + break; + } } } @@ -1191,22 +1283,84 @@ static bool ts_query_cursor__first_in_progress_capture( return result; } -static bool ts_query__cursor_add_state( +// Determine which node is first in a depth-first traversal +int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { + if (left.id != right.id) { + uint32_t left_start = ts_node_start_byte(left); + uint32_t right_start = ts_node_start_byte(right); + if (left_start < right_start) return -1; + if (left_start > right_start) return 1; + uint32_t left_node_count = ts_node_end_byte(left); + uint32_t right_node_count = ts_node_end_byte(right); + if (left_node_count > right_node_count) return -1; + if (left_node_count < right_node_count) return 1; + } + return 0; +} + +// Determine if either state contains a superset of the other state's captures. +void ts_query_cursor__compare_captures( TSQueryCursor *self, - const PatternEntry *pattern + QueryState *left_state, + QueryState *right_state, + bool *left_contains_right, + bool *right_contains_left ) { - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - - // If this pattern begins with a repetition, then avoid creating - // new states after already matching the repetition one or more times. - // The query should only one match for the repetition - the one that - // started the earliest. - if (step->is_repeated) { - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->step_index == pattern->step_index) return true; + CaptureList *left_captures = capture_list_pool_get( + &self->capture_list_pool, + left_state->capture_list_id + ); + CaptureList *right_captures = capture_list_pool_get( + &self->capture_list_pool, + right_state->capture_list_id + ); + *left_contains_right = true; + *right_contains_left = true; + unsigned i = 0, j = 0; + for (;;) { + if (i < left_captures->size) { + if (j < right_captures->size) { + TSQueryCapture *left = &left_captures->contents[i]; + TSQueryCapture *right = &right_captures->contents[j]; + if (left->node.id == right->node.id && left->index == right->index) { + i++; + j++; + } else { + switch (ts_query_cursor__compare_nodes(left->node, right->node)) { + case -1: + *right_contains_left = false; + i++; + break; + case 1: + *left_contains_right = false; + j++; + break; + default: + *right_contains_left = false; + *left_contains_right = false; + i++; + j++; + break; + } + } + } else { + *right_contains_left = false; + break; + } + } else { + if (j < right_captures->size) { + *left_contains_right = false; + } + break; } } +} + +static bool ts_query_cursor__add_state( + TSQueryCursor *self, + const PatternEntry *pattern +) { + QueryStep *step = &self->query->steps.contents[pattern->step_index]; uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); @@ -1244,21 +1398,23 @@ static bool ts_query__cursor_add_state( .pattern_index = pattern->pattern_index, .start_depth = self->depth - step->depth, .consumed_capture_count = 0, - .repeat_match_count = 0, - .step_index_on_failure = NONE, - .seeking_non_match = false, + .seeking_immediate_match = false, })); return true; } +// Duplicate the given state and insert the newly-created state immediately after +// the given state in the `states` array. static QueryState *ts_query__cursor_copy_state( TSQueryCursor *self, const QueryState *state ) { uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); if (new_list_id == NONE) return NULL; - array_push(&self->states, *state); - QueryState *new_state = array_back(&self->states); + uint32_t index = (state - self->states.contents) + 1; + QueryState copy = *state; + array_insert(&self->states, index, copy); + QueryState *new_state = &self->states.contents[index]; new_state->capture_list_id = new_list_id; CaptureList *old_captures = capture_list_pool_get( &self->capture_list_pool, @@ -1281,56 +1437,62 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if (self->ascending) { LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); - // When leaving a node, remove any unfinished states whose next step - // needed to match something within that node. + // Leave this node by stepping to its next sibling or to its parent. + bool did_move = true; + if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { + self->ascending = false; + } else if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; + } else { + did_move = false; + } + + // After leaving a node, remove any states that cannot make further progress. uint32_t deleted_count = 0; for (unsigned i = 0, n = self->states.size; i < n; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; - if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + // If a state completed its pattern inside of this node, but was deferred from finishing + // in order to search for longer matches, mark it as finished. + if (step->depth == PATTERN_DONE_MARKER) { + if (state->start_depth > self->depth || !did_move) { + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + deleted_count++; + continue; + } + } + + // If a state needed to match something within this node, then remove that state + // as it has failed to match. + else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { LOG( " failed to match. pattern:%u, step:%u\n", state->pattern_index, state->step_index ); - capture_list_pool_release( &self->capture_list_pool, state->capture_list_id ); deleted_count++; - } else if (deleted_count > 0) { + continue; + } + + if (deleted_count > 0) { self->states.contents[i - deleted_count] = *state; } } - self->states.size -= deleted_count; - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { + if (!did_move) { return self->finished_states.size > 0; } } else { - bool has_later_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = ts_tree_cursor_current_status( - &self->cursor, - &has_later_siblings, - &can_have_later_siblings_with_this_field - ); + // If this node is before the selected range, then avoid descending into it. TSNode node = ts_tree_cursor_current_node(&self->cursor); - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - if (symbol != ts_builtin_sym_error && self->query->symbol_map) { - symbol = self->query->symbol_map[symbol]; - } - - // If this node is before the selected range, then avoid descending - // into it. if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) @@ -1347,18 +1509,26 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { point_lte(self->end_point, ts_node_start_point(node)) ) return false; + // Get the properties of the current node. + TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); + if (symbol != ts_builtin_sym_error && self->query->symbol_map) { + symbol = self->query->symbol_map[symbol]; + } + bool can_have_later_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = ts_tree_cursor_current_status( + &self->cursor, + &can_have_later_siblings, + &can_have_later_siblings_with_this_field + ); LOG( - "enter node. " - "type:%s, field:%s, row:%u state_count:%u, " - "finished_state_count:%u, has_later_siblings:%d, " - "can_have_later_siblings_with_this_field:%d\n", + "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", ts_node_type(node), ts_language_field_name_for_id(self->query->language, field_id), ts_node_start_point(node).row, self->states.size, - self->finished_states.size, - has_later_siblings, - can_have_later_siblings_with_this_field + self->finished_states.size ); // Add new states for any patterns whose root node is a wildcard. @@ -1369,7 +1539,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + if (!ts_query_cursor__add_state(self, pattern)) break; } // Add new states for any patterns whose root node matches this node. @@ -1381,7 +1551,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. if (step->field && field_id != step->field) continue; - if (!ts_query__cursor_add_state(self, pattern)) break; + if (!ts_query_cursor__add_state(self, pattern)) break; // Advance to the next pattern whose root node matches this node. i++; @@ -1392,9 +1562,10 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // Update all of the in-progress states with current node. - for (unsigned i = 0, n = self->states.size; i < n; i++) { + for (unsigned i = 0; i < self->states.size; i++) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; + state->has_in_progress_alternatives = false; // Check that the node matches all of the criteria for the next // step of the pattern. @@ -1407,11 +1578,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { step->symbol == symbol || step->symbol == WILDCARD_SYMBOL || (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); - bool later_sibling_can_match = has_later_siblings; - if (step->is_immediate && is_named) { + bool later_sibling_can_match = can_have_later_siblings; + if ((step->is_immediate && is_named) || state->seeking_immediate_match) { later_sibling_can_match = false; } - if (step->is_last && has_later_siblings) { + if (step->is_last && can_have_later_siblings) { node_does_match = false; } if (step->field) { @@ -1424,25 +1595,8 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } } + // Remove states immediately if it is ever clear that they cannot match. if (!node_does_match) { - // If this QueryState has processed a repeating sequence, and that repeating - // sequence has ended, move on to the *next* step of this state's pattern. - if ( - state->step_index_on_failure != NONE && - (!later_sibling_can_match || step->is_repeated) - ) { - LOG( - " finish repetition state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - state->step_index = state->step_index_on_failure; - state->step_index_on_failure = NONE; - state->repeat_match_count = 0; - i--; - continue; - } - if (!later_sibling_can_match) { LOG( " discard state. pattern:%u, step:%u\n", @@ -1455,115 +1609,159 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { ); array_erase(&self->states, i); i--; - n--; } - - state->seeking_non_match = false; continue; } - // The `seeking_non_match` flag indicates that a previous QueryState - // has already begun processing this repeating sequence, so that *this* - // QueryState should not begin matching until a separate repeating sequence - // is found. - if (state->seeking_non_match) continue; - - // Some patterns can match their root node in multiple ways, - // capturing different children. If this pattern step could match - // later children within the same parent, then this query state - // cannot simply be updated in place. It must be split into two - // states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later - // siblings. - QueryState *next_state = state; + // Some patterns can match their root node in multiple ways, capturing different + // children. If this pattern step could match later children within the same + // parent, then this query state cannot simply be updated in place. It must be + // split into two states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later siblings. if ( - !step->is_pattern_start && - step->contains_captures && later_sibling_can_match && - state->repeat_match_count == 0 + !step->is_pattern_start && + step->contains_captures ) { - QueryState *copy = ts_query__cursor_copy_state(self, state); - - // The QueryState that matched this node has begun matching a repeating - // sequence. The QueryState that *skipped* this node should not start - // matching later elements of the same repeating sequence. - if (step->is_repeated) { - state->seeking_non_match = true; - } - - if (copy) { + if (ts_query__cursor_copy_state(self, state)) { LOG( - " split state. pattern:%u, step:%u\n", - copy->pattern_index, - copy->step_index + " split state for capture. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index ); - next_state = copy; - } else { - LOG(" cannot split state.\n"); + i++; } } - // If the current node is captured in this pattern, add it to the - // capture list. + // If the current node is captured in this pattern, add it to the capture list. + CaptureList *capture_list = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { uint16_t capture_id = step->capture_ids[j]; if (step->capture_ids[j] == NONE) break; - CaptureList *capture_list = capture_list_pool_get( - &self->capture_list_pool, - next_state->capture_list_id - ); - array_push(capture_list, ((TSQueryCapture) { - node, - capture_id - })); + array_push(capture_list, ((TSQueryCapture) { node, capture_id })); LOG( " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", - next_state->pattern_index, + state->pattern_index, capture_id, capture_list->size ); } - // If this is the end of a repetition, then jump back to the beginning - // of that repetition. - if (step->repeat_step_index != NONE) { - next_state->step_index_on_failure = next_state->step_index + 1; - next_state->step_index = step->repeat_step_index; - next_state->repeat_match_count++; - LOG( - " continue repeat. pattern:%u, match_count:%u\n", - next_state->pattern_index, - next_state->repeat_match_count - ); - } else { - next_state->step_index++; - LOG( - " advance state. pattern:%u, step:%u\n", - next_state->pattern_index, - next_state->step_index - ); + // Advance this state to the next step of its pattern. + state->step_index++; + state->seeking_immediate_match = false; + LOG( + " advance state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); - QueryStep *next_step = step + 1; + // If this state's next step has an 'alternative' step (the step is either optional, + // or is the end of a repetition), then copy the state in order to pursue both + // alternatives. The alternative step itself may have an alternative, so this is + // an interative process. + unsigned start_index = state - self->states.contents; + unsigned end_index = start_index + 1; + for (unsigned j = start_index; j < end_index; j++) { + QueryState *state = &self->states.contents[j]; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->alternative_index != NONE) { + QueryState *copy = ts_query__cursor_copy_state(self, state); + if (next_step->is_placeholder) { + state->step_index++; + j--; + } + if (copy) { + i++; + end_index++; + copy->step_index = next_step->alternative_index; + if (next_step->alternative_is_immediate) { + copy->seeking_immediate_match = true; + } + LOG( + " split state for branch. pattern:%u, step:%u, step:%u\n", + copy->pattern_index, + state->step_index, + copy->step_index + ); + } + } + } + } - // If the pattern is now done, then remove it from the list of - // in-progress states, and add it to the list of finished states. - if (next_step->depth == PATTERN_DONE_MARKER) { - LOG(" finish pattern %u\n", next_state->pattern_index); + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + bool did_remove = false; - next_state->id = self->next_state_id++; - array_push(&self->finished_states, *next_state); - if (next_state == state) { - array_erase(&self->states, i); - i--; - n--; + // Enfore the longest-match criteria. When a query pattern contains optional or + // repeated nodes, this is necesssary to avoid multiple redundant states, where + // one state has a strict subset of another state's captures. + for (unsigned j = i + 1; j < self->states.size; j++) { + QueryState *other_state = &self->states.contents[j]; + if ( + state->pattern_index == other_state->pattern_index && + state->start_depth == other_state->start_depth + ) { + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; + } + other_state->has_in_progress_alternatives = true; + } + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + did_remove = true; + break; + } + state->has_in_progress_alternatives = true; + } + } + } + + // If there the state is at the end of its pattern, remove it from the list + // of in-progress states and add it to the list of finished states. + if (!did_remove) { + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->depth == PATTERN_DONE_MARKER) { + if (state->has_in_progress_alternatives) { + LOG(" defer finishing pattern %u\n", state->pattern_index); } else { - self->states.size--; + LOG(" finish pattern %u\n", state->pattern_index); + state->id = self->next_state_id++; + array_push(&self->finished_states, *state); + array_erase(&self->states, state - self->states.contents); + i--; } } } } - // Continue descending if possible. if (ts_tree_cursor_goto_first_child(&self->cursor)) { self->depth++; -- cgit From f5fbe8e3b58b707eadac70de94ca0475071e5da9 Mon Sep 17 00:00:00 2001 From: Thomas Vigouroux Date: Tue, 2 Jun 2020 22:16:54 +0200 Subject: treesitter: add update script and update runtime Update treesitter runtime to : 9a82dcc666d06617cbab3061467075019fae0b0d --- src/tree_sitter/query.c | 359 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 238 insertions(+), 121 deletions(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index d5fb90075d..1ffc5ad6e2 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -8,6 +8,13 @@ #include "./unicode.h" #include +// #define LOG(...) fprintf(stderr, __VA_ARGS__) +#define LOG(...) + +#define MAX_STATE_COUNT 256 +#define MAX_CAPTURE_LIST_COUNT 32 +#define MAX_STEP_CAPTURE_COUNT 3 + /* * Stream - A sequence of unicode characters derived from a UTF8 string. * This struct is used in parsing queries from S-expressions. @@ -19,8 +26,6 @@ typedef struct { uint8_t next_size; } Stream; -#define MAX_STEP_CAPTURE_COUNT 4 - /* * QueryStep - A step in the process of matching a query. Each node within * a query S-expression maps to one of these steps. An entire pattern is @@ -30,9 +35,8 @@ typedef struct { * wildcard symbol, '_'. * - `field` - The field name to match. A zero value means that a field name * was not specified. - * - `capture_id` - An integer representing the name of the capture associated - * with this node in the pattern. A `NONE` value means this node is not - * captured in this pattern. + * - `capture_ids` - An array of integers representing the names of captures + * associated with this node in the pattern, terminated by a `NONE` value. * - `depth` - The depth where this node occurs in the pattern. The root node * of the pattern has depth zero. * - `alternative_index` - The index of a different query step that serves as @@ -43,12 +47,13 @@ typedef struct { TSFieldId field; uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; uint16_t alternative_index; - uint8_t depth; + uint16_t depth; bool contains_captures: 1; bool is_pattern_start: 1; bool is_immediate: 1; - bool is_last: 1; - bool is_placeholder: 1; + bool is_last_child: 1; + bool is_pass_through: 1; + bool is_dead_end: 1; bool alternative_is_immediate: 1; } QueryStep; @@ -111,10 +116,10 @@ typedef struct { typedef struct { uint32_t id; uint16_t start_depth; - uint16_t pattern_index; uint16_t step_index; - uint16_t consumed_capture_count; - uint8_t capture_list_id; + uint16_t pattern_index; + uint16_t capture_list_id; + uint16_t consumed_capture_count: 14; bool seeking_immediate_match: 1; bool has_in_progress_alternatives: 1; } QueryState; @@ -128,7 +133,8 @@ typedef Array(TSQueryCapture) CaptureList; * are currently in use. */ typedef struct { - CaptureList list[32]; + CaptureList list[MAX_CAPTURE_LIST_COUNT]; + CaptureList empty_list; uint32_t usage_map; } CaptureListPool; @@ -169,14 +175,10 @@ struct TSQueryCursor { }; static const TSQueryError PARENT_DONE = -1; -static const uint8_t PATTERN_DONE_MARKER = UINT8_MAX; +static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX; static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1; -static const uint16_t MAX_STATE_COUNT = 32; - -// #define LOG(...) fprintf(stderr, __VA_ARGS__) -#define LOG(...) /********** * Stream @@ -259,24 +261,31 @@ static void stream_scan_identifier(Stream *stream) { static CaptureListPool capture_list_pool_new(void) { return (CaptureListPool) { + .empty_list = array_new(), .usage_map = UINT32_MAX, }; } static void capture_list_pool_reset(CaptureListPool *self) { self->usage_map = UINT32_MAX; - for (unsigned i = 0; i < 32; i++) { + for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { array_clear(&self->list[i]); } } static void capture_list_pool_delete(CaptureListPool *self) { - for (unsigned i = 0; i < 32; i++) { + for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { array_delete(&self->list[i]); } } -static CaptureList *capture_list_pool_get(CaptureListPool *self, uint16_t id) { +static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { + if (id >= MAX_CAPTURE_LIST_COUNT) return &self->empty_list; + return &self->list[id]; +} + +static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { + assert(id < MAX_CAPTURE_LIST_COUNT); return &self->list[id]; } @@ -290,12 +299,14 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { // the leading zeros in the usage map. An id of zero corresponds to the // highest-order bit in the bitmask. uint16_t id = count_leading_zeros(self->usage_map); - if (id == 32) return NONE; + if (id >= MAX_CAPTURE_LIST_COUNT) return NONE; self->usage_map &= ~bitmask_for_index(id); + array_clear(&self->list[id]); return id; } static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { + if (id >= MAX_CAPTURE_LIST_COUNT) return; array_clear(&self->list[id]); self->usage_map |= bitmask_for_index(id); } @@ -433,12 +444,13 @@ static QueryStep query_step__new( .symbol = symbol, .depth = depth, .field = 0, - .capture_ids = {NONE, NONE, NONE, NONE}, + .capture_ids = {NONE, NONE, NONE}, .alternative_index = NONE, .contains_captures = false, - .is_last = false, + .is_last_child = false, .is_pattern_start = false, - .is_placeholder = false, + .is_pass_through = false, + .is_dead_end = false, .is_immediate = is_immediate, .alternative_is_immediate = false, }; @@ -703,15 +715,60 @@ static TSQueryError ts_query__parse_pattern( uint32_t *capture_count, bool is_immediate ) { - uint16_t starting_step_index = self->steps.size; + uint32_t starting_step_index = self->steps.size; if (stream->next == 0) return TSQueryErrorSyntax; - // Finish the parent S-expression - if (stream->next == ')') { + // Finish the parent S-expression. + if (stream->next == ')' || stream->next == ']') { return PARENT_DONE; } + // An open bracket is the start of an alternation. + else if (stream->next == '[') { + stream_advance(stream); + stream_skip_whitespace(stream); + + // Parse each branch, and add a placeholder step in between the branches. + Array(uint32_t) branch_step_indices = array_new(); + for (;;) { + uint32_t start_index = self->steps.size; + TSQueryError e = ts_query__parse_pattern( + self, + stream, + depth, + capture_count, + is_immediate + ); + + if (e == PARENT_DONE && stream->next == ']' && branch_step_indices.size > 0) { + stream_advance(stream); + break; + } else if (e) { + array_delete(&branch_step_indices); + return e; + } + + array_push(&branch_step_indices, start_index); + array_push(&self->steps, query_step__new(0, depth, false)); + } + (void)array_pop(&self->steps); + + // For all of the branches except for the last one, add the subsequent branch as an + // alternative, and link the end of the branch to the current end of the steps. + for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { + uint32_t step_index = branch_step_indices.contents[i]; + uint32_t next_step_index = branch_step_indices.contents[i + 1]; + QueryStep *start_step = &self->steps.contents[step_index]; + QueryStep *end_step = &self->steps.contents[next_step_index - 1]; + start_step->alternative_index = next_step_index; + end_step->alternative_index = self->steps.size; + end_step->is_dead_end = true; + } + + array_delete(&branch_step_indices); + } + // An open parenthesis can be the start of three possible constructs: // * A grouped sequence // * A predicate @@ -721,7 +778,7 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); // If this parenthesis is followed by a node, then it represents a grouped sequence. - if (stream->next == '(' || stream->next == '"') { + if (stream->next == '(' || stream->next == '"' || stream->next == '[') { bool child_is_immediate = false; for (;;) { if (stream->next == '.') { @@ -736,7 +793,7 @@ static TSQueryError ts_query__parse_pattern( capture_count, child_is_immediate ); - if (e == PARENT_DONE) { + if (e == PARENT_DONE && stream->next == ')') { stream_advance(stream); break; } else if (e) { @@ -817,9 +874,9 @@ static TSQueryError ts_query__parse_pattern( capture_count, child_is_immediate ); - if (e == PARENT_DONE) { + if (e == PARENT_DONE && stream->next == ')') { if (child_is_immediate) { - self->steps.contents[child_start_step_index].is_last = true; + self->steps.contents[child_start_step_index].is_last_child = true; } stream_advance(stream); break; @@ -928,42 +985,54 @@ static TSQueryError ts_query__parse_pattern( for (;;) { QueryStep *step = &self->steps.contents[starting_step_index]; + // Parse the one-or-more operator. if (stream->next == '+') { stream_advance(stream); stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); repeat_step.alternative_index = starting_step_index; - repeat_step.is_placeholder = true; + repeat_step.is_pass_through = true; repeat_step.alternative_is_immediate = true; array_push(&self->steps, repeat_step); } - else if (stream->next == '?') { - stream_advance(stream); - stream_skip_whitespace(stream); - step->alternative_index = self->steps.size; - } - + // Parse the zero-or-more repetition operator. else if (stream->next == '*') { stream_advance(stream); stream_skip_whitespace(stream); + QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); repeat_step.alternative_index = starting_step_index; - repeat_step.is_placeholder = true; + repeat_step.is_pass_through = true; repeat_step.alternative_is_immediate = true; array_push(&self->steps, repeat_step); + + while (step->alternative_index != NONE) { + step = &self->steps.contents[step->alternative_index]; + } + step->alternative_index = self->steps.size; + } + + // Parse the optional operator. + else if (stream->next == '?') { + stream_advance(stream); + stream_skip_whitespace(stream); + + while (step->alternative_index != NONE) { + step = &self->steps.contents[step->alternative_index]; + } step->alternative_index = self->steps.size; } // Parse an '@'-prefixed capture pattern else if (stream->next == '@') { stream_advance(stream); - - // Parse the capture name if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; const char *capture_name = stream->input; stream_scan_identifier(stream); uint32_t length = stream->input - capture_name; + stream_skip_whitespace(stream); // Add the capture id to the first step of the pattern uint16_t capture_id = symbol_table_insert_name( @@ -971,10 +1040,22 @@ static TSQueryError ts_query__parse_pattern( capture_name, length ); - query_step__add_capture(step, capture_id); - (*capture_count)++; - stream_skip_whitespace(stream); + for (;;) { + query_step__add_capture(step, capture_id); + if ( + step->alternative_index != NONE && + step->alternative_index > starting_step_index && + step->alternative_index < self->steps.size + ) { + starting_step_index = step->alternative_index; + step = &self->steps.contents[starting_step_index]; + } else { + break; + } + } + + (*capture_count)++; } // No more suffix modifiers @@ -1051,6 +1132,7 @@ TSQuery *ts_query_new( // If any pattern could not be parsed, then report the error information // and terminate. if (*error_type) { + if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax; *error_offset = stream.input - source; ts_query_delete(self); return NULL; @@ -1075,6 +1157,9 @@ TSQuery *ts_query_new( if (step->symbol == WILDCARD_SYMBOL) { self->wildcard_root_pattern_count++; } + + // If there are alternatives or options at the root of the pattern, + // then add multiple entries to the pattern map. if (step->alternative_index != NONE) { start_step_index = step->alternative_index; } else { @@ -1195,7 +1280,7 @@ TSQueryCursor *ts_query_cursor_new() { .end_point = POINT_MAX, }; array_reserve(&self->states, MAX_STATE_COUNT); - array_reserve(&self->finished_states, MAX_STATE_COUNT); + array_reserve(&self->finished_states, MAX_CAPTURE_LIST_COUNT); return self; } @@ -1257,6 +1342,9 @@ static bool ts_query_cursor__first_in_progress_capture( uint32_t *pattern_index ) { bool result = false; + *state_index = UINT32_MAX; + *byte_offset = UINT32_MAX; + *pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { const QueryState *state = &self->states.contents[i]; const CaptureList *captures = capture_list_pool_get( @@ -1268,10 +1356,7 @@ static bool ts_query_cursor__first_in_progress_capture( if ( !result || capture_byte < *byte_offset || - ( - capture_byte == *byte_offset && - state->pattern_index < *pattern_index - ) + (capture_byte == *byte_offset && state->pattern_index < *pattern_index) ) { result = true; *state_index = i; @@ -1306,11 +1391,11 @@ void ts_query_cursor__compare_captures( bool *left_contains_right, bool *right_contains_left ) { - CaptureList *left_captures = capture_list_pool_get( + const CaptureList *left_captures = capture_list_pool_get( &self->capture_list_pool, left_state->capture_list_id ); - CaptureList *right_captures = capture_list_pool_get( + const CaptureList *right_captures = capture_list_pool_get( &self->capture_list_pool, right_state->capture_list_id ); @@ -1360,40 +1445,18 @@ static bool ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - - uint32_t list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. - if (list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if (ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - )) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - list_id = self->states.contents[state_index].capture_list_id; - array_erase(&self->states, state_index); - } else { - LOG(" too many finished states.\n"); - return false; - } + if (self->states.size >= MAX_STATE_COUNT) { + LOG(" too many states"); + return false; } - LOG( " start state. pattern:%u, step:%u\n", pattern->pattern_index, pattern->step_index ); + QueryStep *step = &self->query->steps.contents[pattern->step_index]; array_push(&self->states, ((QueryState) { - .capture_list_id = list_id, + .capture_list_id = NONE, .step_index = pattern->step_index, .pattern_index = pattern->pattern_index, .start_depth = self->depth - step->depth, @@ -1409,23 +1472,34 @@ static QueryState *ts_query__cursor_copy_state( TSQueryCursor *self, const QueryState *state ) { - uint32_t new_list_id = capture_list_pool_acquire(&self->capture_list_pool); - if (new_list_id == NONE) return NULL; - uint32_t index = (state - self->states.contents) + 1; + if (self->states.size >= MAX_STATE_COUNT) { + LOG(" too many states"); + return NULL; + } + + // If the state has captures, copy its capture list. QueryState copy = *state; + copy.capture_list_id = state->capture_list_id; + if (state->capture_list_id != NONE) { + copy.capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); + if (copy.capture_list_id == NONE) { + LOG(" too many capture lists"); + return NULL; + } + const CaptureList *old_captures = capture_list_pool_get( + &self->capture_list_pool, + state->capture_list_id + ); + CaptureList *new_captures = capture_list_pool_get_mut( + &self->capture_list_pool, + copy.capture_list_id + ); + array_push_all(new_captures, old_captures); + } + + uint32_t index = (state - self->states.contents) + 1; array_insert(&self->states, index, copy); - QueryState *new_state = &self->states.contents[index]; - new_state->capture_list_id = new_list_id; - CaptureList *old_captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - CaptureList *new_captures = capture_list_pool_get( - &self->capture_list_pool, - new_list_id - ); - array_push_all(new_captures, old_captures); - return new_state; + return &self->states.contents[index]; } // Walk the tree, processing patterns until at least one pattern finishes, @@ -1562,10 +1636,11 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { } // Update all of the in-progress states with current node. - for (unsigned i = 0; i < self->states.size; i++) { + for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { QueryState *state = &self->states.contents[i]; QueryStep *step = &self->query->steps.contents[state->step_index]; state->has_in_progress_alternatives = false; + copy_count = 0; // Check that the node matches all of the criteria for the next // step of the pattern. @@ -1582,7 +1657,7 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { if ((step->is_immediate && is_named) || state->seeking_immediate_match) { later_sibling_can_match = false; } - if (step->is_last && can_have_later_siblings) { + if (step->is_last_child && can_have_later_siblings) { node_does_match = false; } if (step->field) { @@ -1629,25 +1704,61 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { state->pattern_index, state->step_index ); - i++; + copy_count++; } } // If the current node is captured in this pattern, add it to the capture list. - CaptureList *capture_list = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { - uint16_t capture_id = step->capture_ids[j]; - if (step->capture_ids[j] == NONE) break; - array_push(capture_list, ((TSQueryCapture) { node, capture_id })); - LOG( - " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", - state->pattern_index, - capture_id, - capture_list->size + // For the first capture in a pattern, lazily acquire a capture list. + if (step->capture_ids[0] != NONE) { + if (state->capture_list_id == NONE) { + state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); + + // If there are no capture lists left in the pool, then terminate whichever + // state has captured the earliest node in the document, and steal its + // capture list. + if (state->capture_list_id == NONE) { + uint32_t state_index, byte_offset, pattern_index; + if (ts_query_cursor__first_in_progress_capture( + self, + &state_index, + &byte_offset, + &pattern_index + )) { + LOG( + " abandon state. index:%u, pattern:%u, offset:%u.\n", + state_index, pattern_index, byte_offset + ); + state->capture_list_id = self->states.contents[state_index].capture_list_id; + array_erase(&self->states, state_index); + if (state_index < i) { + i--; + state--; + } + } else { + LOG(" too many finished states.\n"); + array_erase(&self->states, i); + i--; + continue; + } + } + } + + CaptureList *capture_list = capture_list_pool_get_mut( + &self->capture_list_pool, + state->capture_list_id ); + for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { + uint16_t capture_id = step->capture_ids[j]; + if (step->capture_ids[j] == NONE) break; + array_push(capture_list, ((TSQueryCapture) { node, capture_id })); + LOG( + " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", + state->pattern_index, + capture_id, + capture_list->size + ); + } } // Advance this state to the next step of its pattern. @@ -1663,29 +1774,35 @@ static inline bool ts_query_cursor__advance(TSQueryCursor *self) { // or is the end of a repetition), then copy the state in order to pursue both // alternatives. The alternative step itself may have an alternative, so this is // an interative process. - unsigned start_index = state - self->states.contents; - unsigned end_index = start_index + 1; - for (unsigned j = start_index; j < end_index; j++) { + unsigned end_index = i + 1; + for (unsigned j = i; j < end_index; j++) { QueryState *state = &self->states.contents[j]; QueryStep *next_step = &self->query->steps.contents[state->step_index]; if (next_step->alternative_index != NONE) { + if (next_step->is_dead_end) { + state->step_index = next_step->alternative_index; + j--; + continue; + } + QueryState *copy = ts_query__cursor_copy_state(self, state); - if (next_step->is_placeholder) { + if (next_step->is_pass_through) { state->step_index++; j--; } if (copy) { - i++; + copy_count++; end_index++; copy->step_index = next_step->alternative_index; if (next_step->alternative_is_immediate) { copy->seeking_immediate_match = true; } LOG( - " split state for branch. pattern:%u, step:%u, step:%u\n", + " split state for branch. pattern:%u, step:%u, step:%u, immediate:%d\n", copy->pattern_index, state->step_index, - copy->step_index + copy->step_index, + copy->seeking_immediate_match ); } } @@ -1787,7 +1904,7 @@ bool ts_query_cursor_next_match( QueryState *state = &self->finished_states.contents[0]; match->id = state->id; match->pattern_index = state->pattern_index; - CaptureList *captures = capture_list_pool_get( + const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); @@ -1829,8 +1946,8 @@ bool ts_query_cursor_next_capture( // First, identify the position of the earliest capture in an unfinished // match. For a finished capture to be returned, it must be *before* // this position. - uint32_t first_unfinished_capture_byte = UINT32_MAX; - uint32_t first_unfinished_pattern_index = UINT32_MAX; + uint32_t first_unfinished_capture_byte; + uint32_t first_unfinished_pattern_index; uint32_t first_unfinished_state_index; ts_query_cursor__first_in_progress_capture( self, @@ -1845,7 +1962,7 @@ bool ts_query_cursor_next_capture( uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size; i++) { const QueryState *state = &self->finished_states.contents[i]; - CaptureList *captures = capture_list_pool_get( + const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); @@ -1883,7 +2000,7 @@ bool ts_query_cursor_next_capture( ]; match->id = state->id; match->pattern_index = state->pattern_index; - CaptureList *captures = capture_list_pool_get( + const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id ); -- cgit From 6b949211a06e21af67bf4cb3a20c6f87c932ef2a Mon Sep 17 00:00:00 2001 From: Thomas Vigouroux Date: Wed, 3 Jun 2020 21:33:34 +0200 Subject: treesitter: update runtime Update to 81d533d2d1b580fdb507accabc91ceddffb5b6f0. --- src/tree_sitter/query.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/tree_sitter/query.c') diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c index 1ffc5ad6e2..59902dee3b 100644 --- a/src/tree_sitter/query.c +++ b/src/tree_sitter/query.c @@ -1267,7 +1267,7 @@ void ts_query_disable_pattern( * QueryCursor ***************/ -TSQueryCursor *ts_query_cursor_new() { +TSQueryCursor *ts_query_cursor_new(void) { TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); *self = (TSQueryCursor) { .ascending = false, -- cgit