diff options
Diffstat (limited to 'src/tree_sitter/query.c')
-rw-r--r-- | src/tree_sitter/query.c | 2143 |
1 files changed, 0 insertions, 2143 deletions
diff --git a/src/tree_sitter/query.c b/src/tree_sitter/query.c deleted file mode 100644 index b887b74ff6..0000000000 --- a/src/tree_sitter/query.c +++ /dev/null @@ -1,2143 +0,0 @@ -#include "tree_sitter/api.h" -#include "./alloc.h" -#include "./array.h" -#include "./bits.h" -#include "./language.h" -#include "./point.h" -#include "./tree_cursor.h" -#include "./unicode.h" -#include <wctype.h> - -// #define LOG(...) fprintf(stderr, __VA_ARGS__) -#define LOG(...) - -#define MAX_CAPTURE_LIST_COUNT 32 -#define MAX_STEP_CAPTURE_COUNT 3 - -/* - * Stream - A sequence of unicode characters derived from a UTF8 string. - * This struct is used in parsing queries from S-expressions. - */ -typedef struct { - const char *input; - const char *end; - int32_t next; - uint8_t next_size; -} Stream; - -/* - * QueryStep - A step in the process of matching a query. Each node within - * a query S-expression maps to one of these steps. An entire pattern is - * represented as a sequence of these steps. Fields: - * - * - `symbol` - The grammar symbol to match. A zero value represents the - * wildcard symbol, '_'. - * - `field` - The field name to match. A zero value means that a field name - * was not specified. - * - `capture_ids` - An array of integers representing the names of captures - * associated with this node in the pattern, terminated by a `NONE` value. - * - `depth` - The depth where this node occurs in the pattern. The root node - * of the pattern has depth zero. - * - `alternative_index` - The index of a different query step that serves as - * an alternative to this step. - */ -typedef struct { - TSSymbol symbol; - TSFieldId field; - uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT]; - uint16_t alternative_index; - uint16_t depth; - bool contains_captures: 1; - bool is_immediate: 1; - bool is_last_child: 1; - bool is_pass_through: 1; - bool is_dead_end: 1; - bool alternative_is_immediate: 1; -} QueryStep; - -/* - * Slice - A slice of an external array. Within a query, capture names, - * literal string values, and predicate step informations are stored in three - * contiguous arrays. Individual captures, string values, and predicates are - * represented as slices of these three arrays. - */ -typedef struct { - uint32_t offset; - uint32_t length; -} Slice; - -/* - * SymbolTable - a two-way mapping of strings to ids. - */ -typedef struct { - Array(char) characters; - Array(Slice) slices; -} SymbolTable; - -/* - * PatternEntry - Information about the starting point for matching a - * particular pattern, consisting of the index of the pattern within the query, - * and the index of the patter's first step in the shared `steps` array. These - * entries are stored in a 'pattern map' - a sorted array that makes it - * possible to efficiently lookup patterns based on the symbol for their first - * step. - */ -typedef struct { - uint16_t step_index; - uint16_t pattern_index; -} PatternEntry; - -/* - * QueryState - The state of an in-progress match of a particular pattern - * in a query. While executing, a `TSQueryCursor` must keep track of a number - * of possible in-progress matches. Each of those possible matches is - * represented as one of these states. Fields: - * - `id` - A numeric id that is exposed to the public API. This allows the - * caller to remove a given match, preventing any more of its captures - * from being returned. - * - `start_depth` - The depth in the tree where the first step of the state's - * pattern was matched. - * - `pattern_index` - The pattern that the state is matching. - * - `consumed_capture_count` - The number of captures from this match that - * have already been returned. - * - `capture_list_id` - A numeric id that can be used to retrieve the state's - * list of captures from the `CaptureListPool`. - * - `seeking_immediate_match` - A flag that indicates that the state's next - * step must be matched by the very next sibling. This is used when - * processing repetitions. - * - `has_in_progress_alternatives` - A flag that indicates that there is are - * other states that have the same captures as this state, but are at - * different steps in their pattern. This means that in order to obey the - * 'longest-match' rule, this state should not be returned as a match until - * it is clear that there can be no longer match. - */ -typedef struct { - uint32_t id; - uint16_t start_depth; - uint16_t step_index; - uint16_t pattern_index; - uint16_t capture_list_id; - uint16_t consumed_capture_count: 12; - bool seeking_immediate_match: 1; - bool has_in_progress_alternatives: 1; - bool dead: 1; -} QueryState; - -typedef Array(TSQueryCapture) CaptureList; - -/* - * CaptureListPool - A collection of *lists* of captures. Each QueryState - * needs to maintain its own list of captures. To avoid repeated allocations, - * the reuses a fixed set of capture lists, and keeps track of which ones - * are currently in use. - */ -typedef struct { - CaptureList list[MAX_CAPTURE_LIST_COUNT]; - CaptureList empty_list; - uint32_t usage_map; -} CaptureListPool; - -/* - * TSQuery - A tree query, compiled from a string of S-expressions. The query - * itself is immutable. The mutable state used in the process of executing the - * query is stored in a `TSQueryCursor`. - */ -struct TSQuery { - SymbolTable captures; - SymbolTable predicate_values; - Array(QueryStep) steps; - Array(PatternEntry) pattern_map; - Array(TSQueryPredicateStep) predicate_steps; - Array(Slice) predicates_by_pattern; - Array(uint32_t) start_bytes_by_pattern; - const TSLanguage *language; - uint16_t wildcard_root_pattern_count; - TSSymbol *symbol_map; -}; - -/* - * TSQueryCursor - A stateful struct used to execute a query on a tree. - */ -struct TSQueryCursor { - const TSQuery *query; - TSTreeCursor cursor; - Array(QueryState) states; - Array(QueryState) finished_states; - CaptureListPool capture_list_pool; - uint32_t depth; - uint32_t start_byte; - uint32_t end_byte; - uint32_t next_state_id; - TSPoint start_point; - TSPoint end_point; - bool ascending; - bool halted; -}; - -static const TSQueryError PARENT_DONE = -1; -static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX; -static const uint16_t NONE = UINT16_MAX; -static const TSSymbol WILDCARD_SYMBOL = 0; -static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1; - -/********** - * Stream - **********/ - -// Advance to the next unicode code point in the stream. -static bool stream_advance(Stream *self) { - self->input += self->next_size; - if (self->input < self->end) { - uint32_t size = ts_decode_utf8( - (const uint8_t *)self->input, - self->end - self->input, - &self->next - ); - if (size > 0) { - self->next_size = size; - return true; - } - } else { - self->next_size = 0; - self->next = '\0'; - } - return false; -} - -// Reset the stream to the given input position, represented as a pointer -// into the input string. -static void stream_reset(Stream *self, const char *input) { - self->input = input; - self->next_size = 0; - stream_advance(self); -} - -static Stream stream_new(const char *string, uint32_t length) { - Stream self = { - .next = 0, - .input = string, - .end = string + length, - }; - stream_advance(&self); - return self; -} - -static void stream_skip_whitespace(Stream *stream) { - for (;;) { - if (iswspace(stream->next)) { - stream_advance(stream); - } else if (stream->next == ';') { - // skip over comments - stream_advance(stream); - while (stream->next && stream->next != '\n') { - if (!stream_advance(stream)) break; - } - } else { - break; - } - } -} - -static bool stream_is_ident_start(Stream *stream) { - return iswalnum(stream->next) || stream->next == '_' || stream->next == '-'; -} - -static void stream_scan_identifier(Stream *stream) { - do { - stream_advance(stream); - } while ( - iswalnum(stream->next) || - stream->next == '_' || - stream->next == '-' || - stream->next == '.' || - stream->next == '?' || - stream->next == '!' - ); -} - -/****************** - * CaptureListPool - ******************/ - -static CaptureListPool capture_list_pool_new(void) { - return (CaptureListPool) { - .empty_list = array_new(), - .usage_map = UINT32_MAX, - }; -} - -static void capture_list_pool_reset(CaptureListPool *self) { - self->usage_map = UINT32_MAX; - for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { - array_clear(&self->list[i]); - } -} - -static void capture_list_pool_delete(CaptureListPool *self) { - for (unsigned i = 0; i < MAX_CAPTURE_LIST_COUNT; i++) { - array_delete(&self->list[i]); - } -} - -static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { - if (id >= MAX_CAPTURE_LIST_COUNT) return &self->empty_list; - return &self->list[id]; -} - -static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { - assert(id < MAX_CAPTURE_LIST_COUNT); - return &self->list[id]; -} - -static bool capture_list_pool_is_empty(const CaptureListPool *self) { - return self->usage_map == 0; -} - -static uint16_t capture_list_pool_acquire(CaptureListPool *self) { - // In the usage_map bitmask, ones represent free lists, and zeros represent - // lists that are in use. A free list id can quickly be found by counting - // the leading zeros in the usage map. An id of zero corresponds to the - // highest-order bit in the bitmask. - uint16_t id = count_leading_zeros(self->usage_map); - if (id >= MAX_CAPTURE_LIST_COUNT) return NONE; - self->usage_map &= ~bitmask_for_index(id); - array_clear(&self->list[id]); - return id; -} - -static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { - if (id >= MAX_CAPTURE_LIST_COUNT) return; - array_clear(&self->list[id]); - self->usage_map |= bitmask_for_index(id); -} - -/************** - * SymbolTable - **************/ - -static SymbolTable symbol_table_new(void) { - return (SymbolTable) { - .characters = array_new(), - .slices = array_new(), - }; -} - -static void symbol_table_delete(SymbolTable *self) { - array_delete(&self->characters); - array_delete(&self->slices); -} - -static int symbol_table_id_for_name( - const SymbolTable *self, - const char *name, - uint32_t length -) { - for (unsigned i = 0; i < self->slices.size; i++) { - Slice slice = self->slices.contents[i]; - if ( - slice.length == length && - !strncmp(&self->characters.contents[slice.offset], name, length) - ) return i; - } - return -1; -} - -static const char *symbol_table_name_for_id( - const SymbolTable *self, - uint16_t id, - uint32_t *length -) { - Slice slice = self->slices.contents[id]; - *length = slice.length; - return &self->characters.contents[slice.offset]; -} - -static uint16_t symbol_table_insert_name( - SymbolTable *self, - const char *name, - uint32_t length -) { - int id = symbol_table_id_for_name(self, name, length); - if (id >= 0) return (uint16_t)id; - Slice slice = { - .offset = self->characters.size, - .length = length, - }; - array_grow_by(&self->characters, length + 1); - memcpy(&self->characters.contents[slice.offset], name, length); - self->characters.contents[self->characters.size - 1] = 0; - array_push(&self->slices, slice); - return self->slices.size - 1; -} - -static uint16_t symbol_table_insert_name_with_escapes( - SymbolTable *self, - const char *escaped_name, - uint32_t escaped_length -) { - Slice slice = { - .offset = self->characters.size, - .length = 0, - }; - array_grow_by(&self->characters, escaped_length + 1); - - // Copy the contents of the literal into the characters buffer, processing escape - // sequences like \n and \". This needs to be done before checking if the literal - // is already present, in order to do the string comparison. - bool is_escaped = false; - for (unsigned i = 0; i < escaped_length; i++) { - const char *src = &escaped_name[i]; - char *dest = &self->characters.contents[slice.offset + slice.length]; - if (is_escaped) { - switch (*src) { - case 'n': - *dest = '\n'; - break; - case 'r': - *dest = '\r'; - break; - case 't': - *dest = '\t'; - break; - case '0': - *dest = '\0'; - break; - default: - *dest = *src; - break; - } - is_escaped = false; - slice.length++; - } else { - if (*src == '\\') { - is_escaped = true; - } else { - *dest = *src; - slice.length++; - } - } - } - - // If the string is already present, remove the redundant content from the characters - // buffer and return the existing id. - int id = symbol_table_id_for_name(self, &self->characters.contents[slice.offset], slice.length); - if (id >= 0) { - self->characters.size -= (escaped_length + 1); - return id; - } - - self->characters.contents[slice.offset + slice.length] = 0; - array_push(&self->slices, slice); - return self->slices.size - 1; -} - -/************ - * QueryStep - ************/ - -static QueryStep query_step__new( - TSSymbol symbol, - uint16_t depth, - bool is_immediate -) { - return (QueryStep) { - .symbol = symbol, - .depth = depth, - .field = 0, - .capture_ids = {NONE, NONE, NONE}, - .alternative_index = NONE, - .contains_captures = false, - .is_last_child = false, - .is_pass_through = false, - .is_dead_end = false, - .is_immediate = is_immediate, - .alternative_is_immediate = false, - }; -} - -static void query_step__add_capture(QueryStep *self, uint16_t capture_id) { - for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { - if (self->capture_ids[i] == NONE) { - self->capture_ids[i] = capture_id; - break; - } - } -} - -static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) { - for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) { - if (self->capture_ids[i] == capture_id) { - self->capture_ids[i] = NONE; - while (i + 1 < MAX_STEP_CAPTURE_COUNT) { - if (self->capture_ids[i + 1] == NONE) break; - self->capture_ids[i] = self->capture_ids[i + 1]; - self->capture_ids[i + 1] = NONE; - i++; - } - break; - } - } -} - -/********* - * Query - *********/ - -// The `pattern_map` contains a mapping from TSSymbol values to indices in the -// `steps` array. For a given syntax node, the `pattern_map` makes it possible -// to quickly find the starting steps of all of the patterns whose root matches -// that node. Each entry has two fields: a `pattern_index`, which identifies one -// of the patterns in the query, and a `step_index`, which indicates the start -// offset of that pattern's steps within the `steps` array. -// -// The entries are sorted by the patterns' root symbols, and lookups use a -// binary search. This ensures that the cost of this initial lookup step -// scales logarithmically with the number of patterns in the query. -// -// This returns `true` if the symbol is present and `false` otherwise. -// If the symbol is not present `*result` is set to the index where the -// symbol should be inserted. -static inline bool ts_query__pattern_map_search( - const TSQuery *self, - TSSymbol needle, - uint32_t *result -) { - uint32_t base_index = self->wildcard_root_pattern_count; - uint32_t size = self->pattern_map.size - base_index; - if (size == 0) { - *result = base_index; - return false; - } - while (size > 1) { - uint32_t half_size = size / 2; - uint32_t mid_index = base_index + half_size; - TSSymbol mid_symbol = self->steps.contents[ - self->pattern_map.contents[mid_index].step_index - ].symbol; - if (needle > mid_symbol) base_index = mid_index; - size -= half_size; - } - - TSSymbol symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; - - if (needle > symbol) { - base_index++; - if (base_index < self->pattern_map.size) { - symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; - } - } - - *result = base_index; - return needle == symbol; -} - -// Insert a new pattern's start index into the pattern map, maintaining -// the pattern map's ordering invariant. -static inline void ts_query__pattern_map_insert( - TSQuery *self, - TSSymbol symbol, - uint32_t start_step_index, - uint32_t pattern_index -) { - uint32_t index; - ts_query__pattern_map_search(self, symbol, &index); - - // Ensure that the entries are sorted not only by symbol, but also - // by pattern_index. This way, states for earlier patterns will be - // initiated first, which allows the ordering of the states array - // to be maintained more efficiently. - while (index < self->pattern_map.size) { - PatternEntry *entry = &self->pattern_map.contents[index]; - if ( - self->steps.contents[entry->step_index].symbol == symbol && - entry->pattern_index < pattern_index - ) { - index++; - } else { - break; - } - } - - array_insert(&self->pattern_map, index, ((PatternEntry) { - .step_index = start_step_index, - .pattern_index = pattern_index, - })); -} - -static void ts_query__finalize_steps(TSQuery *self) { - for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; - uint32_t depth = step->depth; - if (step->capture_ids[0] != NONE) { - step->contains_captures = true; - } else { - step->contains_captures = false; - for (unsigned j = i + 1; j < self->steps.size; j++) { - QueryStep *s = &self->steps.contents[j]; - if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break; - if (s->capture_ids[0] != NONE) step->contains_captures = true; - } - } - } -} - -// Parse a single predicate associated with a pattern, adding it to the -// query's internal `predicate_steps` array. Predicates are arbitrary -// S-expressions associated with a pattern which are meant to be handled at -// a higher level of abstraction, such as the Rust/JavaScript bindings. They -// can contain '@'-prefixed capture names, double-quoted strings, and bare -// symbols, which also represent strings. -static TSQueryError ts_query__parse_predicate( - TSQuery *self, - Stream *stream -) { - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *predicate_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - predicate_name; - uint16_t id = symbol_table_insert_name( - &self->predicate_values, - predicate_name, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - stream_skip_whitespace(stream); - - for (;;) { - if (stream->next == ')') { - stream_advance(stream); - stream_skip_whitespace(stream); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeDone, - .value_id = 0, - })); - break; - } - - // Parse an '@'-prefixed capture name - else if (stream->next == '@') { - stream_advance(stream); - - // Parse the capture name - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; - - // Add the capture id to the first step of the pattern - int capture_id = symbol_table_id_for_name( - &self->captures, - capture_name, - length - ); - if (capture_id == -1) { - stream_reset(stream, capture_name); - return TSQueryErrorCapture; - } - - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeCapture, - .value_id = capture_id, - })); - } - - // Parse a string literal - else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - bool is_escaped = false; - const char *string_content = stream->input; - for (;;) { - if (is_escaped) { - is_escaped = false; - } else { - if (stream->next == '\\') { - is_escaped = true; - } else if (stream->next == '"') { - break; - } else if (stream->next == '\n') { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node - uint16_t id = symbol_table_insert_name_with_escapes( - &self->predicate_values, - string_content, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); - } - - // Parse a bare symbol - else if (stream_is_ident_start(stream)) { - const char *symbol_start = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - symbol_start; - uint16_t id = symbol_table_insert_name( - &self->predicate_values, - symbol_start, - length - ); - array_back(&self->predicates_by_pattern)->length++; - array_push(&self->predicate_steps, ((TSQueryPredicateStep) { - .type = TSQueryPredicateStepTypeString, - .value_id = id, - })); - } - - else { - return TSQueryErrorSyntax; - } - - stream_skip_whitespace(stream); - } - - return 0; -} - -// Read one S-expression pattern from the stream, and incorporate it into -// the query's internal state machine representation. For nested patterns, -// this function calls itself recursively. -static TSQueryError ts_query__parse_pattern( - TSQuery *self, - Stream *stream, - uint32_t depth, - uint32_t *capture_count, - bool is_immediate -) { - const uint32_t starting_step_index = self->steps.size; - - if (stream->next == 0) return TSQueryErrorSyntax; - - // Finish the parent S-expression. - if (stream->next == ')' || stream->next == ']') { - return PARENT_DONE; - } - - // An open bracket is the start of an alternation. - else if (stream->next == '[') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Parse each branch, and add a placeholder step in between the branches. - Array(uint32_t) branch_step_indices = array_new(); - for (;;) { - uint32_t start_index = self->steps.size; - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - is_immediate - ); - - if (e == PARENT_DONE && stream->next == ']' && branch_step_indices.size > 0) { - stream_advance(stream); - break; - } else if (e) { - array_delete(&branch_step_indices); - return e; - } - - array_push(&branch_step_indices, start_index); - array_push(&self->steps, query_step__new(0, depth, false)); - } - (void)array_pop(&self->steps); - - // For all of the branches except for the last one, add the subsequent branch as an - // alternative, and link the end of the branch to the current end of the steps. - for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { - uint32_t step_index = branch_step_indices.contents[i]; - uint32_t next_step_index = branch_step_indices.contents[i + 1]; - QueryStep *start_step = &self->steps.contents[step_index]; - QueryStep *end_step = &self->steps.contents[next_step_index - 1]; - start_step->alternative_index = next_step_index; - end_step->alternative_index = self->steps.size; - end_step->is_dead_end = true; - } - - array_delete(&branch_step_indices); - } - - // An open parenthesis can be the start of three possible constructs: - // * A grouped sequence - // * A predicate - // * A named node - else if (stream->next == '(') { - stream_advance(stream); - stream_skip_whitespace(stream); - - // If this parenthesis is followed by a node, then it represents a grouped sequence. - if (stream->next == '(' || stream->next == '"' || stream->next == '[') { - bool child_is_immediate = false; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; - stream_advance(stream); - stream_skip_whitespace(stream); - } - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE && stream->next == ')') { - stream_advance(stream); - break; - } else if (e) { - return e; - } - - child_is_immediate = false; - } - } - - // A dot/pound character indicates the start of a predicate. - else if (stream->next == '.' || stream->next == '#') { - stream_advance(stream); - return ts_query__parse_predicate(self, stream); - } - - // Otherwise, this parenthesis is the start of a named node. - else { - TSSymbol symbol; - - // Parse the wildcard symbol - if ( - stream->next == '_' || - - // TODO - remove. - // For temporary backward compatibility, handle '*' as a wildcard. - stream->next == '*' - ) { - symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL; - stream_advance(stream); - } - - // Parse a normal node name - else if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - node_name; - - // TODO - remove. - // For temporary backward compatibility, handle predicates without the leading '#' sign. - if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) { - stream_reset(stream, node_name); - return ts_query__parse_predicate(self, stream); - } - - symbol = ts_language_symbol_for_name( - self->language, - node_name, - length, - true - ); - if (!symbol) { - stream_reset(stream, node_name); - return TSQueryErrorNodeType; - } - } else { - return TSQueryErrorSyntax; - } - - // Add a step for the node. - array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); - - // Parse the child patterns - stream_skip_whitespace(stream); - bool child_is_immediate = false; - uint16_t child_start_step_index = self->steps.size; - for (;;) { - if (stream->next == '.') { - child_is_immediate = true; - stream_advance(stream); - stream_skip_whitespace(stream); - } - - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth + 1, - capture_count, - child_is_immediate - ); - if (e == PARENT_DONE && stream->next == ')') { - if (child_is_immediate) { - self->steps.contents[child_start_step_index].is_last_child = true; - } - stream_advance(stream); - break; - } else if (e) { - return e; - } - - child_is_immediate = false; - } - } - } - - // Parse a wildcard pattern - else if ( - stream->next == '_' || - - // TODO remove. - // For temporary backward compatibility, handle '*' as a wildcard. - stream->next == '*' - ) { - stream_advance(stream); - stream_skip_whitespace(stream); - - // Add a step that matches any kind of node - array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate)); - } - - // Parse a double-quoted anonymous leaf node expression - else if (stream->next == '"') { - stream_advance(stream); - - // Parse the string content - const char *string_content = stream->input; - while (stream->next != '"') { - if (!stream_advance(stream)) { - stream_reset(stream, string_content - 1); - return TSQueryErrorSyntax; - } - } - uint32_t length = stream->input - string_content; - - // Add a step for the node - TSSymbol symbol = ts_language_symbol_for_name( - self->language, - string_content, - length, - false - ); - if (!symbol) { - stream_reset(stream, string_content); - return TSQueryErrorNodeType; - } - array_push(&self->steps, query_step__new(symbol, depth, is_immediate)); - - if (stream->next != '"') return TSQueryErrorSyntax; - stream_advance(stream); - } - - // Parse a field-prefixed pattern - else if (stream_is_ident_start(stream)) { - // Parse the field name - const char *field_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - field_name; - stream_skip_whitespace(stream); - - if (stream->next != ':') { - stream_reset(stream, field_name); - return TSQueryErrorSyntax; - } - stream_advance(stream); - stream_skip_whitespace(stream); - - // Parse the pattern - TSQueryError e = ts_query__parse_pattern( - self, - stream, - depth, - capture_count, - is_immediate - ); - if (e == PARENT_DONE) return TSQueryErrorSyntax; - if (e) return e; - - // Add the field name to the first step of the pattern - TSFieldId field_id = ts_language_field_id_for_name( - self->language, - field_name, - length - ); - if (!field_id) { - stream->input = field_name; - return TSQueryErrorField; - } - - uint32_t step_index = starting_step_index; - QueryStep *step = &self->steps.contents[step_index]; - for (;;) { - step->field = field_id; - if ( - step->alternative_index != NONE && - step->alternative_index > step_index && - step->alternative_index < self->steps.size - ) { - step_index = step->alternative_index; - step = &self->steps.contents[step_index]; - } else { - break; - } - } - } - - else { - return TSQueryErrorSyntax; - } - - stream_skip_whitespace(stream); - - // Parse suffixes modifiers for this pattern - for (;;) { - QueryStep *step = &self->steps.contents[starting_step_index]; - - // Parse the one-or-more operator. - if (stream->next == '+') { - stream_advance(stream); - stream_skip_whitespace(stream); - - QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); - repeat_step.alternative_index = starting_step_index; - repeat_step.is_pass_through = true; - repeat_step.alternative_is_immediate = true; - array_push(&self->steps, repeat_step); - } - - // Parse the zero-or-more repetition operator. - else if (stream->next == '*') { - stream_advance(stream); - stream_skip_whitespace(stream); - - QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false); - repeat_step.alternative_index = starting_step_index; - repeat_step.is_pass_through = true; - repeat_step.alternative_is_immediate = true; - array_push(&self->steps, repeat_step); - - while (step->alternative_index != NONE) { - step = &self->steps.contents[step->alternative_index]; - } - step->alternative_index = self->steps.size; - } - - // Parse the optional operator. - else if (stream->next == '?') { - stream_advance(stream); - stream_skip_whitespace(stream); - - while (step->alternative_index != NONE) { - step = &self->steps.contents[step->alternative_index]; - } - step->alternative_index = self->steps.size; - } - - // Parse an '@'-prefixed capture pattern - else if (stream->next == '@') { - stream_advance(stream); - if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax; - const char *capture_name = stream->input; - stream_scan_identifier(stream); - uint32_t length = stream->input - capture_name; - stream_skip_whitespace(stream); - - // Add the capture id to the first step of the pattern - uint16_t capture_id = symbol_table_insert_name( - &self->captures, - capture_name, - length - ); - - uint32_t step_index = starting_step_index; - for (;;) { - query_step__add_capture(step, capture_id); - if ( - step->alternative_index != NONE && - step->alternative_index > step_index && - step->alternative_index < self->steps.size - ) { - step_index = step->alternative_index; - step = &self->steps.contents[step_index]; - } else { - break; - } - } - - (*capture_count)++; - } - - // No more suffix modifiers - else { - break; - } - } - - return 0; -} - -TSQuery *ts_query_new( - const TSLanguage *language, - const char *source, - uint32_t source_len, - uint32_t *error_offset, - TSQueryError *error_type -) { - TSSymbol *symbol_map; - if (ts_language_version(language) >= TREE_SITTER_LANGUAGE_VERSION_WITH_SYMBOL_DEDUPING) { - symbol_map = NULL; - } else { - // Work around the fact that multiple symbols can currently be - // associated with the same name, due to "simple aliases". - // In the next language ABI version, this map will be contained - // in the language's `public_symbol_map` field. - uint32_t symbol_count = ts_language_symbol_count(language); - symbol_map = ts_malloc(sizeof(TSSymbol) * symbol_count); - for (unsigned i = 0; i < symbol_count; i++) { - const char *name = ts_language_symbol_name(language, i); - const TSSymbolType symbol_type = ts_language_symbol_type(language, i); - - symbol_map[i] = i; - - for (unsigned j = 0; j < i; j++) { - if (ts_language_symbol_type(language, j) == symbol_type) { - if (!strcmp(name, ts_language_symbol_name(language, j))) { - symbol_map[i] = j; - break; - } - } - } - } - } - - TSQuery *self = ts_malloc(sizeof(TSQuery)); - *self = (TSQuery) { - .steps = array_new(), - .pattern_map = array_new(), - .captures = symbol_table_new(), - .predicate_values = symbol_table_new(), - .predicate_steps = array_new(), - .predicates_by_pattern = array_new(), - .symbol_map = symbol_map, - .wildcard_root_pattern_count = 0, - .language = language, - }; - - // Parse all of the S-expressions in the given string. - Stream stream = stream_new(source, source_len); - stream_skip_whitespace(&stream); - while (stream.input < stream.end) { - uint32_t pattern_index = self->predicates_by_pattern.size; - uint32_t start_step_index = self->steps.size; - uint32_t capture_count = 0; - array_push(&self->start_bytes_by_pattern, stream.input - source); - array_push(&self->predicates_by_pattern, ((Slice) { - .offset = self->predicate_steps.size, - .length = 0, - })); - *error_type = ts_query__parse_pattern(self, &stream, 0, &capture_count, false); - array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false)); - - // If any pattern could not be parsed, then report the error information - // and terminate. - if (*error_type) { - if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax; - *error_offset = stream.input - source; - ts_query_delete(self); - return NULL; - } - - // If a pattern has a wildcard at its root, optimize the matching process - // by skipping matching the wildcard. - if ( - self->steps.contents[start_step_index].symbol == WILDCARD_SYMBOL - ) { - QueryStep *second_step = &self->steps.contents[start_step_index + 1]; - if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth != PATTERN_DONE_MARKER) { - start_step_index += 1; - } - } - - // Maintain a map that can look up patterns for a given root symbol. - for (;;) { - QueryStep *step = &self->steps.contents[start_step_index]; - ts_query__pattern_map_insert(self, step->symbol, start_step_index, pattern_index); - if (step->symbol == WILDCARD_SYMBOL) { - self->wildcard_root_pattern_count++; - } - - // If there are alternatives or options at the root of the pattern, - // then add multiple entries to the pattern map. - if (step->alternative_index != NONE) { - start_step_index = step->alternative_index; - step->alternative_index = NONE; - } else { - break; - } - } - } - - ts_query__finalize_steps(self); - return self; -} - -void ts_query_delete(TSQuery *self) { - if (self) { - array_delete(&self->steps); - array_delete(&self->pattern_map); - array_delete(&self->predicate_steps); - array_delete(&self->predicates_by_pattern); - array_delete(&self->start_bytes_by_pattern); - symbol_table_delete(&self->captures); - symbol_table_delete(&self->predicate_values); - ts_free(self->symbol_map); - ts_free(self); - } -} - -uint32_t ts_query_pattern_count(const TSQuery *self) { - return self->predicates_by_pattern.size; -} - -uint32_t ts_query_capture_count(const TSQuery *self) { - return self->captures.slices.size; -} - -uint32_t ts_query_string_count(const TSQuery *self) { - return self->predicate_values.slices.size; -} - -const char *ts_query_capture_name_for_id( - const TSQuery *self, - uint32_t index, - uint32_t *length -) { - return symbol_table_name_for_id(&self->captures, index, length); -} - -const char *ts_query_string_value_for_id( - const TSQuery *self, - uint32_t index, - uint32_t *length -) { - return symbol_table_name_for_id(&self->predicate_values, index, length); -} - -const TSQueryPredicateStep *ts_query_predicates_for_pattern( - const TSQuery *self, - uint32_t pattern_index, - uint32_t *step_count -) { - Slice slice = self->predicates_by_pattern.contents[pattern_index]; - *step_count = slice.length; - if (self->predicate_steps.contents == NULL) { - return NULL; - } - return &self->predicate_steps.contents[slice.offset]; -} - -uint32_t ts_query_start_byte_for_pattern( - const TSQuery *self, - uint32_t pattern_index -) { - return self->start_bytes_by_pattern.contents[pattern_index]; -} - -void ts_query_disable_capture( - TSQuery *self, - const char *name, - uint32_t length -) { - // Remove capture information for any pattern step that previously - // captured with the given name. - int id = symbol_table_id_for_name(&self->captures, name, length); - if (id != -1) { - for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; - query_step__remove_capture(step, id); - } - ts_query__finalize_steps(self); - } -} - -void ts_query_disable_pattern( - TSQuery *self, - uint32_t pattern_index -) { - // Remove the given pattern from the pattern map. Its steps will still - // be in the `steps` array, but they will never be read. - for (unsigned i = 0; i < self->pattern_map.size; i++) { - PatternEntry *pattern = &self->pattern_map.contents[i]; - if (pattern->pattern_index == pattern_index) { - array_erase(&self->pattern_map, i); - i--; - } - } -} - -/*************** - * QueryCursor - ***************/ - -TSQueryCursor *ts_query_cursor_new(void) { - TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor)); - *self = (TSQueryCursor) { - .ascending = false, - .halted = false, - .states = array_new(), - .finished_states = array_new(), - .capture_list_pool = capture_list_pool_new(), - .start_byte = 0, - .end_byte = UINT32_MAX, - .start_point = {0, 0}, - .end_point = POINT_MAX, - }; - array_reserve(&self->states, 8); - array_reserve(&self->finished_states, 8); - return self; -} - -void ts_query_cursor_delete(TSQueryCursor *self) { - array_delete(&self->states); - array_delete(&self->finished_states); - ts_tree_cursor_delete(&self->cursor); - capture_list_pool_delete(&self->capture_list_pool); - ts_free(self); -} - -void ts_query_cursor_exec( - TSQueryCursor *self, - const TSQuery *query, - TSNode node -) { - array_clear(&self->states); - array_clear(&self->finished_states); - ts_tree_cursor_reset(&self->cursor, node); - capture_list_pool_reset(&self->capture_list_pool); - self->next_state_id = 0; - self->depth = 0; - self->ascending = false; - self->halted = false; - self->query = query; -} - -void ts_query_cursor_set_byte_range( - TSQueryCursor *self, - uint32_t start_byte, - uint32_t end_byte -) { - if (end_byte == 0) { - start_byte = 0; - end_byte = UINT32_MAX; - } - self->start_byte = start_byte; - self->end_byte = end_byte; -} - -void ts_query_cursor_set_point_range( - TSQueryCursor *self, - TSPoint start_point, - TSPoint end_point -) { - if (end_point.row == 0 && end_point.column == 0) { - start_point = POINT_ZERO; - end_point = POINT_MAX; - } - self->start_point = start_point; - self->end_point = end_point; -} - -// Search through all of the in-progress states, and find the captured -// node that occurs earliest in the document. -static bool ts_query_cursor__first_in_progress_capture( - TSQueryCursor *self, - uint32_t *state_index, - uint32_t *byte_offset, - uint32_t *pattern_index -) { - bool result = false; - *state_index = UINT32_MAX; - *byte_offset = UINT32_MAX; - *pattern_index = UINT32_MAX; - for (unsigned i = 0; i < self->states.size; i++) { - const QueryState *state = &self->states.contents[i]; - if (state->dead) continue; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - if (captures->size > 0) { - uint32_t capture_byte = ts_node_start_byte(captures->contents[0].node); - if ( - !result || - capture_byte < *byte_offset || - (capture_byte == *byte_offset && state->pattern_index < *pattern_index) - ) { - result = true; - *state_index = i; - *byte_offset = capture_byte; - *pattern_index = state->pattern_index; - } - } - } - return result; -} - -// Determine which node is first in a depth-first traversal -int ts_query_cursor__compare_nodes(TSNode left, TSNode right) { - if (left.id != right.id) { - uint32_t left_start = ts_node_start_byte(left); - uint32_t right_start = ts_node_start_byte(right); - if (left_start < right_start) return -1; - if (left_start > right_start) return 1; - uint32_t left_node_count = ts_node_end_byte(left); - uint32_t right_node_count = ts_node_end_byte(right); - if (left_node_count > right_node_count) return -1; - if (left_node_count < right_node_count) return 1; - } - return 0; -} - -// Determine if either state contains a superset of the other state's captures. -void ts_query_cursor__compare_captures( - TSQueryCursor *self, - QueryState *left_state, - QueryState *right_state, - bool *left_contains_right, - bool *right_contains_left -) { - const CaptureList *left_captures = capture_list_pool_get( - &self->capture_list_pool, - left_state->capture_list_id - ); - const CaptureList *right_captures = capture_list_pool_get( - &self->capture_list_pool, - right_state->capture_list_id - ); - *left_contains_right = true; - *right_contains_left = true; - unsigned i = 0, j = 0; - for (;;) { - if (i < left_captures->size) { - if (j < right_captures->size) { - TSQueryCapture *left = &left_captures->contents[i]; - TSQueryCapture *right = &right_captures->contents[j]; - if (left->node.id == right->node.id && left->index == right->index) { - i++; - j++; - } else { - switch (ts_query_cursor__compare_nodes(left->node, right->node)) { - case -1: - *right_contains_left = false; - i++; - break; - case 1: - *left_contains_right = false; - j++; - break; - default: - *right_contains_left = false; - *left_contains_right = false; - i++; - j++; - break; - } - } - } else { - *right_contains_left = false; - break; - } - } else { - if (j < right_captures->size) { - *left_contains_right = false; - } - break; - } - } -} - -static void ts_query_cursor__add_state( - TSQueryCursor *self, - const PatternEntry *pattern -) { - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - uint32_t start_depth = self->depth - step->depth; - - // Keep the states array in ascending order of start_depth and pattern_index, - // so that it can be processed more efficiently elsewhere. Usually, there is - // no work to do here because of two facts: - // * States with lower start_depth are naturally added first due to the - // order in which nodes are visited. - // * Earlier patterns are naturally added first because of the ordering of the - // pattern_map data structure that's used to initiate matches. - // - // This loop is only needed in cases where two conditions hold: - // * A pattern consists of more than one sibling node, so that its states - // remain in progress after exiting the node that started the match. - // * The first node in the pattern matches against multiple nodes at the - // same depth. - // - // An example of this is the pattern '((comment)* (function))'. If multiple - // `comment` nodes appear in a row, then we may initiate a new state for this - // pattern while another state for the same pattern is already in progress. - // If there are multiple patterns like this in a query, then this loop will - // need to execute in order to keep the states ordered by pattern_index. - uint32_t index = self->states.size; - while (index > 0) { - QueryState *prev_state = &self->states.contents[index - 1]; - if (prev_state->start_depth < start_depth) break; - if (prev_state->start_depth == start_depth) { - if (prev_state->pattern_index < pattern->pattern_index) break; - if (prev_state->pattern_index == pattern->pattern_index) { - // Avoid unnecessarily inserting an unnecessary duplicate state, - // which would be immediately pruned by the longest-match criteria. - if (prev_state->step_index == pattern->step_index) return; - } - } - index--; - } - - LOG( - " start state. pattern:%u, step:%u\n", - pattern->pattern_index, - pattern->step_index - ); - array_insert(&self->states, index, ((QueryState) { - .capture_list_id = NONE, - .step_index = pattern->step_index, - .pattern_index = pattern->pattern_index, - .start_depth = start_depth, - .consumed_capture_count = 0, - .seeking_immediate_match = true, - .has_in_progress_alternatives = false, - .dead = false, - })); -} - -// Acquire a capture list for this state. If there are no capture lists left in the -// pool, this will steal the capture list from another existing state, and mark that -// other state as 'dead'. -static CaptureList *ts_query_cursor__prepare_to_capture( - TSQueryCursor *self, - QueryState *state, - unsigned state_index_to_preserve -) { - if (state->capture_list_id == NONE) { - state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool); - - // If there are no capture lists left in the pool, then terminate whichever - // state has captured the earliest node in the document, and steal its - // capture list. - if (state->capture_list_id == NONE) { - uint32_t state_index, byte_offset, pattern_index; - if ( - ts_query_cursor__first_in_progress_capture( - self, - &state_index, - &byte_offset, - &pattern_index - ) && - state_index != state_index_to_preserve - ) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - state_index, pattern_index, byte_offset - ); - QueryState *other_state = &self->states.contents[state_index]; - state->capture_list_id = other_state->capture_list_id; - other_state->capture_list_id = NONE; - other_state->dead = true; - CaptureList *list = capture_list_pool_get_mut( - &self->capture_list_pool, - state->capture_list_id - ); - array_clear(list); - return list; - } else { - LOG(" ran out of capture lists"); - return NULL; - } - } - } - return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id); -} - -// Duplicate the given state and insert the newly-created state immediately after -// the given state in the `states` array. Ensures that the given state reference is -// still valid, even if the states array is reallocated. -static QueryState *ts_query_cursor__copy_state( - TSQueryCursor *self, - QueryState **state_ref -) { - const QueryState *state = *state_ref; - uint32_t state_index = state - self->states.contents; - QueryState copy = *state; - copy.capture_list_id = NONE; - - // If the state has captures, copy its capture list. - if (state->capture_list_id != NONE) { - CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index); - if (!new_captures) return NULL; - const CaptureList *old_captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - array_push_all(new_captures, old_captures); - } - - array_insert(&self->states, state_index + 1, copy); - *state_ref = &self->states.contents[state_index]; - return &self->states.contents[state_index + 1]; -} - -// Walk the tree, processing patterns until at least one pattern finishes, -// If one or more patterns finish, return `true` and store their states in the -// `finished_states` array. Multiple patterns can finish on the same node. If -// there are no more matches, return `false`. -static inline bool ts_query_cursor__advance(TSQueryCursor *self) { - bool did_match = false; - for (;;) { - if (self->halted) { - while (self->states.size > 0) { - QueryState state = array_pop(&self->states); - capture_list_pool_release( - &self->capture_list_pool, - state.capture_list_id - ); - } - } - - if (did_match || self->halted) return did_match; - - if (self->ascending) { - LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor))); - - // Leave this node by stepping to its next sibling or to its parent. - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { - self->halted = true; - } - - // After leaving a node, remove any states that cannot make further progress. - uint32_t deleted_count = 0; - for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - - // If a state completed its pattern inside of this node, but was deferred from finishing - // in order to search for longer matches, mark it as finished. - if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || self->halted) { - LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; - array_push(&self->finished_states, *state); - did_match = true; - deleted_count++; - continue; - } - } - - // If a state needed to match something within this node, then remove that state - // as it has failed to match. - else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { - LOG( - " failed to match. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - deleted_count++; - continue; - } - - if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; - } - } - self->states.size -= deleted_count; - } else { - // If this node is before the selected range, then avoid descending into it. - TSNode node = ts_tree_cursor_current_node(&self->cursor); - if ( - ts_node_end_byte(node) <= self->start_byte || - point_lte(ts_node_end_point(node), self->start_point) - ) { - if (!ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = true; - } - continue; - } - - // If this node is after the selected range, then stop walking. - if ( - self->end_byte <= ts_node_start_byte(node) || - point_lte(self->end_point, ts_node_start_point(node)) - ) { - self->halted = true; - continue; - } - - // Get the properties of the current node. - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - if (symbol != ts_builtin_sym_error && self->query->symbol_map) { - symbol = self->query->symbol_map[symbol]; - } - bool can_have_later_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = ts_tree_cursor_current_status( - &self->cursor, - &can_have_later_siblings, - &can_have_later_siblings_with_this_field - ); - LOG( - "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", - ts_node_type(node), - ts_language_field_name_for_id(self->query->language, field_id), - ts_node_start_point(node).row, - self->states.size, - self->finished_states.size - ); - - // Add new states for any patterns whose root node is a wildcard. - for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if (step->field && field_id != step->field) continue; - ts_query_cursor__add_state(self, pattern); - } - - // Add new states for any patterns whose root node matches this node. - unsigned i; - if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - do { - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if (step->field && field_id != step->field) continue; - ts_query_cursor__add_state(self, pattern); - - // Advance to the next pattern whose root node matches this node. - i++; - if (i == self->query->pattern_map.size) break; - pattern = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[pattern->step_index]; - } while (step->symbol == symbol); - } - - // Update all of the in-progress states with current node. - for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - state->has_in_progress_alternatives = false; - copy_count = 0; - - // Check that the node matches all of the criteria for the next - // step of the pattern. - if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; - - // Determine if this node matches this step of the pattern, and also - // if this node can have later siblings that match this step of the - // pattern. - bool node_does_match = - step->symbol == symbol || - step->symbol == WILDCARD_SYMBOL || - (step->symbol == NAMED_WILDCARD_SYMBOL && is_named); - bool later_sibling_can_match = can_have_later_siblings; - if ((step->is_immediate && is_named) || state->seeking_immediate_match) { - later_sibling_can_match = false; - } - if (step->is_last_child && can_have_later_siblings) { - node_does_match = false; - } - if (step->field) { - if (step->field == field_id) { - if (!can_have_later_siblings_with_this_field) { - later_sibling_can_match = false; - } - } else { - node_does_match = false; - } - } - - // Remove states immediately if it is ever clear that they cannot match. - if (!node_does_match) { - if (!later_sibling_can_match) { - LOG( - " discard state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->states, i); - i--; - } - continue; - } - - // Some patterns can match their root node in multiple ways, capturing different - // children. If this pattern step could match later children within the same - // parent, then this query state cannot simply be updated in place. It must be - // split into two states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later siblings. - if (later_sibling_can_match && step->contains_captures) { - if (ts_query_cursor__copy_state(self, &state)) { - LOG( - " split state for capture. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - copy_count++; - } - } - - // If the current node is captured in this pattern, add it to the capture list. - if (step->capture_ids[0] != NONE) { - CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX); - if (!capture_list) { - array_erase(&self->states, i); - i--; - continue; - } - - for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) { - uint16_t capture_id = step->capture_ids[j]; - if (step->capture_ids[j] == NONE) break; - array_push(capture_list, ((TSQueryCapture) { node, capture_id })); - LOG( - " capture node. pattern:%u, capture_id:%u, capture_count:%u\n", - state->pattern_index, - capture_id, - capture_list->size - ); - } - } - - // Advance this state to the next step of its pattern. - state->step_index++; - state->seeking_immediate_match = false; - LOG( - " advance state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - - // If this state's next step has an alternative step, then copy the state in order - // to pursue both alternatives. The alternative step itself may have an alternative, - // so this is an interative process. - unsigned end_index = i + 1; - for (unsigned j = i; j < end_index; j++) { - QueryState *state = &self->states.contents[j]; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->alternative_index != NONE) { - if (next_step->is_dead_end) { - state->step_index = next_step->alternative_index; - j--; - continue; - } - - if (next_step->is_pass_through) { - state->step_index++; - j--; - } - - QueryState *copy = ts_query_cursor__copy_state(self, &state); - if (copy) { - LOG( - " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", - copy->pattern_index, - copy->step_index, - next_step->alternative_index, - next_step->alternative_is_immediate, - capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size - ); - end_index++; - copy_count++; - copy->step_index = next_step->alternative_index; - if (next_step->alternative_is_immediate) { - copy->seeking_immediate_match = true; - } - } - } - } - } - - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } - - // Enfore the longest-match criteria. When a query pattern contains optional or - // repeated nodes, this is necessary to avoid multiple redundant states, where - // one state has a strict subset of another state's captures. - bool did_remove = false; - for (unsigned j = i + 1; j < self->states.size; j++) { - QueryState *other_state = &self->states.contents[j]; - - // Query states are kept in ascending order of start_depth and pattern_index. - // Since the longest-match criteria is only used for deduping matches of the same - // pattern and root node, we only need to perform pairwise comparisons within a - // small slice of the states array. - if ( - other_state->start_depth != state->start_depth || - other_state->pattern_index != state->pattern_index - ) break; - - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; - } - other_state->has_in_progress_alternatives = true; - } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - i--; - did_remove = true; - break; - } - state->has_in_progress_alternatives = true; - } - } - - // If there the state is at the end of its pattern, remove it from the list - // of in-progress states and add it to the list of finished states. - if (!did_remove) { - LOG( - " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", - state->pattern_index, - state->start_depth, - state->step_index, - capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size - ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->depth == PATTERN_DONE_MARKER) { - if (state->has_in_progress_alternatives) { - LOG(" defer finishing pattern %u\n", state->pattern_index); - } else { - LOG(" finish pattern %u\n", state->pattern_index); - state->id = self->next_state_id++; - array_push(&self->finished_states, *state); - array_erase(&self->states, state - self->states.contents); - did_match = true; - i--; - } - } - } - } - - // Continue descending if possible. - if (ts_tree_cursor_goto_first_child(&self->cursor)) { - self->depth++; - } else { - self->ascending = true; - } - } - } -} - -bool ts_query_cursor_next_match( - TSQueryCursor *self, - TSQueryMatch *match -) { - if (self->finished_states.size == 0) { - if (!ts_query_cursor__advance(self)) { - return false; - } - } - - QueryState *state = &self->finished_states.contents[0]; - match->id = state->id; - match->pattern_index = state->pattern_index; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - match->captures = captures->contents; - match->capture_count = captures->size; - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->finished_states, 0); - return true; -} - -void ts_query_cursor_remove_match( - TSQueryCursor *self, - uint32_t match_id -) { - for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; - if (state->id == match_id) { - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->finished_states, i); - return; - } - } -} - -bool ts_query_cursor_next_capture( - TSQueryCursor *self, - TSQueryMatch *match, - uint32_t *capture_index -) { - for (;;) { - // The goal here is to return captures in order, even though they may not - // be discovered in order, because patterns can overlap. If there are any - // finished patterns, then try to find one that contains a capture that - // is *definitely* before any capture in an *unfinished* pattern. - if (self->finished_states.size > 0) { - // First, identify the position of the earliest capture in an unfinished - // match. For a finished capture to be returned, it must be *before* - // this position. - uint32_t first_unfinished_capture_byte; - uint32_t first_unfinished_pattern_index; - uint32_t first_unfinished_state_index; - ts_query_cursor__first_in_progress_capture( - self, - &first_unfinished_state_index, - &first_unfinished_capture_byte, - &first_unfinished_pattern_index - ); - - // Find the earliest capture in a finished match. - int first_finished_state_index = -1; - uint32_t first_finished_capture_byte = first_unfinished_capture_byte; - uint32_t first_finished_pattern_index = first_unfinished_pattern_index; - for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - if (captures->size > state->consumed_capture_count) { - uint32_t capture_byte = ts_node_start_byte( - captures->contents[state->consumed_capture_count].node - ); - if ( - capture_byte < first_finished_capture_byte || - ( - capture_byte == first_finished_capture_byte && - state->pattern_index < first_finished_pattern_index - ) - ) { - first_finished_state_index = i; - first_finished_capture_byte = capture_byte; - first_finished_pattern_index = state->pattern_index; - } - } else { - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->finished_states, i); - i--; - } - } - - // If there is finished capture that is clearly before any unfinished - // capture, then return its match, and its capture index. Internally - // record the fact that the capture has been 'consumed'. - if (first_finished_state_index != -1) { - QueryState *state = &self->finished_states.contents[ - first_finished_state_index - ]; - match->id = state->id; - match->pattern_index = state->pattern_index; - const CaptureList *captures = capture_list_pool_get( - &self->capture_list_pool, - state->capture_list_id - ); - match->captures = captures->contents; - match->capture_count = captures->size; - *capture_index = state->consumed_capture_count; - state->consumed_capture_count++; - return true; - } - - if (capture_list_pool_is_empty(&self->capture_list_pool)) { - LOG( - " abandon state. index:%u, pattern:%u, offset:%u.\n", - first_unfinished_state_index, - first_unfinished_pattern_index, - first_unfinished_capture_byte - ); - capture_list_pool_release( - &self->capture_list_pool, - self->states.contents[first_unfinished_state_index].capture_list_id - ); - array_erase(&self->states, first_unfinished_state_index); - } - } - - // If there are no finished matches that are ready to be returned, then - // continue finding more matches. - if ( - !ts_query_cursor__advance(self) && - self->finished_states.size == 0 - ) return false; - } -} - -#undef LOG |