diff options
Diffstat (limited to 'src/nvim/search.c')
-rw-r--r-- | src/nvim/search.c | 530 |
1 files changed, 530 insertions, 0 deletions
diff --git a/src/nvim/search.c b/src/nvim/search.c index 93180f97fe..682fa417a9 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -4764,6 +4764,536 @@ the_end: restore_last_search_pattern(); } +/// Fuzzy string matching +/// +/// Ported from the lib_fts library authored by Forrest Smith. +/// https://github.com/forrestthewoods/lib_fts/tree/master/code +/// +/// The following blog describes the fuzzy matching algorithm: +/// https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/ +/// +/// Each matching string is assigned a score. The following factors are checked: +/// - Matched letter +/// - Unmatched letter +/// - Consecutively matched letters +/// - Proximity to start +/// - Letter following a separator (space, underscore) +/// - Uppercase letter following lowercase (aka CamelCase) +/// +/// Matched letters are good. Unmatched letters are bad. Matching near the start +/// is good. Matching the first letter in the middle of a phrase is good. +/// Matching the uppercase letters in camel case entries is good. +/// +/// The score assigned for each factor is explained below. +/// File paths are different from file names. File extensions may be ignorable. +/// Single words care about consecutive matches but not separators or camel +/// case. +/// Score starts at 100 +/// Matched letter: +0 points +/// Unmatched letter: -1 point +/// Consecutive match bonus: +15 points +/// First letter bonus: +15 points +/// Separator bonus: +30 points +/// Camel case bonus: +30 points +/// Unmatched leading letter: -5 points (max: -15) +/// +/// There is some nuance to this. Scores don’t have an intrinsic meaning. The +/// score range isn’t 0 to 100. It’s roughly [50, 150]. Longer words have a +/// lower minimum score due to unmatched letter penalty. Longer search patterns +/// have a higher maximum score due to match bonuses. +/// +/// Separator and camel case bonus is worth a LOT. Consecutive matches are worth +/// quite a bit. +/// +/// There is a penalty if you DON’T match the first three letters. Which +/// effectively rewards matching near the start. However there’s no difference +/// in matching between the middle and end. +/// +/// There is not an explicit bonus for an exact match. Unmatched letters receive +/// a penalty. So shorter strings and closer matches are worth more. +typedef struct { + int idx; ///< used for stable sort + listitem_T *item; + int score; + list_T *lmatchpos; +} fuzzyItem_T; + +/// bonus for adjacent matches; this is higher than SEPARATOR_BONUS so that +/// matching a whole word is preferred. +#define SEQUENTIAL_BONUS 40 +/// bonus if match occurs after a path separator +#define PATH_SEPARATOR_BONUS 30 +/// bonus if match occurs after a word separator +#define WORD_SEPARATOR_BONUS 25 +/// bonus if match is uppercase and prev is lower +#define CAMEL_BONUS 30 +/// bonus if the first letter is matched +#define FIRST_LETTER_BONUS 15 +/// penalty applied for every letter in str before the first match +#define LEADING_LETTER_PENALTY -5 +/// maximum penalty for leading letters +#define MAX_LEADING_LETTER_PENALTY -15 +/// penalty for every letter that doesn't match +#define UNMATCHED_LETTER_PENALTY -1 +/// penalty for gap in matching positions (-2 * k) +#define GAP_PENALTY -2 +/// Score for a string that doesn't fuzzy match the pattern +#define SCORE_NONE -9999 + +#define FUZZY_MATCH_RECURSION_LIMIT 10 + +/// Compute a score for a fuzzy matched string. The matching character locations +/// are in 'matches'. +static int fuzzy_match_compute_score(const char_u *const str, const int strSz, + const uint32_t *const matches, const int numMatches) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE +{ + assert(numMatches > 0); // suppress clang "result of operation is garbage" + // Initialize score + int score = 100; + + // Apply leading letter penalty + int penalty = LEADING_LETTER_PENALTY * matches[0]; + if (penalty < MAX_LEADING_LETTER_PENALTY) { + penalty = MAX_LEADING_LETTER_PENALTY; + } + score += penalty; + + // Apply unmatched penalty + const int unmatched = strSz - numMatches; + score += UNMATCHED_LETTER_PENALTY * unmatched; + + // Apply ordering bonuses + for (int i = 0; i < numMatches; i++) { + const uint32_t currIdx = matches[i]; + + if (i > 0) { + const uint32_t prevIdx = matches[i - 1]; + + // Sequential + if (currIdx == prevIdx + 1) { + score += SEQUENTIAL_BONUS; + } else { + score += GAP_PENALTY * (currIdx - prevIdx); + } + } + + // Check for bonuses based on neighbor character value + if (currIdx > 0) { + // Camel case + const char_u *p = str; + int neighbor; + + for (uint32_t sidx = 0; sidx < currIdx; sidx++) { + neighbor = utf_ptr2char(p); + MB_PTR_ADV(p); + } + const int curr = utf_ptr2char(p); + + if (mb_islower(neighbor) && mb_isupper(curr)) { + score += CAMEL_BONUS; + } + + // Bonus if the match follows a separator character + if (neighbor == '/' || neighbor == '\\') { + score += PATH_SEPARATOR_BONUS; + } else if (neighbor == ' ' || neighbor == '_') { + score += WORD_SEPARATOR_BONUS; + } + } else { + // First letter + score += FIRST_LETTER_BONUS; + } + } + return score; +} + +/// Perform a recursive search for fuzzy matching 'fuzpat' in 'str'. +/// @return the number of matching characters. +static int fuzzy_match_recursive(const char_u *fuzpat, const char_u *str, uint32_t strIdx, + int *const outScore, const char_u *const strBegin, + const int strLen, const uint32_t *const srcMatches, + uint32_t *const matches, const int maxMatches, int nextMatch, + int *const recursionCount) + FUNC_ATTR_NONNULL_ARG(1, 2, 4, 5, 8, 11) FUNC_ATTR_WARN_UNUSED_RESULT +{ + // Recursion params + bool recursiveMatch = false; + uint32_t bestRecursiveMatches[MAX_FUZZY_MATCHES]; + int bestRecursiveScore = 0; + + // Count recursions + (*recursionCount)++; + if (*recursionCount >= FUZZY_MATCH_RECURSION_LIMIT) { + return 0; + } + + // Detect end of strings + if (*fuzpat == NUL || *str == NUL) { + return 0; + } + + // Loop through fuzpat and str looking for a match + bool first_match = true; + while (*fuzpat != NUL && *str != NUL) { + const int c1 = utf_ptr2char(fuzpat); + const int c2 = utf_ptr2char(str); + + // Found match + if (mb_tolower(c1) == mb_tolower(c2)) { + // Supplied matches buffer was too short + if (nextMatch >= maxMatches) { + return 0; + } + + // "Copy-on-Write" srcMatches into matches + if (first_match && srcMatches != NULL) { + memcpy(matches, srcMatches, nextMatch * sizeof(srcMatches[0])); + first_match = false; + } + + // Recursive call that "skips" this match + uint32_t recursiveMatches[MAX_FUZZY_MATCHES]; + int recursiveScore = 0; + const char_u *const next_char = str + utfc_ptr2len(str); + if (fuzzy_match_recursive(fuzpat, next_char, strIdx + 1, &recursiveScore, strBegin, strLen, + matches, recursiveMatches, + sizeof(recursiveMatches) / sizeof(recursiveMatches[0]), nextMatch, + recursionCount)) { + // Pick best recursive score + if (!recursiveMatch || recursiveScore > bestRecursiveScore) { + memcpy(bestRecursiveMatches, recursiveMatches, + MAX_FUZZY_MATCHES * sizeof(recursiveMatches[0])); + bestRecursiveScore = recursiveScore; + } + recursiveMatch = true; + } + + // Advance + matches[nextMatch++] = strIdx; + MB_PTR_ADV(fuzpat); + } + MB_PTR_ADV(str); + strIdx++; + } + + // Determine if full fuzpat was matched + const bool matched = *fuzpat == NUL; + + // Calculate score + if (matched) { + *outScore = fuzzy_match_compute_score(strBegin, strLen, matches, nextMatch); + } + + // Return best result + if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) { + // Recursive score is better than "this" + memcpy(matches, bestRecursiveMatches, maxMatches * sizeof(matches[0])); + *outScore = bestRecursiveScore; + return nextMatch; + } else if (matched) { + return nextMatch; // "this" score is better than recursive + } + + return 0; // no match +} + +/// fuzzy_match() +/// +/// Performs exhaustive search via recursion to find all possible matches and +/// match with highest score. +/// Scores values have no intrinsic meaning. Possible score range is not +/// normalized and varies with pattern. +/// Recursion is limited internally (default=10) to prevent degenerate cases +/// (pat_arg="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"). +/// Uses char_u for match indices. Therefore patterns are limited to +/// MAX_FUZZY_MATCHES characters. +/// +/// @return true if 'pat_arg' matches 'str'. Also returns the match score in +/// 'outScore' and the matching character positions in 'matches'. +bool fuzzy_match(char_u *const str, const char_u *const pat_arg, const bool matchseq, + int *const outScore, uint32_t *const matches, const int maxMatches) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT +{ + const int len = mb_charlen(str); + bool complete = false; + int numMatches = 0; + + *outScore = 0; + + char_u *const save_pat = vim_strsave(pat_arg); + char_u *pat = save_pat; + char_u *p = pat; + + // Try matching each word in 'pat_arg' in 'str' + while (true) { + if (matchseq) { + complete = true; + } else { + // Extract one word from the pattern (separated by space) + p = skipwhite(p); + if (*p == NUL) { + break; + } + pat = p; + while (*p != NUL && !ascii_iswhite(utf_ptr2char(p))) { + MB_PTR_ADV(p); + } + if (*p == NUL) { // processed all the words + complete = true; + } + *p = NUL; + } + + int score = 0; + int recursionCount = 0; + const int matchCount + = fuzzy_match_recursive(pat, str, 0, &score, str, len, NULL, matches + numMatches, + maxMatches - numMatches, 0, &recursionCount); + if (matchCount == 0) { + numMatches = 0; + break; + } + + // Accumulate the match score and the number of matches + *outScore += score; + numMatches += matchCount; + + if (complete) { + break; + } + + // try matching the next word + p++; + } + + xfree(save_pat); + return numMatches != 0; +} + +/// Sort the fuzzy matches in the descending order of the match score. +/// For items with same score, retain the order using the index (stable sort) +static int fuzzy_match_item_compare(const void *const s1, const void *const s2) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE +{ + const int v1 = ((const fuzzyItem_T *)s1)->score; + const int v2 = ((const fuzzyItem_T *)s2)->score; + const int idx1 = ((const fuzzyItem_T *)s1)->idx; + const int idx2 = ((const fuzzyItem_T *)s2)->idx; + + return v1 == v2 ? (idx1 - idx2) : v1 > v2 ? -1 : 1; +} + +/// Fuzzy search the string 'str' in a list of 'items' and return the matching +/// strings in 'fmatchlist'. +/// If 'matchseq' is true, then for multi-word search strings, match all the +/// words in sequence. +/// If 'items' is a list of strings, then search for 'str' in the list. +/// If 'items' is a list of dicts, then either use 'key' to lookup the string +/// for each item or use 'item_cb' Funcref function to get the string. +/// If 'retmatchpos' is true, then return a list of positions where 'str' +/// matches for each item. +static void fuzzy_match_in_list(list_T *const items, char_u *const str, const bool matchseq, + const char_u *const key, Callback *const item_cb, + const bool retmatchpos, list_T *const fmatchlist) + FUNC_ATTR_NONNULL_ARG(2, 5, 7) +{ + const long len = tv_list_len(items); + if (len == 0) { + return; + } + + fuzzyItem_T *const ptrs = xcalloc(len, sizeof(fuzzyItem_T)); + long i = 0; + bool found_match = false; + uint32_t matches[MAX_FUZZY_MATCHES]; + + // For all the string items in items, get the fuzzy matching score + TV_LIST_ITER(items, li, { + ptrs[i].idx = i; + ptrs[i].item = li; + ptrs[i].score = SCORE_NONE; + char_u *itemstr = NULL; + typval_T rettv; + rettv.v_type = VAR_UNKNOWN; + const typval_T *const tv = TV_LIST_ITEM_TV(li); + if (tv->v_type == VAR_STRING) { // list of strings + itemstr = tv->vval.v_string; + } else if (tv->v_type == VAR_DICT && (key != NULL || item_cb->type != kCallbackNone)) { + // For a dict, either use the specified key to lookup the string or + // use the specified callback function to get the string. + if (key != NULL) { + itemstr = (char_u *)tv_dict_get_string(tv->vval.v_dict, (const char *)key, false); + } else { + typval_T argv[2]; + + // Invoke the supplied callback (if any) to get the dict item + tv->vval.v_dict->dv_refcount++; + argv[0].v_type = VAR_DICT; + argv[0].vval.v_dict = tv->vval.v_dict; + argv[1].v_type = VAR_UNKNOWN; + if (callback_call(item_cb, 1, argv, &rettv)) { + if (rettv.v_type == VAR_STRING) { + itemstr = rettv.vval.v_string; + } + } + tv_dict_unref(tv->vval.v_dict); + } + } + + int score; + if (itemstr != NULL && fuzzy_match(itemstr, str, matchseq, &score, matches, + sizeof(matches) / sizeof(matches[0]))) { + // Copy the list of matching positions in itemstr to a list, if + // 'retmatchpos' is set. + if (retmatchpos) { + ptrs[i].lmatchpos = tv_list_alloc(kListLenMayKnow); + int j = 0; + const char_u *p = str; + while (*p != NUL) { + if (!ascii_iswhite(utf_ptr2char(p))) { + tv_list_append_number(ptrs[i].lmatchpos, matches[j]); + j++; + } + MB_PTR_ADV(p); + } + } + ptrs[i].score = score; + found_match = true; + } + i++; + tv_clear(&rettv); + }); + + if (found_match) { + // Sort the list by the descending order of the match score + qsort(ptrs, len, sizeof(fuzzyItem_T), fuzzy_match_item_compare); + + // For matchfuzzy(), return a list of matched strings. + // ['str1', 'str2', 'str3'] + // For matchfuzzypos(), return a list with three items. + // The first item is a list of matched strings. The second item + // is a list of lists where each list item is a list of matched + // character positions. The third item is a list of matching scores. + // [['str1', 'str2', 'str3'], [[1, 3], [1, 3], [1, 3]]] + list_T *l; + if (retmatchpos) { + const listitem_T *const li = tv_list_find(fmatchlist, 0); + assert(li != NULL && TV_LIST_ITEM_TV(li)->vval.v_list != NULL); + l = TV_LIST_ITEM_TV(li)->vval.v_list; + } else { + l = fmatchlist; + } + + // Copy the matching strings with a valid score to the return list + for (i = 0; i < len; i++) { + if (ptrs[i].score == SCORE_NONE) { + break; + } + tv_list_append_tv(l, TV_LIST_ITEM_TV(ptrs[i].item)); + } + + // next copy the list of matching positions + if (retmatchpos) { + const listitem_T *li = tv_list_find(fmatchlist, -2); + assert(li != NULL && TV_LIST_ITEM_TV(li)->vval.v_list != NULL); + l = TV_LIST_ITEM_TV(li)->vval.v_list; + for (i = 0; i < len; i++) { + if (ptrs[i].score == SCORE_NONE) { + break; + } + tv_list_append_list(l, ptrs[i].lmatchpos); + } + + // copy the matching scores + li = tv_list_find(fmatchlist, -1); + assert(li != NULL && TV_LIST_ITEM_TV(li)->vval.v_list != NULL); + l = TV_LIST_ITEM_TV(li)->vval.v_list; + for (i = 0; i < len; i++) { + if (ptrs[i].score == SCORE_NONE) { + break; + } + tv_list_append_number(l, ptrs[i].score); + } + } + } + xfree(ptrs); +} + +/// Do fuzzy matching. Returns the list of matched strings in 'rettv'. +/// If 'retmatchpos' is true, also returns the matching character positions. +static void do_fuzzymatch(const typval_T *const argvars, typval_T *const rettv, + const bool retmatchpos) + FUNC_ATTR_NONNULL_ALL +{ + // validate and get the arguments + if (argvars[0].v_type != VAR_LIST || argvars[0].vval.v_list == NULL) { + semsg(_(e_listarg), retmatchpos ? "matchfuzzypos()" : "matchfuzzy()"); + return; + } + if (argvars[1].v_type != VAR_STRING || argvars[1].vval.v_string == NULL) { + semsg(_(e_invarg2), tv_get_string(&argvars[1])); + return; + } + + Callback cb = CALLBACK_NONE; + const char_u *key = NULL; + bool matchseq = false; + if (argvars[2].v_type != VAR_UNKNOWN) { + if (argvars[2].v_type != VAR_DICT || argvars[2].vval.v_dict == NULL) { + emsg(_(e_dictreq)); + return; + } + + // To search a dict, either a callback function or a key can be + // specified. + dict_T *const d = argvars[2].vval.v_dict; + const dictitem_T *const di = tv_dict_find(d, "key", -1); + if (di != NULL) { + if (di->di_tv.v_type != VAR_STRING || di->di_tv.vval.v_string == NULL + || *di->di_tv.vval.v_string == NUL) { + semsg(_(e_invarg2), tv_get_string(&di->di_tv)); + return; + } + key = (const char_u *)tv_get_string(&di->di_tv); + } else if (!tv_dict_get_callback(d, "text_cb", -1, &cb)) { + semsg(_(e_invargval), "text_cb"); + return; + } + if (tv_dict_find(d, "matchseq", -1) != NULL) { + matchseq = true; + } + } + + // get the fuzzy matches + tv_list_alloc_ret(rettv, retmatchpos ? 3 : kListLenUnknown); + if (retmatchpos) { + // For matchfuzzypos(), a list with three items are returned. First + // item is a list of matching strings, the second item is a list of + // lists with matching positions within each string and the third item + // is the list of scores of the matches. + tv_list_append_list(rettv->vval.v_list, tv_list_alloc(kListLenUnknown)); + tv_list_append_list(rettv->vval.v_list, tv_list_alloc(kListLenUnknown)); + tv_list_append_list(rettv->vval.v_list, tv_list_alloc(kListLenUnknown)); + } + + fuzzy_match_in_list(argvars[0].vval.v_list, (char_u *)tv_get_string(&argvars[1]), matchseq, key, + &cb, retmatchpos, rettv->vval.v_list); + callback_free(&cb); +} + +/// "matchfuzzy()" function +void f_matchfuzzy(typval_T *argvars, typval_T *rettv, FunPtr fptr) +{ + do_fuzzymatch(argvars, rettv, false); +} + +/// "matchfuzzypos()" function +void f_matchfuzzypos(typval_T *argvars, typval_T *rettv, FunPtr fptr) +{ + do_fuzzymatch(argvars, rettv, true); +} + /// Find identifiers or defines in included files. /// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase. /// |