diff options
-rw-r--r-- | runtime/doc/builtin.txt | 22 | ||||
-rw-r--r-- | runtime/doc/usr_41.txt | 1 | ||||
-rw-r--r-- | src/nvim/eval.lua | 1 | ||||
-rw-r--r-- | src/nvim/search.c | 298 | ||||
-rw-r--r-- | src/nvim/testdir/test_functions.vim | 26 |
5 files changed, 348 insertions, 0 deletions
diff --git a/runtime/doc/builtin.txt b/runtime/doc/builtin.txt index 56bc8bfb3e..198ef25ed9 100644 --- a/runtime/doc/builtin.txt +++ b/runtime/doc/builtin.txt @@ -296,6 +296,7 @@ matcharg({nr}) List arguments of |:match| matchdelete({id} [, {win}]) Number delete match identified by {id} matchend({expr}, {pat}[, {start}[, {count}]]) Number position where {pat} ends in {expr} +matchfuzzy({list}, {str}) List fuzzy match {str} in {list} matchlist({expr}, {pat}[, {start}[, {count}]]) List match and submatches of {pat} in {expr} matchstr({expr}, {pat}[, {start}[, {count}]]) @@ -4857,6 +4858,27 @@ matchend({expr}, {pat} [, {start} [, {count}]]) *matchend()* Can also be used as a |method|: > GetText()->matchend('word') +matchfuzzy({list}, {str}) *matchfuzzy()* + Returns a list with all the strings in {list} that fuzzy + match {str}. The strings in the returned list are sorted + based on the matching score. {str} is treated as a literal + string and regular expression matching is NOT supported. + The maximum supported {str} length is 256. + + If there are no matching strings or there is an error, then an + empty list is returned. If length of {str} is greater than + 256, then returns an empty list. + + Example: > + :echo matchfuzzy(["clay", "crow"], "cay") +< results in ["clay"]. > + :echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl") +< results in a list of buffer names fuzzy matching "ndl". > + :echo v:oldfiles->matchfuzzy("test") +< results in a list of file names fuzzy matching "test". > + :let l = readfile("buffer.c")->matchfuzzy("str") +< results in a list of lines in "buffer.c" fuzzy matching "str". + matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()* Same as |match()|, but return a |List|. The first item in the list is the matched string, same as what matchstr() would diff --git a/runtime/doc/usr_41.txt b/runtime/doc/usr_41.txt index bf29c94d51..19f58cddd6 100644 --- a/runtime/doc/usr_41.txt +++ b/runtime/doc/usr_41.txt @@ -608,6 +608,7 @@ String manipulation: *string-functions* toupper() turn a string to uppercase match() position where a pattern matches in a string matchend() position where a pattern match ends in a string + matchfuzzy() fuzzy matches a string in a list of strings matchstr() match of a pattern in a string matchstrpos() match and positions of a pattern in a string matchlist() like matchstr() and also return submatches diff --git a/src/nvim/eval.lua b/src/nvim/eval.lua index eedc8ac45d..37b051222e 100644 --- a/src/nvim/eval.lua +++ b/src/nvim/eval.lua @@ -249,6 +249,7 @@ return { matcharg={args=1, base=1}, matchdelete={args={1, 2}, base=1}, matchend={args={2, 4}, base=1}, + matchfuzzy={args=2, base=1}, matchlist={args={2, 4}, base=1}, matchstr={args={2, 4}, base=1}, matchstrpos={args={2,4}, base=1}, diff --git a/src/nvim/search.c b/src/nvim/search.c index 93180f97fe..7bc994fad3 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -4764,6 +4764,304 @@ the_end: restore_last_search_pattern(); } +/// Fuzzy string matching +/// +/// Ported from the lib_fts library authored by Forrest Smith. +/// https://github.com/forrestthewoods/lib_fts/tree/master/code +/// +/// Blog describing the algorithm: +/// https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/ +/// +/// Each matching string is assigned a score. The following factors are checked: +/// Matched letter +/// Unmatched letter +/// Consecutively matched letters +/// Proximity to start +/// Letter following a separator (space, underscore) +/// Uppercase letter following lowercase (aka CamelCase) +/// +/// Matched letters are good. Unmatched letters are bad. Matching near the start +/// is good. Matching the first letter in the middle of a phrase is good. +/// Matching the uppercase letters in camel case entries is good. +/// +/// The score assigned for each factor is explained below. +/// File paths are different from file names. File extensions may be ignorable. +/// Single words care about consecutive matches but not separators or camel +/// case. +/// Score starts at 0 +/// Matched letter: +0 points +/// Unmatched letter: -1 point +/// Consecutive match bonus: +5 points +/// Separator bonus: +10 points +/// Camel case bonus: +10 points +/// Unmatched leading letter: -3 points (max: -9) +/// +/// There is some nuance to this. Scores don’t have an intrinsic meaning. The +/// score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a +/// lower minimum score due to unmatched letter penalty. Longer search patterns +/// have a higher maximum score due to match bonuses. +/// +/// Separator and camel case bonus is worth a LOT. Consecutive matches are worth +/// quite a bit. +/// +/// There is a penalty if you DON’T match the first three letters. Which +/// effectively rewards matching near the start. However there’s no difference +/// in matching between the middle and end. +/// +/// There is not an explicit bonus for an exact match. Unmatched letters receive +/// a penalty. So shorter strings and closer matches are worth more. +typedef struct { + const listitem_T *item; + int score; +} fuzzyItem_T; + +static bool fuzzy_match_recursive(const char_u *fuzpat, const char_u *str, int *const outScore, + const char_u *const strBegin, const char_u *const srcMatches, + char_u *const matches, const int maxMatches, int nextMatch, + int *const recursionCount, const int recursionLimit) + FUNC_ATTR_NONNULL_ARG(1, 2, 3, 4, 6, 9) FUNC_ATTR_WARN_UNUSED_RESULT +{ + // Recursion params + bool recursiveMatch = false; + char_u bestRecursiveMatches[256]; + int bestRecursiveScore = 0; + + // Count recursions + (*recursionCount)++; + if (*recursionCount >= recursionLimit) { + return false; + } + + // Detect end of strings + if (*fuzpat == '\0' || *str == '\0') { + return false; + } + + // Loop through fuzpat and str looking for a match + bool first_match = true; + while (*fuzpat != '\0' && *str != '\0') { + // Found match + if (mb_tolower(*fuzpat) == mb_tolower(*str)) { + // Supplied matches buffer was too short + if (nextMatch >= maxMatches) { + return false; + } + + // "Copy-on-Write" srcMatches into matches + if (first_match && srcMatches) { + memcpy(matches, srcMatches, nextMatch); + first_match = false; + } + + // Recursive call that "skips" this match + char_u recursiveMatches[256]; + int recursiveScore = 0; + if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, strBegin, matches, + recursiveMatches, sizeof(recursiveMatches), nextMatch, + recursionCount, recursionLimit)) { + // Pick best recursive score + if (!recursiveMatch || recursiveScore > bestRecursiveScore) { + memcpy(bestRecursiveMatches, recursiveMatches, 256); + bestRecursiveScore = recursiveScore; + } + recursiveMatch = true; + } + + // Advance + matches[nextMatch++] = (char_u)(str - strBegin); + fuzpat++; + } + str++; + } + + // Determine if full fuzpat was matched + const bool matched = *fuzpat == '\0'; + + // Calculate score + if (matched) { + // bonus for adjacent matches + const int sequential_bonus = 15; + // bonus if match occurs after a separator + const int separator_bonus = 30; + // bonus if match is uppercase and prev is lower + const int camel_bonus = 30; + // bonus if the first letter is matched + const int first_letter_bonus = 15; + // penalty applied for every letter in str before the first match + const int leading_letter_penalty = -5; + // maximum penalty for leading letters + const int max_leading_letter_penalty = -15; + // penalty for every letter that doesn't matter + const int unmatched_letter_penalty = -1; + + // Iterate str to end + while (*str != '\0') { + str++; + } + + // Initialize score + *outScore = 100; + + // Apply leading letter penalty + int penalty = leading_letter_penalty * matches[0]; + if (penalty < max_leading_letter_penalty) { + penalty = max_leading_letter_penalty; + } + *outScore += penalty; + + // Apply unmatched penalty + const int unmatched = (int)(str - strBegin) - nextMatch; + *outScore += unmatched_letter_penalty * unmatched; + + // Apply ordering bonuses + for (int i = 0; i < nextMatch; i++) { + const char_u currIdx = matches[i]; + + if (i > 0) { + const char_u prevIdx = matches[i - 1]; + + // Sequential + if (currIdx == (prevIdx + 1)) { + *outScore += sequential_bonus; + } + } + + // Check for bonuses based on neighbor character value + if (currIdx > 0) { + // Camel case + const char_u neighbor = strBegin[currIdx - 1]; + const char_u curr = strBegin[currIdx]; + + if (islower(neighbor) && isupper(curr)) { + *outScore += camel_bonus; + } + + // Separator + const bool neighborSeparator = neighbor == '_' || neighbor == ' '; + if (neighborSeparator) { + *outScore += separator_bonus; + } + } else { + // First letter + *outScore += first_letter_bonus; + } + } + } + + // Return best result + if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) { + // Recursive score is better than "this" + memcpy(matches, bestRecursiveMatches, maxMatches); + *outScore = bestRecursiveScore; + return true; + } else if (matched) { + return true; // "this" score is better than recursive + } + + return false; // no match +} + +/// fuzzy_match() +/// +/// Performs exhaustive search via recursion to find all possible matches and +/// match with highest score. +/// Scores values have no intrinsic meaning. Possible score range is not +/// normalized and varies with pattern. +/// Recursion is limited internally (default=10) to prevent degenerate cases +/// (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"). +/// Uses char_u for match indices. Therefore patterns are limited to 256 +/// characters. +/// +/// Returns true if fuzpat is found AND calculates a score. +static bool fuzzy_match(const char_u *const str, const char_u *const fuzpat, int *const outScore) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT +{ + char_u matches[256]; + int recursionCount = 0; + int recursionLimit = 10; + + *outScore = 0; + + return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, sizeof(matches), 0, + &recursionCount, recursionLimit); +} + +/// Sort the fuzzy matches in the descending order of the match score. +static int fuzzy_item_compare(const void *const s1, const void *const s2) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE +{ + const int v1 = ((const fuzzyItem_T *)s1)->score; + const int v2 = ((const fuzzyItem_T *)s2)->score; + + return v1 == v2 ? 0 : v1 > v2 ? -1 : 1; +} + +/// Fuzzy search the string 'str' in 'strlist' and return the matching strings +/// in 'fmatchlist'. +static void match_fuzzy(const list_T *const strlist, const char_u *const str, + list_T *const fmatchlist) + FUNC_ATTR_NONNULL_ARG(2, 3) +{ + const long len = tv_list_len(strlist); + if (len == 0) { + return; + } + + fuzzyItem_T *const ptrs = xmalloc(sizeof(fuzzyItem_T) * len); + long i = 0; + bool found_match = false; + + // For all the string items in strlist, get the fuzzy matching score + TV_LIST_ITER_CONST(strlist, li, { + ptrs[i].item = li; + ptrs[i].score = -9999; + // ignore non-string items in the list + const typval_T *const tv = TV_LIST_ITEM_TV(li); + if (tv->v_type == VAR_STRING && tv->vval.v_string != NULL) { + int score; + if (fuzzy_match(tv->vval.v_string, str, &score)) { + ptrs[i].score = score; + found_match = true; + } + } + i++; + }); + + if (found_match) { + // Sort the list by the descending order of the match score + qsort(ptrs, (size_t)len, sizeof(fuzzyItem_T), fuzzy_item_compare); + + // Copy the matching strings with 'score != -9999' to the return list + for (i = 0; i < len; i++) { + if (ptrs[i].score == -9999) { + break; + } + const typval_T *const tv = TV_LIST_ITEM_TV(ptrs[i].item); + tv_list_append_string(fmatchlist, (const char *)tv->vval.v_string, -1); + } + } + + xfree(ptrs); +} + +/// "matchfuzzy()" function +void f_matchfuzzy(typval_T *argvars, typval_T *rettv, FunPtr fptr) +{ + if (argvars[0].v_type != VAR_LIST) { + emsg(_(e_listreq)); + return; + } + if (argvars[0].vval.v_list == NULL) { + return; + } + if (argvars[1].v_type != VAR_STRING || argvars[1].vval.v_string == NULL) { + semsg(_(e_invarg2), tv_get_string(&argvars[1])); + return; + } + match_fuzzy(argvars[0].vval.v_list, (const char_u *)tv_get_string(&argvars[1]), + tv_list_alloc_ret(rettv, kListLenUnknown)); +} + /// Find identifiers or defines in included files. /// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase. /// diff --git a/src/nvim/testdir/test_functions.vim b/src/nvim/testdir/test_functions.vim index 438bed51c6..57ae0d3020 100644 --- a/src/nvim/testdir/test_functions.vim +++ b/src/nvim/testdir/test_functions.vim @@ -1731,6 +1731,32 @@ func Test_nr2char() call assert_equal("\x80\xfc\b\xfd\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX", eval('"\<M-' .. nr2char(0x40000000) .. '>"')) endfunc +" Test for matchfuzzy() +func Test_matchfuzzy() + call assert_fails('call matchfuzzy(10, "abc")', 'E714:') + " Needs v8.2.1183. + " call assert_fails('call matchfuzzy(["abc"], [])', 'E730:') + call assert_fails('call matchfuzzy(["abc"], [])', 'E475:') + call assert_equal([], matchfuzzy([], 'abc')) + call assert_equal([], matchfuzzy(['abc'], '')) + call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac')) + call assert_equal([], matchfuzzy([10, 20], 'ac')) + call assert_equal(['abc'], matchfuzzy(['abc'], 'abc')) + call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra')) + call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa')) + call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one')) + call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo')) + call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo')) + call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa')) + call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257))) + + %bw! + eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)}) + let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl') + call assert_equal(1, len(l)) + call assert_match('needle', l[0]) +endfunc + " Test for getcurpos() and setpos() func Test_getcurpos_setpos() new |