diff options
author | Sean Dewar <seandewar@users.noreply.github.com> | 2022-01-01 14:58:32 +0000 |
---|---|---|
committer | Sean Dewar <seandewar@users.noreply.github.com> | 2022-02-07 17:19:59 +0000 |
commit | fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68 (patch) | |
tree | bd246b1bf1f92178a9e088f390782cbc7853e4f5 | |
parent | f02a5a7bdaafc1c5ff61aee133eb2b6ba5f57586 (diff) | |
download | rneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.tar.gz rneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.tar.bz2 rneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.zip |
vim-patch:8.2.1665: cannot do fuzzy string matching
Problem: Cannot do fuzzy string matching.
Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes vim/vim#6932)
https://github.com/vim/vim/commit/635414dd2f3ae7d4d972d79b806348a6516cb91a
Adjust Test_matchfuzzy's 2nd assert to expect the last error thrown, as
v8.2.1183 hasn't been ported yet (to be honest, the error message is kinda weird
if the 2nd argument is not convertible to string). We can still port this fully
as porting v8.2.1183 would require removing this change to pass CI.
-rw-r--r-- | runtime/doc/builtin.txt | 22 | ||||
-rw-r--r-- | runtime/doc/usr_41.txt | 1 | ||||
-rw-r--r-- | src/nvim/eval.lua | 1 | ||||
-rw-r--r-- | src/nvim/search.c | 298 | ||||
-rw-r--r-- | src/nvim/testdir/test_functions.vim | 26 |
5 files changed, 348 insertions, 0 deletions
diff --git a/runtime/doc/builtin.txt b/runtime/doc/builtin.txt index 56bc8bfb3e..198ef25ed9 100644 --- a/runtime/doc/builtin.txt +++ b/runtime/doc/builtin.txt @@ -296,6 +296,7 @@ matcharg({nr}) List arguments of |:match| matchdelete({id} [, {win}]) Number delete match identified by {id} matchend({expr}, {pat}[, {start}[, {count}]]) Number position where {pat} ends in {expr} +matchfuzzy({list}, {str}) List fuzzy match {str} in {list} matchlist({expr}, {pat}[, {start}[, {count}]]) List match and submatches of {pat} in {expr} matchstr({expr}, {pat}[, {start}[, {count}]]) @@ -4857,6 +4858,27 @@ matchend({expr}, {pat} [, {start} [, {count}]]) *matchend()* Can also be used as a |method|: > GetText()->matchend('word') +matchfuzzy({list}, {str}) *matchfuzzy()* + Returns a list with all the strings in {list} that fuzzy + match {str}. The strings in the returned list are sorted + based on the matching score. {str} is treated as a literal + string and regular expression matching is NOT supported. + The maximum supported {str} length is 256. + + If there are no matching strings or there is an error, then an + empty list is returned. If length of {str} is greater than + 256, then returns an empty list. + + Example: > + :echo matchfuzzy(["clay", "crow"], "cay") +< results in ["clay"]. > + :echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl") +< results in a list of buffer names fuzzy matching "ndl". > + :echo v:oldfiles->matchfuzzy("test") +< results in a list of file names fuzzy matching "test". > + :let l = readfile("buffer.c")->matchfuzzy("str") +< results in a list of lines in "buffer.c" fuzzy matching "str". + matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()* Same as |match()|, but return a |List|. The first item in the list is the matched string, same as what matchstr() would diff --git a/runtime/doc/usr_41.txt b/runtime/doc/usr_41.txt index bf29c94d51..19f58cddd6 100644 --- a/runtime/doc/usr_41.txt +++ b/runtime/doc/usr_41.txt @@ -608,6 +608,7 @@ String manipulation: *string-functions* toupper() turn a string to uppercase match() position where a pattern matches in a string matchend() position where a pattern match ends in a string + matchfuzzy() fuzzy matches a string in a list of strings matchstr() match of a pattern in a string matchstrpos() match and positions of a pattern in a string matchlist() like matchstr() and also return submatches diff --git a/src/nvim/eval.lua b/src/nvim/eval.lua index eedc8ac45d..37b051222e 100644 --- a/src/nvim/eval.lua +++ b/src/nvim/eval.lua @@ -249,6 +249,7 @@ return { matcharg={args=1, base=1}, matchdelete={args={1, 2}, base=1}, matchend={args={2, 4}, base=1}, + matchfuzzy={args=2, base=1}, matchlist={args={2, 4}, base=1}, matchstr={args={2, 4}, base=1}, matchstrpos={args={2,4}, base=1}, diff --git a/src/nvim/search.c b/src/nvim/search.c index 93180f97fe..7bc994fad3 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -4764,6 +4764,304 @@ the_end: restore_last_search_pattern(); } +/// Fuzzy string matching +/// +/// Ported from the lib_fts library authored by Forrest Smith. +/// https://github.com/forrestthewoods/lib_fts/tree/master/code +/// +/// Blog describing the algorithm: +/// https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/ +/// +/// Each matching string is assigned a score. The following factors are checked: +/// Matched letter +/// Unmatched letter +/// Consecutively matched letters +/// Proximity to start +/// Letter following a separator (space, underscore) +/// Uppercase letter following lowercase (aka CamelCase) +/// +/// Matched letters are good. Unmatched letters are bad. Matching near the start +/// is good. Matching the first letter in the middle of a phrase is good. +/// Matching the uppercase letters in camel case entries is good. +/// +/// The score assigned for each factor is explained below. +/// File paths are different from file names. File extensions may be ignorable. +/// Single words care about consecutive matches but not separators or camel +/// case. +/// Score starts at 0 +/// Matched letter: +0 points +/// Unmatched letter: -1 point +/// Consecutive match bonus: +5 points +/// Separator bonus: +10 points +/// Camel case bonus: +10 points +/// Unmatched leading letter: -3 points (max: -9) +/// +/// There is some nuance to this. Scores don’t have an intrinsic meaning. The +/// score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a +/// lower minimum score due to unmatched letter penalty. Longer search patterns +/// have a higher maximum score due to match bonuses. +/// +/// Separator and camel case bonus is worth a LOT. Consecutive matches are worth +/// quite a bit. +/// +/// There is a penalty if you DON’T match the first three letters. Which +/// effectively rewards matching near the start. However there’s no difference +/// in matching between the middle and end. +/// +/// There is not an explicit bonus for an exact match. Unmatched letters receive +/// a penalty. So shorter strings and closer matches are worth more. +typedef struct { + const listitem_T *item; + int score; +} fuzzyItem_T; + +static bool fuzzy_match_recursive(const char_u *fuzpat, const char_u *str, int *const outScore, + const char_u *const strBegin, const char_u *const srcMatches, + char_u *const matches, const int maxMatches, int nextMatch, + int *const recursionCount, const int recursionLimit) + FUNC_ATTR_NONNULL_ARG(1, 2, 3, 4, 6, 9) FUNC_ATTR_WARN_UNUSED_RESULT +{ + // Recursion params + bool recursiveMatch = false; + char_u bestRecursiveMatches[256]; + int bestRecursiveScore = 0; + + // Count recursions + (*recursionCount)++; + if (*recursionCount >= recursionLimit) { + return false; + } + + // Detect end of strings + if (*fuzpat == '\0' || *str == '\0') { + return false; + } + + // Loop through fuzpat and str looking for a match + bool first_match = true; + while (*fuzpat != '\0' && *str != '\0') { + // Found match + if (mb_tolower(*fuzpat) == mb_tolower(*str)) { + // Supplied matches buffer was too short + if (nextMatch >= maxMatches) { + return false; + } + + // "Copy-on-Write" srcMatches into matches + if (first_match && srcMatches) { + memcpy(matches, srcMatches, nextMatch); + first_match = false; + } + + // Recursive call that "skips" this match + char_u recursiveMatches[256]; + int recursiveScore = 0; + if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, strBegin, matches, + recursiveMatches, sizeof(recursiveMatches), nextMatch, + recursionCount, recursionLimit)) { + // Pick best recursive score + if (!recursiveMatch || recursiveScore > bestRecursiveScore) { + memcpy(bestRecursiveMatches, recursiveMatches, 256); + bestRecursiveScore = recursiveScore; + } + recursiveMatch = true; + } + + // Advance + matches[nextMatch++] = (char_u)(str - strBegin); + fuzpat++; + } + str++; + } + + // Determine if full fuzpat was matched + const bool matched = *fuzpat == '\0'; + + // Calculate score + if (matched) { + // bonus for adjacent matches + const int sequential_bonus = 15; + // bonus if match occurs after a separator + const int separator_bonus = 30; + // bonus if match is uppercase and prev is lower + const int camel_bonus = 30; + // bonus if the first letter is matched + const int first_letter_bonus = 15; + // penalty applied for every letter in str before the first match + const int leading_letter_penalty = -5; + // maximum penalty for leading letters + const int max_leading_letter_penalty = -15; + // penalty for every letter that doesn't matter + const int unmatched_letter_penalty = -1; + + // Iterate str to end + while (*str != '\0') { + str++; + } + + // Initialize score + *outScore = 100; + + // Apply leading letter penalty + int penalty = leading_letter_penalty * matches[0]; + if (penalty < max_leading_letter_penalty) { + penalty = max_leading_letter_penalty; + } + *outScore += penalty; + + // Apply unmatched penalty + const int unmatched = (int)(str - strBegin) - nextMatch; + *outScore += unmatched_letter_penalty * unmatched; + + // Apply ordering bonuses + for (int i = 0; i < nextMatch; i++) { + const char_u currIdx = matches[i]; + + if (i > 0) { + const char_u prevIdx = matches[i - 1]; + + // Sequential + if (currIdx == (prevIdx + 1)) { + *outScore += sequential_bonus; + } + } + + // Check for bonuses based on neighbor character value + if (currIdx > 0) { + // Camel case + const char_u neighbor = strBegin[currIdx - 1]; + const char_u curr = strBegin[currIdx]; + + if (islower(neighbor) && isupper(curr)) { + *outScore += camel_bonus; + } + + // Separator + const bool neighborSeparator = neighbor == '_' || neighbor == ' '; + if (neighborSeparator) { + *outScore += separator_bonus; + } + } else { + // First letter + *outScore += first_letter_bonus; + } + } + } + + // Return best result + if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) { + // Recursive score is better than "this" + memcpy(matches, bestRecursiveMatches, maxMatches); + *outScore = bestRecursiveScore; + return true; + } else if (matched) { + return true; // "this" score is better than recursive + } + + return false; // no match +} + +/// fuzzy_match() +/// +/// Performs exhaustive search via recursion to find all possible matches and +/// match with highest score. +/// Scores values have no intrinsic meaning. Possible score range is not +/// normalized and varies with pattern. +/// Recursion is limited internally (default=10) to prevent degenerate cases +/// (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"). +/// Uses char_u for match indices. Therefore patterns are limited to 256 +/// characters. +/// +/// Returns true if fuzpat is found AND calculates a score. +static bool fuzzy_match(const char_u *const str, const char_u *const fuzpat, int *const outScore) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT +{ + char_u matches[256]; + int recursionCount = 0; + int recursionLimit = 10; + + *outScore = 0; + + return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, sizeof(matches), 0, + &recursionCount, recursionLimit); +} + +/// Sort the fuzzy matches in the descending order of the match score. +static int fuzzy_item_compare(const void *const s1, const void *const s2) + FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE +{ + const int v1 = ((const fuzzyItem_T *)s1)->score; + const int v2 = ((const fuzzyItem_T *)s2)->score; + + return v1 == v2 ? 0 : v1 > v2 ? -1 : 1; +} + +/// Fuzzy search the string 'str' in 'strlist' and return the matching strings +/// in 'fmatchlist'. +static void match_fuzzy(const list_T *const strlist, const char_u *const str, + list_T *const fmatchlist) + FUNC_ATTR_NONNULL_ARG(2, 3) +{ + const long len = tv_list_len(strlist); + if (len == 0) { + return; + } + + fuzzyItem_T *const ptrs = xmalloc(sizeof(fuzzyItem_T) * len); + long i = 0; + bool found_match = false; + + // For all the string items in strlist, get the fuzzy matching score + TV_LIST_ITER_CONST(strlist, li, { + ptrs[i].item = li; + ptrs[i].score = -9999; + // ignore non-string items in the list + const typval_T *const tv = TV_LIST_ITEM_TV(li); + if (tv->v_type == VAR_STRING && tv->vval.v_string != NULL) { + int score; + if (fuzzy_match(tv->vval.v_string, str, &score)) { + ptrs[i].score = score; + found_match = true; + } + } + i++; + }); + + if (found_match) { + // Sort the list by the descending order of the match score + qsort(ptrs, (size_t)len, sizeof(fuzzyItem_T), fuzzy_item_compare); + + // Copy the matching strings with 'score != -9999' to the return list + for (i = 0; i < len; i++) { + if (ptrs[i].score == -9999) { + break; + } + const typval_T *const tv = TV_LIST_ITEM_TV(ptrs[i].item); + tv_list_append_string(fmatchlist, (const char *)tv->vval.v_string, -1); + } + } + + xfree(ptrs); +} + +/// "matchfuzzy()" function +void f_matchfuzzy(typval_T *argvars, typval_T *rettv, FunPtr fptr) +{ + if (argvars[0].v_type != VAR_LIST) { + emsg(_(e_listreq)); + return; + } + if (argvars[0].vval.v_list == NULL) { + return; + } + if (argvars[1].v_type != VAR_STRING || argvars[1].vval.v_string == NULL) { + semsg(_(e_invarg2), tv_get_string(&argvars[1])); + return; + } + match_fuzzy(argvars[0].vval.v_list, (const char_u *)tv_get_string(&argvars[1]), + tv_list_alloc_ret(rettv, kListLenUnknown)); +} + /// Find identifiers or defines in included files. /// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase. /// diff --git a/src/nvim/testdir/test_functions.vim b/src/nvim/testdir/test_functions.vim index 438bed51c6..57ae0d3020 100644 --- a/src/nvim/testdir/test_functions.vim +++ b/src/nvim/testdir/test_functions.vim @@ -1731,6 +1731,32 @@ func Test_nr2char() call assert_equal("\x80\xfc\b\xfd\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX", eval('"\<M-' .. nr2char(0x40000000) .. '>"')) endfunc +" Test for matchfuzzy() +func Test_matchfuzzy() + call assert_fails('call matchfuzzy(10, "abc")', 'E714:') + " Needs v8.2.1183. + " call assert_fails('call matchfuzzy(["abc"], [])', 'E730:') + call assert_fails('call matchfuzzy(["abc"], [])', 'E475:') + call assert_equal([], matchfuzzy([], 'abc')) + call assert_equal([], matchfuzzy(['abc'], '')) + call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac')) + call assert_equal([], matchfuzzy([10, 20], 'ac')) + call assert_equal(['abc'], matchfuzzy(['abc'], 'abc')) + call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra')) + call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa')) + call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one')) + call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo')) + call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo')) + call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa')) + call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257))) + + %bw! + eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)}) + let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl') + call assert_equal(1, len(l)) + call assert_match('needle', l[0]) +endfunc + " Test for getcurpos() and setpos() func Test_getcurpos_setpos() new |