aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSean Dewar <seandewar@users.noreply.github.com>2022-01-01 14:58:32 +0000
committerSean Dewar <seandewar@users.noreply.github.com>2022-02-07 17:19:59 +0000
commitfba00b5e7ef2b6903a4588a2c080d8b33a8a2b68 (patch)
treebd246b1bf1f92178a9e088f390782cbc7853e4f5 /src
parentf02a5a7bdaafc1c5ff61aee133eb2b6ba5f57586 (diff)
downloadrneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.tar.gz
rneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.tar.bz2
rneovim-fba00b5e7ef2b6903a4588a2c080d8b33a8a2b68.zip
vim-patch:8.2.1665: cannot do fuzzy string matching
Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes vim/vim#6932) https://github.com/vim/vim/commit/635414dd2f3ae7d4d972d79b806348a6516cb91a Adjust Test_matchfuzzy's 2nd assert to expect the last error thrown, as v8.2.1183 hasn't been ported yet (to be honest, the error message is kinda weird if the 2nd argument is not convertible to string). We can still port this fully as porting v8.2.1183 would require removing this change to pass CI.
Diffstat (limited to 'src')
-rw-r--r--src/nvim/eval.lua1
-rw-r--r--src/nvim/search.c298
-rw-r--r--src/nvim/testdir/test_functions.vim26
3 files changed, 325 insertions, 0 deletions
diff --git a/src/nvim/eval.lua b/src/nvim/eval.lua
index eedc8ac45d..37b051222e 100644
--- a/src/nvim/eval.lua
+++ b/src/nvim/eval.lua
@@ -249,6 +249,7 @@ return {
matcharg={args=1, base=1},
matchdelete={args={1, 2}, base=1},
matchend={args={2, 4}, base=1},
+ matchfuzzy={args=2, base=1},
matchlist={args={2, 4}, base=1},
matchstr={args={2, 4}, base=1},
matchstrpos={args={2,4}, base=1},
diff --git a/src/nvim/search.c b/src/nvim/search.c
index 93180f97fe..7bc994fad3 100644
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@@ -4764,6 +4764,304 @@ the_end:
restore_last_search_pattern();
}
+/// Fuzzy string matching
+///
+/// Ported from the lib_fts library authored by Forrest Smith.
+/// https://github.com/forrestthewoods/lib_fts/tree/master/code
+///
+/// Blog describing the algorithm:
+/// https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
+///
+/// Each matching string is assigned a score. The following factors are checked:
+/// Matched letter
+/// Unmatched letter
+/// Consecutively matched letters
+/// Proximity to start
+/// Letter following a separator (space, underscore)
+/// Uppercase letter following lowercase (aka CamelCase)
+///
+/// Matched letters are good. Unmatched letters are bad. Matching near the start
+/// is good. Matching the first letter in the middle of a phrase is good.
+/// Matching the uppercase letters in camel case entries is good.
+///
+/// The score assigned for each factor is explained below.
+/// File paths are different from file names. File extensions may be ignorable.
+/// Single words care about consecutive matches but not separators or camel
+/// case.
+/// Score starts at 0
+/// Matched letter: +0 points
+/// Unmatched letter: -1 point
+/// Consecutive match bonus: +5 points
+/// Separator bonus: +10 points
+/// Camel case bonus: +10 points
+/// Unmatched leading letter: -3 points (max: -9)
+///
+/// There is some nuance to this. Scores don’t have an intrinsic meaning. The
+/// score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
+/// lower minimum score due to unmatched letter penalty. Longer search patterns
+/// have a higher maximum score due to match bonuses.
+///
+/// Separator and camel case bonus is worth a LOT. Consecutive matches are worth
+/// quite a bit.
+///
+/// There is a penalty if you DON’T match the first three letters. Which
+/// effectively rewards matching near the start. However there’s no difference
+/// in matching between the middle and end.
+///
+/// There is not an explicit bonus for an exact match. Unmatched letters receive
+/// a penalty. So shorter strings and closer matches are worth more.
+typedef struct {
+ const listitem_T *item;
+ int score;
+} fuzzyItem_T;
+
+static bool fuzzy_match_recursive(const char_u *fuzpat, const char_u *str, int *const outScore,
+ const char_u *const strBegin, const char_u *const srcMatches,
+ char_u *const matches, const int maxMatches, int nextMatch,
+ int *const recursionCount, const int recursionLimit)
+ FUNC_ATTR_NONNULL_ARG(1, 2, 3, 4, 6, 9) FUNC_ATTR_WARN_UNUSED_RESULT
+{
+ // Recursion params
+ bool recursiveMatch = false;
+ char_u bestRecursiveMatches[256];
+ int bestRecursiveScore = 0;
+
+ // Count recursions
+ (*recursionCount)++;
+ if (*recursionCount >= recursionLimit) {
+ return false;
+ }
+
+ // Detect end of strings
+ if (*fuzpat == '\0' || *str == '\0') {
+ return false;
+ }
+
+ // Loop through fuzpat and str looking for a match
+ bool first_match = true;
+ while (*fuzpat != '\0' && *str != '\0') {
+ // Found match
+ if (mb_tolower(*fuzpat) == mb_tolower(*str)) {
+ // Supplied matches buffer was too short
+ if (nextMatch >= maxMatches) {
+ return false;
+ }
+
+ // "Copy-on-Write" srcMatches into matches
+ if (first_match && srcMatches) {
+ memcpy(matches, srcMatches, nextMatch);
+ first_match = false;
+ }
+
+ // Recursive call that "skips" this match
+ char_u recursiveMatches[256];
+ int recursiveScore = 0;
+ if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore, strBegin, matches,
+ recursiveMatches, sizeof(recursiveMatches), nextMatch,
+ recursionCount, recursionLimit)) {
+ // Pick best recursive score
+ if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
+ memcpy(bestRecursiveMatches, recursiveMatches, 256);
+ bestRecursiveScore = recursiveScore;
+ }
+ recursiveMatch = true;
+ }
+
+ // Advance
+ matches[nextMatch++] = (char_u)(str - strBegin);
+ fuzpat++;
+ }
+ str++;
+ }
+
+ // Determine if full fuzpat was matched
+ const bool matched = *fuzpat == '\0';
+
+ // Calculate score
+ if (matched) {
+ // bonus for adjacent matches
+ const int sequential_bonus = 15;
+ // bonus if match occurs after a separator
+ const int separator_bonus = 30;
+ // bonus if match is uppercase and prev is lower
+ const int camel_bonus = 30;
+ // bonus if the first letter is matched
+ const int first_letter_bonus = 15;
+ // penalty applied for every letter in str before the first match
+ const int leading_letter_penalty = -5;
+ // maximum penalty for leading letters
+ const int max_leading_letter_penalty = -15;
+ // penalty for every letter that doesn't matter
+ const int unmatched_letter_penalty = -1;
+
+ // Iterate str to end
+ while (*str != '\0') {
+ str++;
+ }
+
+ // Initialize score
+ *outScore = 100;
+
+ // Apply leading letter penalty
+ int penalty = leading_letter_penalty * matches[0];
+ if (penalty < max_leading_letter_penalty) {
+ penalty = max_leading_letter_penalty;
+ }
+ *outScore += penalty;
+
+ // Apply unmatched penalty
+ const int unmatched = (int)(str - strBegin) - nextMatch;
+ *outScore += unmatched_letter_penalty * unmatched;
+
+ // Apply ordering bonuses
+ for (int i = 0; i < nextMatch; i++) {
+ const char_u currIdx = matches[i];
+
+ if (i > 0) {
+ const char_u prevIdx = matches[i - 1];
+
+ // Sequential
+ if (currIdx == (prevIdx + 1)) {
+ *outScore += sequential_bonus;
+ }
+ }
+
+ // Check for bonuses based on neighbor character value
+ if (currIdx > 0) {
+ // Camel case
+ const char_u neighbor = strBegin[currIdx - 1];
+ const char_u curr = strBegin[currIdx];
+
+ if (islower(neighbor) && isupper(curr)) {
+ *outScore += camel_bonus;
+ }
+
+ // Separator
+ const bool neighborSeparator = neighbor == '_' || neighbor == ' ';
+ if (neighborSeparator) {
+ *outScore += separator_bonus;
+ }
+ } else {
+ // First letter
+ *outScore += first_letter_bonus;
+ }
+ }
+ }
+
+ // Return best result
+ if (recursiveMatch && (!matched || bestRecursiveScore > *outScore)) {
+ // Recursive score is better than "this"
+ memcpy(matches, bestRecursiveMatches, maxMatches);
+ *outScore = bestRecursiveScore;
+ return true;
+ } else if (matched) {
+ return true; // "this" score is better than recursive
+ }
+
+ return false; // no match
+}
+
+/// fuzzy_match()
+///
+/// Performs exhaustive search via recursion to find all possible matches and
+/// match with highest score.
+/// Scores values have no intrinsic meaning. Possible score range is not
+/// normalized and varies with pattern.
+/// Recursion is limited internally (default=10) to prevent degenerate cases
+/// (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
+/// Uses char_u for match indices. Therefore patterns are limited to 256
+/// characters.
+///
+/// Returns true if fuzpat is found AND calculates a score.
+static bool fuzzy_match(const char_u *const str, const char_u *const fuzpat, int *const outScore)
+ FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
+{
+ char_u matches[256];
+ int recursionCount = 0;
+ int recursionLimit = 10;
+
+ *outScore = 0;
+
+ return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches, sizeof(matches), 0,
+ &recursionCount, recursionLimit);
+}
+
+/// Sort the fuzzy matches in the descending order of the match score.
+static int fuzzy_item_compare(const void *const s1, const void *const s2)
+ FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_PURE
+{
+ const int v1 = ((const fuzzyItem_T *)s1)->score;
+ const int v2 = ((const fuzzyItem_T *)s2)->score;
+
+ return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
+}
+
+/// Fuzzy search the string 'str' in 'strlist' and return the matching strings
+/// in 'fmatchlist'.
+static void match_fuzzy(const list_T *const strlist, const char_u *const str,
+ list_T *const fmatchlist)
+ FUNC_ATTR_NONNULL_ARG(2, 3)
+{
+ const long len = tv_list_len(strlist);
+ if (len == 0) {
+ return;
+ }
+
+ fuzzyItem_T *const ptrs = xmalloc(sizeof(fuzzyItem_T) * len);
+ long i = 0;
+ bool found_match = false;
+
+ // For all the string items in strlist, get the fuzzy matching score
+ TV_LIST_ITER_CONST(strlist, li, {
+ ptrs[i].item = li;
+ ptrs[i].score = -9999;
+ // ignore non-string items in the list
+ const typval_T *const tv = TV_LIST_ITEM_TV(li);
+ if (tv->v_type == VAR_STRING && tv->vval.v_string != NULL) {
+ int score;
+ if (fuzzy_match(tv->vval.v_string, str, &score)) {
+ ptrs[i].score = score;
+ found_match = true;
+ }
+ }
+ i++;
+ });
+
+ if (found_match) {
+ // Sort the list by the descending order of the match score
+ qsort(ptrs, (size_t)len, sizeof(fuzzyItem_T), fuzzy_item_compare);
+
+ // Copy the matching strings with 'score != -9999' to the return list
+ for (i = 0; i < len; i++) {
+ if (ptrs[i].score == -9999) {
+ break;
+ }
+ const typval_T *const tv = TV_LIST_ITEM_TV(ptrs[i].item);
+ tv_list_append_string(fmatchlist, (const char *)tv->vval.v_string, -1);
+ }
+ }
+
+ xfree(ptrs);
+}
+
+/// "matchfuzzy()" function
+void f_matchfuzzy(typval_T *argvars, typval_T *rettv, FunPtr fptr)
+{
+ if (argvars[0].v_type != VAR_LIST) {
+ emsg(_(e_listreq));
+ return;
+ }
+ if (argvars[0].vval.v_list == NULL) {
+ return;
+ }
+ if (argvars[1].v_type != VAR_STRING || argvars[1].vval.v_string == NULL) {
+ semsg(_(e_invarg2), tv_get_string(&argvars[1]));
+ return;
+ }
+ match_fuzzy(argvars[0].vval.v_list, (const char_u *)tv_get_string(&argvars[1]),
+ tv_list_alloc_ret(rettv, kListLenUnknown));
+}
+
/// Find identifiers or defines in included files.
/// If p_ic && (compl_cont_status & CONT_SOL) then ptr must be in lowercase.
///
diff --git a/src/nvim/testdir/test_functions.vim b/src/nvim/testdir/test_functions.vim
index 438bed51c6..57ae0d3020 100644
--- a/src/nvim/testdir/test_functions.vim
+++ b/src/nvim/testdir/test_functions.vim
@@ -1731,6 +1731,32 @@ func Test_nr2char()
call assert_equal("\x80\xfc\b\xfd\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX\x80\xfeX", eval('"\<M-' .. nr2char(0x40000000) .. '>"'))
endfunc
+" Test for matchfuzzy()
+func Test_matchfuzzy()
+ call assert_fails('call matchfuzzy(10, "abc")', 'E714:')
+ " Needs v8.2.1183.
+ " call assert_fails('call matchfuzzy(["abc"], [])', 'E730:')
+ call assert_fails('call matchfuzzy(["abc"], [])', 'E475:')
+ call assert_equal([], matchfuzzy([], 'abc'))
+ call assert_equal([], matchfuzzy(['abc'], ''))
+ call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac'))
+ call assert_equal([], matchfuzzy([10, 20], 'ac'))
+ call assert_equal(['abc'], matchfuzzy(['abc'], 'abc'))
+ call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra'))
+ call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa'))
+ call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one'))
+ call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo'))
+ call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo'))
+ call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa'))
+ call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257)))
+
+ %bw!
+ eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)})
+ let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl')
+ call assert_equal(1, len(l))
+ call assert_match('needle', l[0])
+endfunc
+
" Test for getcurpos() and setpos()
func Test_getcurpos_setpos()
new