refactor!: use utf8proc full casefolding

According to `CaseFolding-15.1.0.txt`, full casefolding should be preferred over simple casefolding as it's considered to be more correct. Since utf8proc already provides full casefolding, it makes sense to switch to it. This will also remove a lot of unnecessary build code.

Temporary exceptions are made for two sets of characters:
- `ß` will still be considered `ß` (instead of `ss`), as using full casefolding requires interfering with upstream spell files in some form.
- `İ` will still be considered `İ` (instead of `i̇`), as using full casefolding requires making a value judgement on the "correct" behavior. There are two equally valid case-insensitive comparisons for this character according to Unicode. It is essentially up to the implementor to decide which conversion is correct. For this reason it might make sense to allow users to decide which conversion should be done, as an added option to `casemap` in a future PR.
author: dundargoc <gocdundar@gmail.com> 2024-06-18 14:01:20 +0200
committer: dundargoc <33953936+dundargoc@users.noreply.github.com> 2024-08-07 15:31:18 +0200
commit: 328ea02eb7dec32286ae6c691ecef71d988c905b (patch)
tree: 7ed364062b329648eff486da8d34cecc95a0f0b8 /src/nvim
parent: 11a6f3c9301b3deb71f7e5886fce3718420355be (diff)
download: rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.tar.gz
rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.tar.bz2
rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.zip
2 files changed, 25 insertions, 78 deletions
diff --git a/src/nvim/generators/gen_unicode_tables.lua b/src/nvim/generators/gen_unicode_tables.lua
index 305b64b7be..01eb34be88 100644
--- a/src/nvim/generators/gen_unicode_tables.lua
+++ b/src/nvim/generators/gen_unicode_tables.lua
@@ -28,7 +28,6 @@ local get_path = function(fname)
 end
 
 local unicodedata_fname = get_path('UnicodeData.txt')
-local casefolding_fname = get_path('CaseFolding.txt')
 local eastasianwidth_fname = get_path('EastAsianWidth.txt')
 local emoji_fname = get_path('emoji-data.txt')
 
@@ -77,10 +76,6 @@ local parse_data_to_props = function(ud_fp)
   return fp_lines_to_lists(ud_fp, 15, false)
 end
 
-local parse_fold_props = function(cf_fp)
-  return fp_lines_to_lists(cf_fp, 4, true)
-end
-
 local parse_width_props = function(eaw_fp)
   return fp_lines_to_lists(eaw_fp, 2, true)
 end
@@ -97,45 +92,6 @@ local make_range = function(start, end_, step, add)
   end
 end
 
-local build_convert_table = function(ut_fp, props, cond_func, nl_index, table_name)
-  ut_fp:write('static const convertStruct ' .. table_name .. '[] = {\n')
-  local start = -1
-  local end_ = -1
-  local step = 0
-  local add = -1
-  for _, p in ipairs(props) do
-    if cond_func(p) then
-      local n = tonumber(p[1], 16)
-      local nl = tonumber(p[nl_index], 16)
-      if start >= 0 and add == (nl - n) and (step == 0 or n - end_ == step) then
-        -- Continue with the same range.
-        step = n - end_
-        end_ = n
-      else
-        if start >= 0 then
-          -- Produce previous range.
-          ut_fp:write(make_range(start, end_, step, add))
-        end
-        start = n
-        end_ = n
-        step = 0
-        add = nl - n
-      end
-    end
-  end
-  if start >= 0 then
-    ut_fp:write(make_range(start, end_, step, add))
-  end
-  ut_fp:write('};\n')
-end
-
-local build_fold_table = function(ut_fp, foldprops)
-  local cond_func = function(p)
-    return (p[2] == 'C' or p[2] == 'S')
-  end
-  return build_convert_table(ut_fp, foldprops, cond_func, 3, 'foldCase')
-end
-
 local build_combining_table = function(ut_fp, dataprops)
   ut_fp:write('static const struct interval combining[] = {\n')
   local start = -1
@@ -291,12 +247,6 @@ local ut_fp = io.open(utf_tables_fname, 'w')
 
 build_combining_table(ut_fp, dataprops)
 
-local cf_fp = io.open(casefolding_fname, 'r')
-local foldprops = parse_fold_props(cf_fp)
-cf_fp:close()
-
-build_fold_table(ut_fp, foldprops)
-
 local eaw_fp = io.open(eastasianwidth_fname, 'r')
 local widthprops = parse_width_props(eaw_fp)
 eaw_fp:close()
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index c6cefb8a91..0c1b537f3a 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1284,41 +1284,38 @@ bool utf_ambiguous_width(int c)
                        || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
 }
 
-// Generic conversion function for case operations.
-// Return the converted equivalent of "a", which is a UCS-4 character.  Use
-// the given conversion "table".  Uses binary search on "table".
-static int utf_convert(int a, const convertStruct *const table, size_t n_items)
-{
-  // indices into table
-  size_t start = 0;
-  size_t end = n_items;
-  while (start < end) {
-    // need to search further
-    size_t mid = (end + start) / 2;
-    if (table[mid].rangeEnd < a) {
-      start = mid + 1;
-    } else {
-      end = mid;
-    }
-  }
-  if (start < n_items
-      && table[start].rangeStart <= a
-      && a <= table[start].rangeEnd
-      && (a - table[start].rangeStart) % table[start].step == 0) {
-    return a + table[start].offset;
-  }
-  return a;
-}
-
 // Return the folded-case equivalent of "a", which is a UCS-4 character.  Uses
-// simple case folding.
+// full case folding.
 int utf_fold(int a)
 {
   if (a < 0x80) {
     // be fast for ASCII
     return a >= 0x41 && a <= 0x5a ? a + 32 : a;
   }
-  return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
+
+  // TODO(dundargoc): utf8proc only does full case folding, which breaks some tests. This is a
+  // temporary workaround to circumvent failing tests.
+  //
+  // (0xdf) ß == ss in full casefolding. Using this however breaks the vim spell tests and the error
+  // E763 is thrown. This is due to the test spells relying on the vim spell files.
+  //
+  // (0x130) İ == i̇ in full casefolding.
+  if (a == 0xdf || a == 0x130) {
+    return a;
+  }
+
+  utf8proc_uint8_t input_str[16] = { 0 };
+  utf8proc_encode_char(a, input_str);
+
+  utf8proc_uint8_t *fold_str_utf;
+  utf8proc_map((utf8proc_uint8_t *)input_str, 0, &fold_str_utf,
+               UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);
+
+  int fold_codepoint_utf = utf_ptr2char((char *)fold_str_utf);
+
+  xfree(fold_str_utf);
+
+  return fold_codepoint_utf;
 }
 
 // Vim's own character class functions.  These exist because many library
author	dundargoc <gocdundar@gmail.com>	2024-06-18 14:01:20 +0200
committer	dundargoc <33953936+dundargoc@users.noreply.github.com>	2024-08-07 15:31:18 +0200
commit	328ea02eb7dec32286ae6c691ecef71d988c905b (patch)
tree	7ed364062b329648eff486da8d34cecc95a0f0b8 /src/nvim
parent	11a6f3c9301b3deb71f7e5886fce3718420355be (diff)
download	rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.tar.gz rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.tar.bz2 rneovim-328ea02eb7dec32286ae6c691ecef71d988c905b.zip