From d533edf61eef15456efdf16bf45e68c824ee5870 Mon Sep 17 00:00:00 2001 From: James McCoy Date: Wed, 21 Sep 2016 10:15:19 -0400 Subject: vim-patch:7.4.1604 Problem: Although emoji characters are ambiguous width, best is to treat them as full width. Solution: Update the Unicode character tables. Add the 'emoji' options. (Yasuhiro Matsumoto) https://github.com/vim/vim/commit/3848e00e0177abdb31bc600234967863ec487233 --- scripts/genunicodetables.lua | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'scripts/genunicodetables.lua') diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua index 36339e2fc6..08b8b0f90e 100644 --- a/scripts/genunicodetables.lua +++ b/scripts/genunicodetables.lua @@ -12,10 +12,12 @@ -- 2 then interval applies only to first, third, fifth, … character in range. -- Fourth value is number that should be added to the codepoint to yield -- folded/lower/upper codepoint. +-- 4. emoji table: sorted list of non-overlapping closed intervals of Emoji +-- characters if arg[1] == '--help' then print('Usage:') print(' genunicodetables.lua UnicodeData.txt CaseFolding.txt ' .. 
- 'EastAsianWidth.txt') + 'EastAsianWidth.txt emoji-data.txt') print(' unicode_tables.generated.h') os.exit(0) end @@ -23,8 +25,9 @@ end local unicodedata_fname = arg[1] local casefolding_fname = arg[2] local eastasianwidth_fname = arg[3] +local emoji_fname = arg[4] -local utf_tables_fname = arg[4] +local utf_tables_fname = arg[5] local split_on_semicolons = function(s) local ret = {} @@ -79,6 +82,10 @@ local parse_width_props = function(eaw_fp) return fp_lines_to_lists(eaw_fp, 2, true) end +local parse_emoji_props = function(emoji_fp) + return fp_lines_to_lists(emoji_fp, 2, true) +end + local make_range = function(start, end_, step, add) if step and add then return (' {0x%x, 0x%x, %d, %d},\n'):format( @@ -213,6 +220,24 @@ local build_width_table = function(ut_fp, dataprops, widthprops, widths, ut_fp:write('};\n') end +local build_emoji_table = function(ut_fp, emojiprops) + ut_fp:write('static const struct interval emoji[] = {\n') + for _, p in ipairs(emojiprops) do + if p[2]:match('Emoji%s+#') then + local start, end_ = p[1]:find('%.%.') + if start then + local n = tonumber(p[1]:sub(1, start - 1), 16) + local nl = tonumber(p[1]:sub(end_ + 1), 16) + ut_fp:write(make_range(n, nl)) + else + local n = tonumber(p[1], 16) + ut_fp:write(make_range(n, n)) + end + end + end + ut_fp:write('};\n') +end + local ud_fp = io.open(unicodedata_fname, 'r') local dataprops = parse_data_to_props(ud_fp) ud_fp:close() @@ -236,4 +261,10 @@ eaw_fp:close() build_width_table(ut_fp, dataprops, widthprops, {W=true, F=true}, 'doublewidth') build_width_table(ut_fp, dataprops, widthprops, {A=true}, 'ambiguous') +local emoji_fp = io.open(emoji_fname, 'r') +local emojiprops = parse_emoji_props(emoji_fp) +emoji_fp:close() + +build_emoji_table(ut_fp, emojiprops) + ut_fp:close() -- cgit From 45598d2e5e2b56e24e4d5abe4f28f259e3def572 Mon Sep 17 00:00:00 2001 From: James McCoy Date: Thu, 22 Sep 2016 00:40:45 -0400 Subject: vim-patch:7.4.1620 Problem: Emoji characters are not considered as a kind of 
word character. Solution: Give emoji characters a word class number. (Yasuhiro Matsumoto) https://github.com/vim/vim/commit/4077b33a8370afb3d5ae74e556a0119cf51fe294 --- scripts/genunicodetables.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'scripts/genunicodetables.lua') diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua index 08b8b0f90e..75adb36a8f 100644 --- a/scripts/genunicodetables.lua +++ b/scripts/genunicodetables.lua @@ -12,7 +12,7 @@ -- 2 then interval applies only to first, third, fifth, … character in range. -- Fourth value is number that should be added to the codepoint to yield -- folded/lower/upper codepoint. --- 4. emoji table: sorted list of non-overlapping closed intervals of Emoji +-- 4. emoji_tab table: sorted list of non-overlapping closed intervals of Emoji -- characters if arg[1] == '--help' then print('Usage:') @@ -221,7 +221,7 @@ local build_width_table = function(ut_fp, dataprops, widthprops, widths, end local build_emoji_table = function(ut_fp, emojiprops) - ut_fp:write('static const struct interval emoji[] = {\n') + ut_fp:write('static const struct interval emoji_tab[] = {\n') for _, p in ipairs(emojiprops) do if p[2]:match('Emoji%s+#') then local start, end_ = p[1]:find('%.%.') -- cgit From 1144cc6d9edd8d59b6f24e4d8f1df395342c2619 Mon Sep 17 00:00:00 2001 From: James McCoy Date: Thu, 22 Sep 2016 00:43:19 -0400 Subject: vim-patch:7.4.1629 Problem: Handling emoji characters as full width has problems with backwards compatibility. Solution: Remove ambiguous and double width characters from the emoji table. Use a separate table for the character class. 
(partly by Yasuhiro Matsumoto) https://github.com/vim/vim/commit/b86f10ee10bdf932df02bdaf601dffa671518a47 --- scripts/genunicodetables.lua | 71 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 12 deletions(-) (limited to 'scripts/genunicodetables.lua') diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua index 75adb36a8f..d5fcb56566 100644 --- a/scripts/genunicodetables.lua +++ b/scripts/genunicodetables.lua @@ -175,6 +175,7 @@ local build_width_table = function(ut_fp, dataprops, widthprops, widths, local start = -1 local end_ = -1 local dataidx = 1 + local ret = {} for _, p in ipairs(widthprops) do if widths[p[2]:sub(1, 1)] then local rng_start, rng_end = p[1]:find('%.%.') @@ -207,6 +208,7 @@ local build_width_table = function(ut_fp, dataprops, widthprops, widths, else if start >= 0 then ut_fp:write(make_range(start, end_)) + table.insert(ret, {start, end_}) end start = n end @@ -216,25 +218,68 @@ local build_width_table = function(ut_fp, dataprops, widthprops, widths, end if start >= 0 then ut_fp:write(make_range(start, end_)) + table.insert(ret, {start, end_}) end ut_fp:write('};\n') + return ret end -local build_emoji_table = function(ut_fp, emojiprops) - ut_fp:write('static const struct interval emoji_tab[] = {\n') +local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth) + local emojiwidth = {} + local emoji = {} for _, p in ipairs(emojiprops) do if p[2]:match('Emoji%s+#') then - local start, end_ = p[1]:find('%.%.') - if start then - local n = tonumber(p[1]:sub(1, start - 1), 16) - local nl = tonumber(p[1]:sub(end_ + 1), 16) - ut_fp:write(make_range(n, nl)) + local rng_start, rng_end = p[1]:find('%.%.') + if rng_start then + n = tonumber(p[1]:sub(1, rng_start - 1), 16) + n_last = tonumber(p[1]:sub(rng_end + 1), 16) else - local n = tonumber(p[1], 16) - ut_fp:write(make_range(n, n)) + n = tonumber(p[1], 16) + n_last = n + end + if #emoji > 0 and n - 1 == emoji[#emoji][2] then + 
emoji[#emoji][2] = n_last + else + table.insert(emoji, { n, n_last }) + end + -- exclude characters that are in the ambiguous/doublewidth table + for _, ambi in ipairs(ambiwidth) do + if n >= ambi[1] and n <= ambi[2] then + n = ambi[2] + 1 + end + if n_last >= ambi[1] and n_last <= ambi[2] then + n_last = ambi[1] - 1 + end + end + for _, double in ipairs(doublewidth) do + if n >= double[1] and n <= double[2] then + n = double[2] + 1 + end + if n_last >= double[1] and n_last <= double[2] then + n_last = double[1] - 1 + end + end + + if n <= n_last then + if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then + emojiwidth[#emojiwidth][2] = n_last + else + table.insert(emojiwidth, { n, n_last }) + end end end end + + ut_fp:write('static const struct interval emoji_all[] = {\n') + for _, p in ipairs(emoji) do + ut_fp:write(make_range(p[1], p[2])) + end + ut_fp:write('};\n') + + ut_fp:write('static const struct interval emoji_width[] = {\n') + for _, p in ipairs(emojiwidth) do + ut_fp:write(make_range(p[1], p[2])) + end ut_fp:write('};\n') end @@ -258,13 +303,15 @@ local eaw_fp = io.open(eastasianwidth_fname, 'r') local widthprops = parse_width_props(eaw_fp) eaw_fp:close() -build_width_table(ut_fp, dataprops, widthprops, {W=true, F=true}, 'doublewidth') -build_width_table(ut_fp, dataprops, widthprops, {A=true}, 'ambiguous') +local doublewidth = build_width_table(ut_fp, dataprops, widthprops, + {W=true, F=true}, 'doublewidth') +local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, + {A=true}, 'ambiguous') local emoji_fp = io.open(emoji_fname, 'r') local emojiprops = parse_emoji_props(emoji_fp) emoji_fp:close() -build_emoji_table(ut_fp, emojiprops) +build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth) ut_fp:close() -- cgit From dafca1ad68676d83845086963d3ef09d2e91679f Mon Sep 17 00:00:00 2001 From: James McCoy Date: Thu, 22 Sep 2016 00:12:44 -0400 Subject: vim-patch:7.4.1642 Problem: Handling emoji characters as full width has problems with 
backwards compatibility. Solution: Only put characters in the 1f000 range in the emoji table. https://github.com/vim/vim/commit/6a08454b93784c92296d4c08456401cbaa74c9d5 --- scripts/genunicodetables.lua | 50 +++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'scripts/genunicodetables.lua') diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua index d5fcb56566..f66f738659 100644 --- a/scripts/genunicodetables.lua +++ b/scripts/genunicodetables.lua @@ -12,8 +12,9 @@ -- 2 then interval applies only to first, third, fifth, … character in range. -- Fourth value is number that should be added to the codepoint to yield -- folded/lower/upper codepoint. --- 4. emoji_tab table: sorted list of non-overlapping closed intervals of Emoji --- characters +-- 4. emoji_width and emoji_all tables: sorted lists of non-overlapping closed +-- intervals of Emoji characters. emoji_width contains all the characters +-- which don't have ambiguous or double width, and emoji_all has all Emojis. if arg[1] == '--help' then print('Usage:') print(' genunicodetables.lua UnicodeData.txt CaseFolding.txt ' .. @@ -242,29 +243,34 @@ local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth) else table.insert(emoji, { n, n_last }) end - -- exclude characters that are in the ambiguous/doublewidth table - for _, ambi in ipairs(ambiwidth) do - if n >= ambi[1] and n <= ambi[2] then - n = ambi[2] + 1 - end - if n_last >= ambi[1] and n_last <= ambi[2] then - n_last = ambi[1] - 1 - end - end - for _, double in ipairs(doublewidth) do - if n >= double[1] and n <= double[2] then - n = double[2] + 1 + + -- Characters below 1F000 may be considered single width traditionally, + -- making them double width causes problems. 
+ if n >= 0x1f000 then + -- exclude characters that are in the ambiguous/doublewidth table + for _, ambi in ipairs(ambiwidth) do + if n >= ambi[1] and n <= ambi[2] then + n = ambi[2] + 1 + end + if n_last >= ambi[1] and n_last <= ambi[2] then + n_last = ambi[1] - 1 + end end - if n_last >= double[1] and n_last <= double[2] then - n_last = double[1] - 1 + for _, double in ipairs(doublewidth) do + if n >= double[1] and n <= double[2] then + n = double[2] + 1 + end + if n_last >= double[1] and n_last <= double[2] then + n_last = double[1] - 1 + end end - end - if n <= n_last then - if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then - emojiwidth[#emojiwidth][2] = n_last - else - table.insert(emojiwidth, { n, n_last }) + if n <= n_last then + if #emojiwidth > 0 and n - 1 == emojiwidth[#emojiwidth][2] then + emojiwidth[#emojiwidth][2] = n_last + else + table.insert(emojiwidth, { n, n_last }) + end end end end -- cgit From 4ce24ff9da0f6551eeca2011dc9d05194bf02e12 Mon Sep 17 00:00:00 2001 From: James McCoy Date: Sat, 24 Sep 2016 14:27:04 -0400 Subject: genunicodetables: Give the unicode directory as the first argument Let genunicodetables determine which files it needs from the unicode directory. cmake just needs to pass the directory and destination file to the script. --- scripts/genunicodetables.lua | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'scripts/genunicodetables.lua') diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua index f66f738659..66430ba26e 100644 --- a/scripts/genunicodetables.lua +++ b/scripts/genunicodetables.lua @@ -17,18 +17,22 @@ -- which don't have ambiguous or double width, and emoji_all has all Emojis. if arg[1] == '--help' then print('Usage:') - print(' genunicodetables.lua UnicodeData.txt CaseFolding.txt ' .. 
- 'EastAsianWidth.txt emoji-data.txt') - print(' unicode_tables.generated.h') + print(' genunicodetables.lua unicode/ unicode_tables.generated.h') os.exit(0) end -local unicodedata_fname = arg[1] -local casefolding_fname = arg[2] -local eastasianwidth_fname = arg[3] -local emoji_fname = arg[4] +local basedir = arg[1] +local pathsep = package.config:sub(1, 1) +local get_path = function(fname) + return basedir .. pathsep .. fname +end + +local unicodedata_fname = get_path('UnicodeData.txt') +local casefolding_fname = get_path('CaseFolding.txt') +local eastasianwidth_fname = get_path('EastAsianWidth.txt') +local emoji_fname = get_path('emoji-data.txt') -local utf_tables_fname = arg[5] +local utf_tables_fname = arg[2] local split_on_semicolons = function(s) local ret = {} -- cgit