aboutsummaryrefslogtreecommitdiff
path: root/scripts/genunicodetables.lua
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/genunicodetables.lua')
-rw-r--r--scripts/genunicodetables.lua225
1 files changed, 225 insertions, 0 deletions
diff --git a/scripts/genunicodetables.lua b/scripts/genunicodetables.lua
new file mode 100644
index 0000000000..e8de5026de
--- /dev/null
+++ b/scripts/genunicodetables.lua
@@ -0,0 +1,225 @@
+if arg[1] == '--help' then
+ print('Usage:')
+ print(' genunicodetables.lua UnicodeData.txt CaseFolding.txt ' ..
+ 'EastAsianWidth.txt')
+ print(' unicode_tables.generated.h')
+ os.exit(0)
+end
+
+local unicodedata_fname = arg[1]
+local casefolding_fname = arg[2]
+local eastasianwidth_fname = arg[3]
+
+local utf_tables_fname = arg[4]
+
+local split_on_semicolons = function(s)
+ local ret = {}
+ local idx = 1
+ while idx <= #s + 1 do
+ item = s:match('^[^;]*', idx)
+ idx = idx + #item + 1
+ if idx <= #s + 1 then
+ assert(s:sub(idx - 1, idx - 1) == ';')
+ end
+ item = item:gsub('^%s*', '')
+ item = item:gsub('%s*$', '')
+ table.insert(ret, item)
+ end
+ return ret
+end
+
+local fp_lines_to_lists = function(fp, n, has_comments)
+ local ret = {}
+ local line
+ local i = 0
+ while true do
+ i = i + 1
+ line = fp:read('*l')
+ if not line then
+ break
+ end
+ if (not has_comments
+ or (line:sub(1, 1) ~= '#' and not line:match('^%s*$'))) then
+ local l = split_on_semicolons(line)
+ if #l ~= n then
+ io.stderr:write(('Found %s items in line %u, expected %u\n'):format(
+ #l, i, n))
+ io.stderr:write('Line: ' .. line .. '\n')
+ return nil
+ end
+ table.insert(ret, l)
+ end
+ end
+ return ret
+end
+
+local parse_data_to_props = function(ud_fp)
+ return fp_lines_to_lists(ud_fp, 15, false)
+end
+
+local parse_fold_props = function(cf_fp)
+ return fp_lines_to_lists(cf_fp, 4, true)
+end
+
+local parse_width_props = function(eaw_fp)
+ return fp_lines_to_lists(eaw_fp, 2, true)
+end
+
+local make_range = function(start, end_, step, add)
+ if step and add then
+ return (' {0x%x, 0x%x, %d, %d},\n'):format(
+ start, end_, step == 0 and -1 or step, add)
+ else
+ return (' {0x%04x, 0x%04x},\n'):format(start, end_)
+ end
+end
+
+local build_convert_table = function(ut_fp, props, cond_func, nl_index,
+ table_name)
+ ut_fp:write('static const convertStruct ' .. table_name .. '[] = {\n')
+ local start = -1
+ local end_ = -1
+ local step = 0
+ local add = -1
+ for _, p in ipairs(props) do
+ if cond_func(p) then
+ local n = tonumber(p[1], 16)
+ local nl = tonumber(p[nl_index], 16)
+ if start >= 0 and add == (nl - n) and (step == 0 or n - end_ == step) then
+ -- Continue with the same range.
+ step = n - end_
+ end_ = n
+ else
+ if start >= 0 then
+ -- Produce previous range.
+ ut_fp:write(make_range(start, end_, step, add))
+ end
+ start = n
+ end_ = n
+ step = 0
+ add = nl - n
+ end
+ end
+ end
+ if start >= 0 then
+ ut_fp:write(make_range(start, end_, step, add))
+ end
+ ut_fp:write('};\n')
+end
+
+local build_case_table = function(ut_fp, dataprops, table_name, index)
+ local cond_func = function(p)
+ return p[index] ~= ''
+ end
+ return build_convert_table(ut_fp, dataprops, cond_func, index,
+ 'to' .. table_name)
+end
+
+local build_fold_table = function(ut_fp, foldprops)
+ local cond_func = function(p)
+ return (p[2] == 'C' or p[2] == 'S')
+ end
+ return build_convert_table(ut_fp, foldprops, cond_func, 3, 'foldCase')
+end
+
+local build_combining_table = function(ut_fp, dataprops)
+ ut_fp:write('static const struct interval combining[] = {\n')
+ local start = -1
+ local end_ = -1
+ for _, p in ipairs(dataprops) do
+ if (({Mn=true, Mc=true, Me=true})[p[3]]) then
+ local n = tonumber(p[1], 16)
+ if start >= 0 and end_ + 1 == n then
+ -- Continue with the same range.
+ end_ = n
+ else
+ if start >= 0 then
+ -- Produce previous range.
+ ut_fp:write(make_range(start, end_))
+ end
+ start = n
+ end_ = n
+ end
+ end
+ end
+ if start >= 0 then
+ ut_fp:write(make_range(start, end_))
+ end
+ ut_fp:write('};\n')
+end
+
+local build_width_table = function(ut_fp, dataprops, widthprops, widths,
+ table_name)
+ ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
+ local start = -1
+ local end_ = -1
+ local dataidx = 1
+ for _, p in ipairs(widthprops) do
+ if widths[p[2]:sub(1, 1)] then
+ local rng_start, rng_end = p[1]:find('%.%.')
+ local n, n_last
+ if rng_start then
+ -- It is a range. We don’t check for composing char then.
+ n = tonumber(p[1]:sub(1, rng_start - 1), 16)
+ n_last = tonumber(p[1]:sub(rng_end + 1), 16)
+ else
+ n = tonumber(p[1], 16)
+ n_last = n
+ end
+ local dn
+ while true do
+ dn = tonumber(dataprops[dataidx][1], 16)
+ if dn >= n then
+ break
+ end
+ dataidx = dataidx + 1
+ end
+ if dn ~= n and n_last == n then
+ io.stderr:write('Cannot find character ' .. n .. ' in data table.\n')
+ end
+ -- Only use the char when it’s not a composing char.
+ -- But use all chars from a range.
+ local dp = dataprops[dataidx]
+ if (n_last > n) or (not (({Mn=true, Mc=true, Me=true})[dp[3]])) then
+ if start >= 0 and end_ + 1 == n then
+ -- Continue with the same range.
+ else
+ if start >= 0 then
+ ut_fp:write(make_range(start, end_))
+ end
+ start = n
+ end
+ end_ = n_last
+ end
+ end
+ end
+ if start >= 0 then
+ ut_fp:write(make_range(start, end_))
+ end
+ ut_fp:write('};\n')
+end
+
+local ud_fp = io.open(unicodedata_fname, 'r')
+local dataprops = parse_data_to_props(ud_fp)
+ud_fp:close()
+
+local ut_fp = io.open(utf_tables_fname, 'w')
+
+build_case_table(ut_fp, dataprops, 'Lower', 14)
+build_case_table(ut_fp, dataprops, 'Upper', 13)
+build_combining_table(ut_fp, dataprops)
+
+local cf_fp = io.open(casefolding_fname, 'r')
+local foldprops = parse_fold_props(cf_fp)
+cf_fp:close()
+
+build_fold_table(ut_fp, foldprops)
+
+local eaw_fp = io.open(eastasianwidth_fname, 'r')
+local widthprops = parse_width_props(eaw_fp)
+eaw_fp:close()
+
+build_width_table(ut_fp, dataprops, widthprops, {W=true, F=true}, 'doublewidth')
+build_width_table(ut_fp, dataprops, widthprops, {A=true}, 'ambiguous')
+
+ut_fp:close()