From 09b64d75bd92a95d89c4f39f9df7918760abe98d Mon Sep 17 00:00:00 2001
From: "Justin M. Keyes" <justinkz@gmail.com>
Date: Sun, 27 Mar 2022 19:47:34 -0700
Subject: feat(docs): gen_help_html.lua
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:
The :help docs HTML is generated by an old awk script,
`runtime/doc/makehtml.awk`, which is hard to maintain (it uses an ad hoc
parser and no one has touched it in decades) and has bugs such as:
- https://github.com/neovim/neovim.github.io/issues/96
- https://github.com/neovim/neovim.github.io/issues/97

Solution:
Use Lua + treesitter (https://github.com/vigoux/tree-sitter-vimdoc) to
generate the :help docs HTML.  The new generator also validates tag links.

fix https://github.com/neovim/neovim.github.io/issues/96
fix https://github.com/neovim/neovim.github.io/issues/97

TODO:
- delete doc_html build task
- delete runtime/doc/Makefile
- delete makehtml.awk
- delete maketags.awk

OUTPUT:

    $ nvim -V1 -es --clean +"lua require('scripts.gen_help_html')"
    output dir: /…/neovim.github.io/_site/doc/
    generated (207  errors): api.txt         => api.html
    generated (122  errors): arabic.txt      => arabic.html
    generated (285  errors): autocmd.txt     => autocmd.html
    generated (641  errors): builtin.txt     => builtin.html
    generated (623  errors): change.txt      => change.html
    generated (65   errors): channel.txt     => channel.html
    generated (353  errors): cmdline.txt     => cmdline.html
    generated (3    errors): debug.txt       => debug.html
    generated (28   errors): deprecated.txt  => deprecated.html
    generated (193  errors): dev_style.txt   => dev_style.html
    generated (460  errors): develop.txt     => develop.html
    generated (19   errors): diagnostic.txt  => diagnostic.html
    generated (57   errors): diff.txt        => diff.html
    generated (818  errors): digraph.txt     => digraph.html
    generated (330  errors): editing.txt     => editing.html
    generated (368  errors): eval.txt        => eval.html
    generated (184  errors): fold.txt        => fold.html
    generated (61   errors): ft_ada.txt      => ft_ada.html
    generated (0    errors): ft_ps1.txt      => ft_ps1.html
    generated (20   errors): ft_raku.txt     => ft_raku.html
    generated (5    errors): ft_rust.txt     => ft_rust.html
    generated (41   errors): ft_sql.txt      => ft_sql.html
    generated (110  errors): gui.txt         => gui.html
    generated (79   errors): hebrew.txt      => hebrew.html
    generated (17   errors): help.txt        => index.html
    generated (104  errors): helphelp.txt    => helphelp.html
    generated (0    errors): if_cscop.txt    => if_cscop.html
    generated (23   errors): if_perl.txt     => if_perl.html
    generated (16   errors): if_pyth.txt     => if_pyth.html
    generated (9    errors): if_ruby.txt     => if_ruby.html
    generated (216  errors): indent.txt      => indent.html
    generated (634  errors): index.txt       => vimindex.html
    generated (320  errors): insert.txt      => insert.html
    generated (265  errors): intro.txt       => intro.html
    generated (9    errors): job_control.txt => job_control.html
    generated (0    errors): lsp-extension.txt => lsp-extension.html
    generated (214  errors): lsp.txt         => lsp.html
    generated (311  errors): lua.txt         => lua.html
    generated (592  errors): luaref.txt      => luaref.html
    generated (798  errors): luvref.txt      => luvref.html
    generated (663  errors): map.txt         => map.html
    generated (228  errors): mbyte.txt       => mbyte.html
    generated (228  errors): message.txt     => message.html
    generated (0    errors): mlang.txt       => mlang.html
    generated (761  errors): motion.txt      => motion.html
    generated (4    errors): nvim.txt        => nvim.html
    generated (226  errors): nvim_terminal_emulator.txt => nvim_terminal_emulator.html
    generated (988  errors): options.txt     => options.html
    generated (567  errors): pattern.txt     => pattern.html
    generated (15   errors): pi_gzip.txt     => pi_gzip.html
    generated (10   errors): pi_health.txt   => pi_health.html
    generated (27   errors): pi_msgpack.txt  => pi_msgpack.html
    generated (2177 errors): pi_netrw.txt    => pi_netrw.html
    generated (41   errors): pi_paren.txt    => pi_paren.html
    generated (9    errors): pi_spec.txt     => pi_spec.html
    generated (218  errors): pi_tar.txt      => pi_tar.html
    generated (0    errors): pi_tutor.txt    => pi_tutor.html
    generated (235  errors): pi_zip.txt      => pi_zip.html
    generated (265  errors): print.txt       => print.html
    generated (31   errors): provider.txt    => provider.html
    generated (335  errors): quickfix.txt    => quickfix.html
    generated (572  errors): quickref.txt    => quickref.html
    generated (109  errors): recover.txt     => recover.html
    generated (14   errors): remote.txt      => remote.html
    generated (14   errors): remote_plugin.txt => remote_plugin.html
    generated (351  errors): repeat.txt      => repeat.html
    generated (23   errors): rileft.txt      => rileft.html
    generated (12   errors): russian.txt     => russian.html
    generated (6    errors): scroll.txt      => scroll.html
    generated (106  errors): sign.txt        => sign.html
    generated (347  errors): spell.txt       => spell.html
    generated (784  errors): starting.txt    => starting.html
    generated (1499 errors): syntax.txt      => syntax.html
    generated (23   errors): tabpage.txt     => tabpage.html
    generated (257  errors): tagsrch.txt     => tagsrch.html
    generated (31   errors): term.txt        => term.html
    generated (0    errors): testing.txt     => testing.html
    generated (96   errors): tips.txt        => tips.html
    generated (57   errors): treesitter.txt  => treesitter.html
    generated (71   errors): uganda.txt      => uganda.html
    generated (74   errors): ui.txt          => ui.html
    generated (87   errors): undo.txt        => undo.html
    generated (17   errors): userfunc.txt    => userfunc.html
    generated (1    errors): usr_01.txt      => usr_01.html
    generated (89   errors): usr_02.txt      => usr_02.html
    generated (293  errors): usr_03.txt      => usr_03.html
    generated (46   errors): usr_04.txt      => usr_04.html
    generated (96   errors): usr_05.txt      => usr_05.html
    generated (54   errors): usr_06.txt      => usr_06.html
    generated (20   errors): usr_07.txt      => usr_07.html
    generated (241  errors): usr_08.txt      => usr_08.html
    generated (130  errors): usr_09.txt      => usr_09.html
    generated (50   errors): usr_10.txt      => usr_10.html
    generated (33   errors): usr_11.txt      => usr_11.html
    generated (32   errors): usr_12.txt      => usr_12.html
    generated (22   errors): usr_20.txt      => usr_20.html
    generated (75   errors): usr_21.txt      => usr_21.html
    generated (8    errors): usr_22.txt      => usr_22.html
    generated (3    errors): usr_23.txt      => usr_23.html
    generated (163  errors): usr_25.txt      => usr_25.html
    generated (13   errors): usr_26.txt      => usr_26.html
    generated (84   errors): usr_27.txt      => usr_27.html
    generated (173  errors): usr_28.txt      => usr_28.html
    generated (285  errors): usr_29.txt      => usr_29.html
    generated (280  errors): usr_30.txt      => usr_30.html
    generated (11   errors): usr_31.txt      => usr_31.html
    generated (13   errors): usr_32.txt      => usr_32.html
    generated (156  errors): usr_40.txt      => usr_40.html
    generated (134  errors): usr_41.txt      => usr_41.html
    generated (35   errors): usr_42.txt      => usr_42.html
    generated (19   errors): usr_43.txt      => usr_43.html
    generated (60   errors): usr_44.txt      => usr_44.html
    generated (13   errors): usr_45.txt      => usr_45.html
    generated (1    errors): usr_toc.txt     => usr_toc.html
    generated (69   errors): various.txt     => various.html
    generated (68   errors): vi_diff.txt     => vi_diff.html
    generated (437  errors): vim_diff.txt    => vim_diff.html
    generated (296  errors): visual.txt      => visual.html
    generated (181  errors): windows.txt     => windows.html
    generated 119 html pages
    total errors: 23862
    invalid tags: 537
---
 scripts/gen_help_html.py | 389 -----------------------------------------------
 1 file changed, 389 deletions(-)
 delete mode 100644 scripts/gen_help_html.py

(limited to 'scripts/gen_help_html.py')

diff --git a/scripts/gen_help_html.py b/scripts/gen_help_html.py
deleted file mode 100644
index 0b8e77ac22..0000000000
--- a/scripts/gen_help_html.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# Converts Vim/Nvim documentation to HTML.
-#
-# USAGE:
-#   1. python3 scripts/gen_help_html.py runtime/doc/ ~/neovim.github.io/t/
-#   3. cd ~/neovim.github.io/ && jekyll serve --host 0.0.0.0
-#   2. Visit http://localhost:4000/t/help.txt.html
-#
-# Adapted from https://github.com/c4rlo/vimhelp/
-# License: MIT
-#
-# Copyright (c) 2016 Carlo Teubner
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import os
-import re
-import urllib.parse
-import datetime
-import sys
-from itertools import chain
-
-HEAD = """\
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
-    "http://www.w3.org/TR/html4/loose.dtd">
-<html>
-<head>
-<meta http-equiv="Content-type" content="text/html; charset={encoding}"/>
-<style>
-.h {{
-  font-weight: bold;
-}}
-h1 {{
-  font-family: sans-serif;
-}}
-pre {{
-  font-family: sans-serif;
-}}
-</style>
-<title>Nvim: {filename}</title>
-"""
-
-HEAD_END = '</head>\n<body>\n'
-
-INTRO = """
-<h1>Nvim help files</h1>
-<p>
-<a href="https://neovim.io/">Nvim</a> help pages{vers-note}.
-Updated <a href="https://github.com/neovim/bot-ci" class="d">automatically</a>
-from the <a href="https://github.com/neovim/neovim" class="d">Nvim source</a>.
-</p>
-"""
-
-VERSION_NOTE = ", current as of Nvim {version}"
-
-SITENAVI_LINKS = """
-<a href="quickref.txt.html">Quick reference</a> &middot;
-<a href="usr_toc.txt.html">User manual</a> &middot;
-<a href="{helptxt}#reference_toc">Reference manual</a> &middot;
-"""
-
-SITENAVI_LINKS_PLAIN = SITENAVI_LINKS.format(helptxt='help.txt.html')
-SITENAVI_LINKS_WEB = SITENAVI_LINKS.format(helptxt='/')
-
-SITENAVI_PLAIN = '<p>' + SITENAVI_LINKS_PLAIN + '</p>'
-SITENAVI_WEB = '<p>' + SITENAVI_LINKS_WEB + '</p>'
-
-SITENAVI_SEARCH = '<table width="100%"><tbody><tr><td>' + SITENAVI_LINKS_WEB + \
-    '</td><td style="text-align: right; max-width: 25vw"><div class="gcse-searchbox">' \
-    '</div></td></tr></tbody></table><div class="gcse-searchresults"></div>'
-
-TEXTSTART = """
-<div id="d1">
-<pre id="sp">""" + (" " * 80) + """</pre>
-<div id="d2">
-<pre>
-"""
-
-FOOTER = '</pre>'
-
-FOOTER2 = """
-<p id="footer">Generated {generated_date} from <code>{commit}</code></p>
-</div>
-</div>
-</body>
-</html>
-""".format(
-    generated_date='{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()),
-    commit='?')
-
-RE_TAGLINE = re.compile(r'(\S+)\s+(\S+)')
-
-PAT_WORDCHAR = '[!#-)+-{}~\xC0-\xFF]'
-
-PAT_HEADER = r'(^.*~$)'
-PAT_GRAPHIC = r'(^.* `$)'
-PAT_PIPEWORD = r'(?<!\\)\|([#-)!+-~]+)\|'
-PAT_STARWORD = r'\*([#-)!+-~]+)\*(?:(?=\s)|$)'
-PAT_COMMAND = r'`([^` ]+)`'
-PAT_OPTWORD = r"('(?:[a-z]{2,}|t_..)')"
-PAT_CTRL = r'(CTRL-(?:W_)?(?:\{char\}|<[A-Za-z]+?>|.)?)'
-PAT_SPECIAL = r'(<.+?>|\{.+?}|' \
-    r'\[(?:range|line|count|offset|\+?cmd|[-+]?num|\+\+opt|' \
-    r'arg|arguments|ident|addr|group)]|' \
-    r'(?<=\s)\[[-a-z^A-Z0-9_]{2,}])'
-PAT_TITLE = r'(Vim version [0-9.a-z]+|VIM REFERENCE.*)'
-PAT_NOTE = r'((?<!' + PAT_WORDCHAR + r')(?:note|NOTE|Notes?):?' \
-    r'(?!' + PAT_WORDCHAR + r'))'
-PAT_URL = r'((?:https?|ftp)://[^\'"<> \t]+[a-zA-Z0-9/])'
-PAT_WORD = r'((?<!' + PAT_WORDCHAR + r')' + PAT_WORDCHAR + r'+' \
-    r'(?!' + PAT_WORDCHAR + r'))'
-
-RE_LINKWORD = re.compile(
-    PAT_OPTWORD + '|' +
-    PAT_CTRL + '|' +
-    PAT_SPECIAL)
-RE_TAGWORD = re.compile(
-    PAT_HEADER + '|' +
-    PAT_GRAPHIC + '|' +
-    PAT_PIPEWORD + '|' +
-    PAT_STARWORD + '|' +
-    PAT_COMMAND + '|' +
-    PAT_OPTWORD + '|' +
-    PAT_CTRL + '|' +
-    PAT_SPECIAL + '|' +
-    PAT_TITLE + '|' +
-    PAT_NOTE + '|' +
-    PAT_URL + '|' +
-    PAT_WORD)
-RE_NEWLINE = re.compile(r'[\r\n]')
-# H1 header "=====…"
-# H2 header "-----…"
-RE_HRULE = re.compile(r'[-=]{3,}.*[-=]{3,3}$')
-RE_EG_START = re.compile(r'(?:.* )?>$')
-RE_EG_END = re.compile(r'\S')
-RE_SECTION = re.compile(r'[-A-Z .][-A-Z0-9 .()]*(?=\s+\*)')
-RE_STARTAG = re.compile(r'\s\*([^ \t|]+)\*(?:\s|$)')
-RE_LOCAL_ADD = re.compile(r'LOCAL ADDITIONS:\s+\*local-additions\*$')
-
-
-class Link(object):
-    __slots__ = 'link_plain_same',    'link_pipe_same', \
-                'link_plain_foreign', 'link_pipe_foreign', \
-                'filename'
-
-    def __init__(self, link_plain_same, link_plain_foreign,
-                 link_pipe_same,  link_pipe_foreign, filename):
-        self.link_plain_same = link_plain_same
-        self.link_plain_foreign = link_plain_foreign
-        self.link_pipe_same = link_pipe_same
-        self.link_pipe_foreign = link_pipe_foreign
-        self.filename = filename
-
-
-class VimH2H(object):
-    def __init__(self, tags, version=None, is_web_version=True):
-        self._urls = {}
-        self._version = version
-        self._is_web_version = is_web_version
-        for line in RE_NEWLINE.split(tags):
-            m = RE_TAGLINE.match(line)
-            if m:
-                tag, filename = m.group(1, 2)
-                self.do_add_tag(filename, tag)
-
-    def add_tags(self, filename, contents):
-        for match in RE_STARTAG.finditer(contents):
-            tag = match.group(1).replace('\\', '\\\\').replace('/', '\\/')
-            self.do_add_tag(str(filename), tag)
-
-    def do_add_tag(self, filename, tag):
-        tag_quoted = urllib.parse.quote_plus(tag)
-
-        def mkpart1(doc):
-            return '<a href="' + doc + '#' + tag_quoted + '" class="'
-        part1_same = mkpart1('')
-        if self._is_web_version and filename == 'help.txt':
-            doc = '/'
-        else:
-            doc = filename + '.html'
-        part1_foreign = mkpart1(doc)
-        part2 = '">' + html_escape[tag] + '</a>'
-
-        def mklinks(cssclass):
-            return (part1_same + cssclass + part2,
-                    part1_foreign + cssclass + part2)
-        cssclass_plain = 'd'
-        m = RE_LINKWORD.match(tag)
-        if m:
-            opt, ctrl, special = m.groups()
-            if opt is not None:
-                cssclass_plain = 'o'
-            elif ctrl is not None:
-                cssclass_plain = 'k'
-            elif special is not None:
-                cssclass_plain = 's'
-        links_plain = mklinks(cssclass_plain)
-        links_pipe = mklinks('l')
-        self._urls[tag] = Link(
-            links_plain[0], links_plain[1],
-            links_pipe[0],  links_pipe[1],
-            filename)
-
-    def maplink(self, tag, curr_filename, css_class=None):
-        links = self._urls.get(tag)
-        if links is not None:
-            if links.filename == curr_filename:
-                if css_class == 'l':
-                    return links.link_pipe_same
-                else:
-                    return links.link_plain_same
-            else:
-                if css_class == 'l':
-                    return links.link_pipe_foreign
-                else:
-                    return links.link_plain_foreign
-        elif css_class is not None:
-            return '<span class="' + css_class + '">' + html_escape[tag] + \
-                '</span>'
-        else:
-            return html_escape[tag]
-
-    def to_html(self, filename, contents, encoding):
-        out = []
-
-        inexample = 0
-        filename = str(filename)
-        is_help_txt = (filename == 'help.txt')
-        last = ''
-        for line in RE_NEWLINE.split(contents):
-            line = line.rstrip('\r\n')
-            line_tabs = line
-            line = line.expandtabs()
-            if last == 'h1':
-                out.extend(('</pre>'))  # XXX
-                out.extend(('<h1>', line.rstrip(), '</h1>\n'))
-                out.extend(('<pre>'))
-                last = ''
-                continue
-            if RE_HRULE.match(line):
-                # out.extend(('<span class="h">', line, '</span>\n'))
-                last = 'h1'
-                continue
-            if inexample == 2:
-                if RE_EG_END.match(line):
-                    inexample = 0
-                    if line[0] == '<':
-                        line = line[1:]
-                else:
-                    out.extend(('<span class="e">', html_escape[line],
-                                '</span>\n'))
-                    continue
-            if RE_EG_START.match(line_tabs):
-                inexample = 1
-                line = line[0:-1]
-            if RE_SECTION.match(line_tabs):
-                m = RE_SECTION.match(line)
-                out.extend((r'<span class="c">', m.group(0), r'</span>'))
-                line = line[m.end():]
-            lastpos = 0
-            for match in RE_TAGWORD.finditer(line):
-                pos = match.start()
-                if pos > lastpos:
-                    out.append(html_escape[line[lastpos:pos]])
-                lastpos = match.end()
-                header, graphic, pipeword, starword, command, opt, ctrl, \
-                    special, title, note, url, word = match.groups()
-                if pipeword is not None:
-                    out.append(self.maplink(pipeword, filename, 'l'))
-                elif starword is not None:
-                    out.extend(('<a name="', urllib.parse.quote_plus(starword),
-                                '" class="t">', html_escape[starword], '</a>'))
-                elif command is not None:
-                    out.extend(('<span class="e">', html_escape[command],
-                                '</span>'))
-                elif opt is not None:
-                    out.append(self.maplink(opt, filename, 'o'))
-                elif ctrl is not None:
-                    out.append(self.maplink(ctrl, filename, 'k'))
-                elif special is not None:
-                    out.append(self.maplink(special, filename, 's'))
-                elif title is not None:
-                    out.extend(('<span class="i">', html_escape[title],
-                                '</span>'))
-                elif note is not None:
-                    out.extend(('<span class="n">', html_escape[note],
-                                '</span>'))
-                elif header is not None:
-                    out.extend(('<span class="h">', html_escape[header[:-1]],
-                                '</span>'))
-                elif graphic is not None:
-                    out.append(html_escape[graphic[:-2]])
-                elif url is not None:
-                    out.extend(('<a class="u" href="', url, '">' +
-                                html_escape[url], '</a>'))
-                elif word is not None:
-                    out.append(self.maplink(word, filename))
-            if lastpos < len(line):
-                out.append(html_escape[line[lastpos:]])
-            out.append('\n')
-            if inexample == 1:
-                inexample = 2
-
-        header = []
-        header.append(HEAD.format(encoding=encoding, filename=filename))
-        header.append(HEAD_END)
-        if self._is_web_version and is_help_txt:
-            vers_note = VERSION_NOTE.replace('{version}', self._version) \
-                if self._version else ''
-            header.append(INTRO.replace('{vers-note}', vers_note))
-        if self._is_web_version:
-            header.append(SITENAVI_SEARCH)
-            sitenavi_footer = SITENAVI_WEB
-        else:
-            header.append(SITENAVI_PLAIN)
-            sitenavi_footer = SITENAVI_PLAIN
-        header.append(TEXTSTART)
-        return ''.join(chain(header, out, (FOOTER, sitenavi_footer, FOOTER2)))
-
-
-class HtmlEscCache(dict):
-    def __missing__(self, key):
-        r = key.replace('&', '&amp;') \
-               .replace('<', '&lt;') \
-               .replace('>', '&gt;')
-        self[key] = r
-        return r
-
-
-html_escape = HtmlEscCache()
-
-
-def slurp(filename):
-    try:
-        with open(filename, encoding='UTF-8') as f:
-            return f.read(), 'UTF-8'
-    except UnicodeError:
-        # 'ISO-8859-1' ?
-        with open(filename, encoding='latin-1') as f:
-            return f.read(), 'latin-1'
-
-
-def usage():
-    return "usage: " + sys.argv[0] + " IN_DIR OUT_DIR [BASENAMES...]"
-
-
-def main():
-    if len(sys.argv) < 3:
-        sys.exit(usage())
-
-    in_dir = sys.argv[1]
-    out_dir = sys.argv[2]
-    basenames = sys.argv[3:]
-
-    print("Processing tags...")
-    h2h = VimH2H(slurp(os.path.join(in_dir, 'tags'))[0], is_web_version=False)
-
-    if len(basenames) == 0:
-        basenames = os.listdir(in_dir)
-
-    for basename in basenames:
-        if os.path.splitext(basename)[1] != '.txt' and basename != 'tags':
-            print("Ignoring " + basename)
-            continue
-        print("Processing " + basename + "...")
-        path = os.path.join(in_dir, basename)
-        text, encoding = slurp(path)
-        outpath = os.path.join(out_dir, basename + '.html')
-        of = open(outpath, 'w')
-        of.write(h2h.to_html(basename, text, encoding))
-        of.close()
-
-
-main()
-- 
cgit