4 files changed, 544 insertions, 16 deletions
diff --git a/test/unit/formatc.lua b/test/unit/formatc.lua
new file mode 100644
index 0000000000..64e651e8da
--- /dev/null
+++ b/test/unit/formatc.lua
@@ -0,0 +1,236 @@
+--[[ Copyright (c) 2009 Peter "Corsix" Cawley
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE. --]]
+
+-- this C parser was taken from Corsix-TH, I'm sure this could be done much
+-- better (i.e.: I think everything I do could be substitutions made with LPeg
+-- during parsing), but I've just learned enough basic LPeg to make this
+-- work.
+-- see: http://lua-users.org/wiki/LpegRecipes
+
+local lpeg = require "lpeg"
+
+local C, P, R, S, V = lpeg.C, lpeg.P, lpeg.R, lpeg.S, lpeg.V
+local Carg, Cc, Cp, Ct = lpeg.Carg, lpeg.Cc, lpeg.Cp, lpeg.Ct
+
+-- LPeg grammar that splits C source into a list of {text, type} tokens
+local tokens = P { "tokens";
+  -- Comment of form /* ... */
+  comment = Ct(P"/*" * C((V"newline" + (1 - P"*/"))^0) * P"*/" * Cc"comment"),
+
+  -- Single line comment
+  line_comment = Ct(P"//" * C((1 - V"newline")^0)  * Cc"comment_line"),
+
+  -- Single platform independent line break which increments line number
+  newline = (P"\r\n" + P"\n\r" + S"\r\n") * (Cp() * Carg(1)) / function(pos, state)
+    state.line = state.line + 1
+    state.line_start = pos
+  end,
+
+  -- Line continuation
+  line_extend = Ct(C(P[[\]] * V"newline") * Cc"line_extend"),
+
+  -- Whitespace of any length (includes newlines)
+  whitespace = Ct(C((S" \t" + V"newline")^1) * Cc"whitespace"),
+
+  -- Special form of #include with filename followed in angled brackets (matches 3 tokens)
+  include = Ct(C(P"#include") * Cc"preprocessor") *
+            Ct(C(S" \t"^1) * Cc"whitespace") *
+            Ct(C(P"<" * (1 - P">")^1 * P">") * Cc"string"),
+
+  -- Preprocessor instruction
+  preprocessor = V"include" +
+                 Ct(C(P"#" * P" "^0 * ( P"define" + P"elif" + P"else" + P"endif" + P"#" +
+                               P"error" + P"ifdef" + P"ifndef" + P"if" + P"import" +
+                               P"include" + P"line" + P"pragma" + P"undef" + P"using"
+                             ) * #S" \r\n\t") * Cc"preprocessor"),
+
+  -- Identifier of form [a-zA-Z_][a-zA-Z0-9_]*
+  identifier = Ct(C(R("az","AZ","__") * R("09","az","AZ","__")^0) * Cc"identifier"),
+
+  -- Single character in a string
+  string_char = R("az","AZ","09") + S"$%^&*()_-+={[}]:;@~#<,>.!?/ \t" + (P"\\" * S[[ntvbrfa\?'"0x]]),
+
+  -- String literal
+  string = Ct(C(P"'" * (V"string_char" + P'"')^0 * P"'" +
+                P'"' * (V"string_char" + P"'")^0 * P'"') * Cc"string"),
+
+  -- Operator
+  operator = Ct(C(P">>=" + P"<<=" + P"..." +
+                  P"::" + P"<<" + P">>" + P"<=" + P">=" + P"==" + P"!=" +
+                  P"||" + P"&&" + P"++" + P"--" + P"->" + P"+=" + P"-=" +
+                  P"*=" + P"/=" + P"|=" + P"&=" + P"^=" + S"+-*/=<>%^|&.?:!~,") * Cc"operator"),
+
+  -- Misc. char (token type is the character itself)
+  char = Ct(C(S"[]{}();") / function(x) return x, x end),
+
+  -- Hex, octal or decimal number
+  int = Ct(C((P"0x" * R("09","af","AF")^1) + (P"0" * R"07"^0) + R"09"^1) * Cc"integer"),
+
+  -- Floating point number (exponent: e/E, optional sign, digits, in sequence)
+  f_exponent = S"eE" * S"+-"^-1 * R"09"^1,
+  f_terminator = S"fFlL",
+  float = Ct(C(
+            R"09"^1 * V"f_exponent" * V"f_terminator"^-1 +
+            R"09"^0 * P"." * R"09"^1 * V"f_exponent"^-1 * V"f_terminator"^-1 +
+            R"09"^1 * P"." * R"09"^0 * V"f_exponent"^-1 * V"f_terminator"^-1
+          ) * Cc"float"),
+
+  -- Any token
+  token = V"comment" +
+          V"line_comment" +
+          V"identifier" +
+          V"whitespace" +
+          V"line_extend" +
+          V"preprocessor" +
+          V"string" +
+          V"char" +
+          V"operator" +
+          V"float" +
+          V"int",
+
+  -- Error for when nothing else matches
+  error = (Cp() * C(P(1) ^ -8) * Carg(1)) / function(pos, where, state)
+    error(("Tokenising error on line %i, position %i, near '%s'")
+      :format(state.line, pos - state.line_start + 1, where))
+  end,
+
+  -- Match end of input or throw error
+  finish = -P(1) + V"error",
+
+  -- Match stream of tokens into a table
+  tokens = Ct(V"token" ^ 0) * V"finish",
+}
+
+local function TokeniseC(str)  -- returns a list of {text, type} tokens
+  return lpeg.match(tokens, str, 1, {line_start = 1, line = 1})
+end
+
+local function set(t)  -- maps every element of the sequence t to true
+  local lookup = {}
+  for _, member in ipairs(t) do
+    lookup[member] = true
+  end
+  return lookup
+end
+
+local C_keywords = set {
+  "break", "case", "char", "const", "continue", "default", "do", "double",
+  "else", "enum", "extern", "float", "for", "goto", "if", "int", "long",
+  "register", "return", "short", "signed", "sizeof", "static", "struct",
+  "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
+}
+
+-- Very primitive C formatter that tries to put "things" inside braces on one
+-- line. This is a step done after preprocessing the C source to ensure that
+-- the duplicate line detector can more reliably pick out identical declarations.
+--
+-- an example:
+--   struct mystruct
+--   {
+--      int a;
+--      int b;
+--   };
+--
+-- would become:
+--  struct mystruct
+--  { int a; int b; };
+--
+--  The first one will have a lot of false positives (the line '{' for
+--  example), the second one is more unique.
+local function formatc(str)
+  local out = {}
+  -- current brace nesting depth
+  local depth = 0
+  -- set by a preprocessor directive: the next whitespace run containing a
+  -- line break collapses to "\n" instead of " "
+  local keep_newline = false
+  -- set by 'static': the '}' that returns to depth 0 gets a trailing newline
+  local newline_after_brace = false
+
+  for _, tok in ipairs(TokeniseC(str)) do
+    local text, kind = tok[1], tok[2]
+
+    if kind == '{' then
+      depth = depth + 1
+    elseif kind == '}' then
+      depth = depth - 1
+
+      -- static usually indicates an inline header function, which has no
+      -- trailing ';', so the newline is appended to its closing brace.
+      if depth == 0 and newline_after_brace then
+        text = text .. "\n"
+        newline_after_brace = false
+      end
+    elseif kind == 'identifier' then
+      if text == 'static' then
+        newline_after_brace = true
+      end
+    elseif kind == 'preprocessor' then
+      -- preprocessor directives don't end in ';' but need their newline, so
+      -- the next newline is allowed to pass.
+      keep_newline = true
+    elseif kind == ';' then
+      -- outside of any block a ';' ends a statement, so it is followed by a
+      -- newline.
+      if depth == 0 then
+        text = ";\n"
+      end
+    elseif kind == 'whitespace' then
+      -- every whitespace run becomes a single space, unless a newline was
+      -- requested and the run contains a line break: then it becomes one
+      -- newline.
+      local repl = " "
+      if keep_newline and string.find(text, "[\r\n]+") then
+        repl = "\n"
+        keep_newline = false
+      end
+
+      text = string.gsub(text, "%s+", repl)
+    end
+
+    out[#out + 1] = text
+  end
+
+  return table.concat(out)
+end
+
+-- uncomment the following lines (and comment the return) for standalone
+-- operation (very handy for debugging)
+local function standalone(...)  -- prints the preprocessed file named by arg[1]
+  require "moonscript"
+  local Preprocess = require("preprocess")  -- local: don't leak a global
+  Preprocess.add_to_include_path('./../../src')
+
+  local input = Preprocess.preprocess_stream(arg[1])  -- local, as above
+  local raw = input:read('*all')
+  input:close()
+
+  local formatted
+  if #arg == 2 and arg[2] == 'no' then  -- second argument 'no': skip formatc
+      formatted = raw
+  else
+      formatted = formatc(raw)
+  end
+
+  print(formatted)
+end
+-- standalone(...)
+
+return formatc
diff --git a/test/unit/helpers.moon b/test/unit/helpers.moon
index 77d491a008..e84c569143 100644
--- a/test/unit/helpers.moon
+++ b/test/unit/helpers.moon
@@ -1,4 +1,19 @@
 ffi = require 'ffi'
+lpeg = require 'lpeg'
+formatc = require 'test.unit.formatc'
+Set = require 'test.unit.set'
+Preprocess = require 'test.unit.preprocess'
+
+-- add some standard header locations
+-- TODO(aktau, jszakmeister): optionally pass more header locations via env
+Preprocess.add_to_include_path('./src')
+Preprocess.add_to_include_path('./.deps/usr/include')
+Preprocess.add_to_include_path('./build/config')
+
+if ffi.abi('32bit')
+  Preprocess.add_to_include_path('/opt/neovim-deps/32/include')
+else
+  Preprocess.add_to_include_path('/opt/neovim-deps/include')
 
 -- load neovim shared library
 testlib = os.getenv 'NVIM_TEST_LIB'
@@ -7,22 +22,72 @@ unless testlib
 
 libnvim = ffi.load testlib
 
--- Luajit ffi parser doesn't understand preprocessor directives, so
--- this helper function removes common directives before passing it the to ffi.
--- It will return a pointer to the library table, emulating 'requires'
-cimport = (path) ->
-  header_file = io.open path, 'rb'
-
-  if not header_file
-    error "cannot find #{path}"
-
-  header = header_file\read '*a'
-  header_file.close!
-  header = string.gsub header, '#include[^\n]*\n', ''
-  header = string.gsub header, '#ifndef[^\n]*\n', ''
-  header = string.gsub header, '#define[^\n]*\n', ''
-  header = string.gsub header, '#endif[^\n]*\n', ''
-  ffi.cdef header
+trim = (s) -> -- strips leading and trailing whitespace ('' if nothing is left)
+  string.match(s, '^%s*(.*%S)') or ''
+
+-- a Set that keeps around the lines we've already seen
+export cdefs
+if cdefs == nil
+  cdefs = Set!
+
+export imported
+if imported == nil
+  imported = Set!
+
+-- some things are just too complex for the LuaJIT C parser to digest. We
+-- usually don't need them anyway.
+filter_complex_blocks = (body) ->
+  -- plain-text markers of lines to drop: "(^)" is Objective-C block syntax,
+  -- which the LuaJIT ffi doesn't understand; "_ISwupper" lines go as well
+  is_complex = (line) ->
+    line\find("(^)", 1, true) or line\find("_ISwupper", 1, true)
+
+  kept = {}
+  for line in body\gmatch("[^\r\n]+")
+    unless is_complex(line)
+      kept[#kept + 1] = line
+  table.concat(kept, "\n")
+
+-- use this helper to import C files, you can pass multiple paths at once,
+-- this helper will return the C namespace of the nvim library.
+-- paths that were imported by an earlier call are skipped.
+cimport = (...) ->
+  -- filter out paths we've already imported
+  paths = [path for path in *{...} when not imported\contains(path)]
+  for path in *paths
+    imported\add(path)
+
+  if #paths == 0
+    return libnvim
+
+  -- preprocess the header
+  stream = Preprocess.preprocess_stream(unpack(paths))
+  body = stream\read("*a")
+  stream\close!
+
+  -- format it (so that the lines are "unique" statements), also filter out
+  -- Objective-C blocks
+  body = formatc(body)
+  body = filter_complex_blocks(body)
+
+  -- add the formatted lines to a set
+  new_cdefs = Set!
+  for line in body\gmatch("[^\r\n]+")
+    new_cdefs\add(trim(line))
+
+  -- subtract the lines we've already imported from the new lines, then add
+  -- the new unique lines to the old lines (so they won't be imported again)
+  new_cdefs\diff(cdefs)
+  cdefs\union(new_cdefs)
+
+  if new_cdefs\size! == 0
+    -- if there's no new lines, just return
+    return libnvim
+
+  -- request a sorted version of the new lines (same relative order as the
+  -- original preprocessed file) and feed that to the LuaJIT ffi
+  new_lines = new_cdefs\to_table!
+  ffi.cdef(table.concat(new_lines, "\n"))
 
   return libnvim
 
diff --git a/test/unit/preprocess.moon b/test/unit/preprocess.moon
new file mode 100644
index 0000000000..88580476b2
--- /dev/null
+++ b/test/unit/preprocess.moon
@@ -0,0 +1,155 @@
+-- helps manage loading of different headers into the LuaJIT ffi. Untested on
+-- windows, will probably need quite a bit of adjustment to run there.
+
+ffi = require("ffi")
+
+ccs = {}
+
+env_cc = os.getenv("CC")
+if env_cc
+  table.insert(ccs, {path: "/usr/bin/env #{env_cc}", type: "gcc"})
+
+if ffi.os == "Windows"
+  table.insert(ccs, {path: "cl", type: "msvc"})
+
+table.insert(ccs, {path: "/usr/bin/env cc", type: "gcc"})
+table.insert(ccs, {path: "/usr/bin/env gcc", type: "gcc"})
+table.insert(ccs, {path: "/usr/bin/env gcc-4.9", type: "gcc"})
+table.insert(ccs, {path: "/usr/bin/env gcc-4.8", type: "gcc"})
+table.insert(ccs, {path: "/usr/bin/env gcc-4.7", type: "gcc"})
+table.insert(ccs, {path: "/usr/bin/env clang", type: "clang"})
+table.insert(ccs, {path: "/usr/bin/env icc", type: "gcc"})
+
+quote_me = '[^%w%+%-%=%@%_%/]' -- complement (needn't quote)
+shell_quote = (str) ->
+  -- strings made only of safe characters are returned untouched
+  return str unless str == '' or string.find(str, quote_me)
+  escaped = string.gsub(str, "'", [['"'"']])
+  "'" .. escaped .. "'"
+
+-- parse Makefile format dependencies into a Lua list of normalized paths
+parse_make_deps = (deps) ->
+  -- remove line breaks and line concatenators
+  deps = deps\gsub("\n", "")\gsub("\\", "")
+
+  -- remove the Makefile "target:" element
+  deps = deps\gsub(".+:", "")
+
+  -- remove redundant spaces
+  deps = deps\gsub("  +", " ")
+
+  -- split according to token (space in this case)
+  headers = {}
+  for token in deps\gmatch("[^%s]+")
+    headers[#headers + 1] = token
+
+  -- resolve path redirections (..) to normalize all paths
+  for i, v in ipairs(headers)
+    -- double dots (..): drop "/dir/.." pairs
+    v = v\gsub("/[^/%s]+/%.%.", "")
+
+    -- single dot (.): drop "./" only as a whole path segment so that "../"
+    -- survives; chained on the result above so that substitution isn't lost
+    headers[i] = v\gsub("%f[^/%z]%./", "")
+
+  headers
+
+-- will produce a string that represents a meta C header file that includes
+-- all the passed in headers. I.e.:
+--
+-- headerize({"stdio.h", "math.h"}, true)
+-- produces:
+-- #include <stdio.h>
+-- #include <math.h>
+--
+-- headerize({"vim.h", "memory.h"}, false)
+-- produces:
+-- #include "vim.h"
+-- #include "memory.h"
+headerize = (headers, global) ->
+  -- angle brackets when global is set, double quotes otherwise
+  open, close = '"', '"'
+  if global
+    open, close = "<", ">"
+  lines = {}
+  for hdr in *headers
+    lines[#lines + 1] = "#include #{open}#{hdr}#{close}"
+  table.concat(lines, "\n")
+
+class Gcc
+  -- preprocessor flags that will hopefully make the compiler produce C
+  -- declarations that the LuaJIT ffi understands.
+  @@preprocessor_extra_flags = {
+   '-D "aligned(ARGS)="',
+   '-D "__attribute__(ARGS)="',
+   '-D "__asm(ARGS)="',
+   '-D "__asm__(ARGS)="',
+   '-D "__inline__="',
+   '-D_GNU_SOURCE'
+  }
+
+  new: (path) =>
+    @path = path
+
+  add_to_include_path: (...) =>
+      paths = {...}
+      for path in *paths
+          directive = '-I ' .. '"' .. path .. '"'
+          @@preprocessor_extra_flags[#@@preprocessor_extra_flags + 1] = directive
+
+  -- returns a list of the headers files upon which this file relies
+  dependencies: (hdr) =>
+    out = io.popen("#{@path} -M #{hdr} 2>&1")
+    deps = out\read("*a")
+    out\close!
+
+    if deps
+      parse_make_deps(deps)
+    else
+      nil
+
+  -- returns a stream representing a preprocessed form of the passed-in
+  -- headers. Don't forget to close the stream by calling the close() method
+  -- on it.
+  preprocess_stream: (...) =>
+    paths = {...}
+    -- create pseudo-header; it is spliced into the command by interpolation
+    -- because a gsub replacement string treats '%' specially, which breaks
+    -- header paths that contain it
+    pseudoheader = headerize(paths, false)
+    defines = table.concat(@@preprocessor_extra_flags, ' ')
+    cmd = "echo #{shell_quote(pseudoheader)} | #{@path} #{defines} -std=c99 -P -E -"
+    -- lfs = require("lfs")
+    -- io.stderr\write("CWD: #{lfs.currentdir!}\n")
+    -- io.stderr\write("CMD: #{cmd}\n")
+    io.popen(cmd)
+
+class Clang extends Gcc
+class Msvc extends Gcc
+
+type_to_class = {
+  "gcc": Gcc,
+  "clang": Clang,
+  "msvc": Msvc
+}
+
+find_best_cc = (ccs) ->
+  for _, meta in ipairs(ccs) -- ipairs: candidates are tried in list order
+    version = io.popen("#{meta.path} -v 2>&1")
+    if version -- only close the handle when popen actually returned one
+      version\close!
+      return type_to_class[meta.type](meta.path)
+  nil
+
+-- find the best cc. If os.exec causes problems on windows (like popping up
+-- a console window) we might consider using something like this:
+-- http://scite-ru.googlecode.com/svn/trunk/pack/tools/LuaLib/shell.html#exec
+cc = nil
+if cc == nil
+  cc = find_best_cc(ccs)
+
+return {
+  includes: (hdr) -> cc\dependencies(hdr)
+  preprocess_stream: (...) -> cc\preprocess_stream(...)
+  add_to_include_path: (...) -> cc\add_to_include_path(...)
+}
diff --git a/test/unit/set.moon b/test/unit/set.moon
new file mode 100644
index 0000000000..daa312a2f4
--- /dev/null
+++ b/test/unit/set.moon
@@ -0,0 +1,72 @@
+-- a set class for fast union/diff, can always return a table with the lines
+-- in the same relative order in which they were added by calling the
+-- to_table method. It does this by keeping two lua tables that mirror each
+-- other:
+-- 1) index => item
+-- 2) item => index
+class Set
+  -- items: optional table whose values become the initial members
+  new: (items) =>
+    @tbl = {}      -- index => item (may contain gaps after removals)
+    @items = {}    -- item => index
+    @nelem = 0     -- number of members
+    -- next free slot in @tbl; kept separately because #@tbl is unreliable
+    -- once removals have punched holes into @tbl
+    @next_idx = 1
+    if type(items) == 'table'
+      @union_table(items)
+
+  -- adds the argument Set to this Set
+  union: (other) =>
+    for e in other\iterator!
+      @add(e)
+
+  -- adds the argument table to this Set
+  union_table: (t) =>
+    for k,v in pairs(t)
+      @add(v)
+
+  -- subtracts the argument Set from this Set
+  diff: (other) =>
+    if other\size! > @size!
+      -- this set is smaller than the other set
+      for e in @iterator!
+        if other\contains(e)
+          @remove(e)
+    else
+      -- this set is larger than the other set
+      for e in other\iterator!
+        if @items[e]
+          @remove(e)
+
+  add: (it) =>
+    if not @contains(it)
+      idx = @next_idx
+      @next_idx = idx + 1
+      @tbl[idx] = it
+      @items[it] = idx
+      @nelem += 1
+
+  remove: (it) =>
+    if @contains(it)
+      idx = @items[it]
+      @tbl[idx] = nil
+      @items[it] = nil
+      @nelem -= 1
+
+  contains: (it) =>
+    @items[it] or false
+
+  size: => @nelem
+  raw_tbl: => @tbl
+  raw_items: => @items
+  iterator: => pairs(@items)
+
+  to_table: =>
+    -- there might be gaps in @tbl, so we have to be careful and sort first
+    keys = [idx for idx, _ in pairs(@tbl)]
+    table.sort(keys)
+    copy = [@tbl[idx] for idx in *keys]
+    copy
+
+return Set