aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- codegen/src/ext/mod.rs 2
-rw-r--r-- codegen/src/ext/utf8.rs 386
-rw-r--r-- codegen/src/ext/vt.rs (renamed from codegen/src/ext.rs) 10
-rw-r--r-- codegen/src/main.rs 15
-rw-r--r-- examples/parselog.rs 2
-rw-r--r-- src/definitions.rs 8
-rw-r--r-- src/lib.rs 49
-rw-r--r-- src/table.rs 19
-rw-r--r-- src/table.rs.in 14
-rw-r--r-- src/utf8/mod.rs 91
-rw-r--r-- src/utf8/table.rs 184
-rw-r--r-- src/utf8/table.rs.in 60
-rw-r--r-- src/utf8/types.rs 77
13 files changed, 887 insertions, 30 deletions
diff --git a/codegen/src/ext/mod.rs b/codegen/src/ext/mod.rs
new file mode 100644
index 0000000..c28d9f7
--- /dev/null
+++ b/codegen/src/ext/mod.rs
@@ -0,0 +1,2 @@
+pub mod utf8;
+pub mod vt;
diff --git a/codegen/src/ext/utf8.rs b/codegen/src/ext/utf8.rs
new file mode 100644
index 0000000..5b73081
--- /dev/null
+++ b/codegen/src/ext/utf8.rs
@@ -0,0 +1,386 @@
+//! Macro expansion for the utf8 parser state table
+use std::fmt;
+
+use syntex::Registry;
+
+use syntex_syntax::ast::{self, ExprKind, Arm, Expr, PatKind, LitKind, Pat};
+use syntex_syntax::codemap::Span;
+use syntex_syntax::ext::base::{ExtCtxt, MacEager, MacResult, DummyResult};
+use syntex_syntax::ext::build::AstBuilder;
+use syntex_syntax::parse::token::{Token, DelimToken};
+use syntex_syntax::parse::parser::Parser;
+use syntex_syntax::parse::PResult;
+use syntex_syntax::ptr::P;
+use syntex_syntax::tokenstream::TokenTree;
+
+#[path="../../../src/utf8/types.rs"]
+mod types;
+
+use self::types::{State, Action, pack};
+
+pub fn register(registry: &mut Registry) {
+ registry.add_macro("utf8_state_table", expand_state_table);
+}
+
+fn state_from_str<S>(s: &S) -> Result<State, ()>
+ where S: AsRef<str>
+{
+ Ok(match s.as_ref() {
+ "State::Ground" => State::Ground,
+ "State::Tail3" => State::Tail3,
+ "State::Tail2" => State::Tail2,
+ "State::Tail1" => State::Tail1,
+ "State::U3_2_e0" => State::U3_2_e0,
+ "State::U3_2_ed" => State::U3_2_ed,
+ "State::Utf8_4_3_f0" => State::Utf8_4_3_f0,
+ "State::Utf8_4_3_f4" => State::Utf8_4_3_f4,
+ _ => return Err(())
+ })
+}
+
+fn action_from_str<S>(s: &S) -> Result<Action, ()>
+ where S: AsRef<str>
+{
+ Ok(match s.as_ref() {
+ "Action::InvalidSequence" => Action::InvalidSequence,
+ "Action::EmitByte" => Action::EmitByte,
+ "Action::SetByte1" => Action::SetByte1,
+ "Action::SetByte2" => Action::SetByte2,
+ "Action::SetByte2Top" => Action::SetByte2Top,
+ "Action::SetByte3" => Action::SetByte3,
+ "Action::SetByte3Top" => Action::SetByte3Top,
+ "Action::SetByte4" => Action::SetByte4,
+ _ => return Err(())
+ })
+}
+
+fn parse_table_input_mappings<'a>(parser: &mut Parser<'a>) -> PResult<'a, Vec<Arm>> {
+ // Must start on open brace
+ try!(parser.expect(&Token::OpenDelim(DelimToken::Brace)));
+
+ let mut arms: Vec<Arm> = Vec::new();
+ while parser.token != Token::CloseDelim(DelimToken::Brace) {
+ match parser.parse_arm() {
+ Ok(arm) => arms.push(arm),
+ Err(e) => {
+ // No recovery is attempted; the parse error is returned to the caller.
+ return Err(e);
+ }
+ }
+ }
+
+ // Consume the closing brace
+ parser.bump();
+ Ok(arms)
+}
+
+/// Expressions describing state transitions and actions
+#[derive(Debug)]
+struct TableDefinitionExprs {
+ state_expr: P<Expr>,
+ mapping_arms: Vec<Arm>,
+}
+
+fn state_from_expr(expr: P<Expr>, cx: &mut ExtCtxt) -> Result<State, ()> {
+ let s = match expr.node {
+ ExprKind::Path(ref _qself, ref path) => {
+ path.to_string()
+ },
+ _ => {
+ cx.span_err(expr.span, "expected State");
+ return Err(())
+ }
+ };
+
+ state_from_str(&s).map_err(|_| {
+ cx.span_err(expr.span, "expected State");
+ ()
+ })
+}
+
+fn u8_lit_from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<u8, ()> {
+ static MSG: &'static str = "expected u8 int literal";
+
+ match expr.node {
+ ExprKind::Lit(ref lit) => {
+ match lit.node {
+ LitKind::Int(val, _) => {
+ Ok(val as u8)
+ },
+ _ => {
+ cx.span_err(lit.span, MSG);
+ return Err(());
+ }
+ }
+ },
+ _ => {
+ cx.span_err(expr.span, MSG);
+ return Err(());
+ }
+ }
+}
+
+fn input_mapping_from_arm(arm: Arm, cx: &mut ExtCtxt) -> Result<InputMapping, ()> {
+ let Arm { pats, body, .. } = arm;
+
+ let input = try!(InputDefinition::from_pat(&pats[0], cx));
+ let transition = try!(Transition::from_expr(&body, cx));
+
+ Ok(InputMapping {
+ input: input,
+ transition: transition,
+ })
+}
+
+/// What happens when certain input is received
+#[derive(Copy, Clone)]
+enum Transition {
+ State(State),
+ Action(Action),
+ StateAction(State, Action),
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ Transition::State(state) => try!(write!(f, "State({:?})", state)),
+ Transition::Action(action) => try!(write!(f, "Action({:?})", action)),
+ Transition::StateAction(state, action) => {
+ try!(write!(f, "StateAction({:?}, {:?})", state, action));
+ }
+ }
+
+ write!(f, " -> {:?}", self.pack_u8())
+ }
+}
+
+impl Transition {
+ // Action is stored in the top 4 bits, state in the low 4 bits
+ fn pack_u8(&self) -> u8 {
+ match *self {
+ Transition::State(state) => pack(state, Action::InvalidSequence),
+ Transition::Action(action) => pack(State::Ground, action),
+ Transition::StateAction(state, action) => pack(state, action),
+ }
+ }
+}
+
+impl Transition {
+ fn from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<Transition, ()> {
+ match expr.node {
+ ExprKind::Tup(ref tup_exprs) => {
+ let mut action = None;
+ let mut state = None;
+
+ for tup_expr in tup_exprs {
+ if let ExprKind::Path(_, ref path) = tup_expr.node {
+ let path_str = path.to_string();
+ if path_str.starts_with('A') {
+ action = Some(try!(action_from_str(&path_str)
+ .map_err(|_| {
+ cx.span_err(expr.span, "invalid action");
+ })));
+ } else {
+ state = Some(try!(state_from_str(&path_str)
+ .map_err(|_| {
+ cx.span_err(expr.span, "invalid state");
+ })));
+ }
+ }
+ }
+
+ match (action, state) {
+ (Some(action), Some(state)) => Ok(Transition::StateAction(state, action)),
+ (None, Some(state)) => Ok(Transition::State(state)),
+ (Some(action), None) => Ok(Transition::Action(action)),
+ _ => {
+ cx.span_err(expr.span, "expected Action and/or State");
+ Err(())
+ }
+ }
+ },
+ ExprKind::Path(_, ref path) => {
+ // Path can be Action or State
+ let path_str = path.to_string();
+
+ if path_str.starts_with('A') {
+ let action = try!(action_from_str(&path_str)
+ .map_err(|_| {
+ cx.span_err(expr.span, "invalid action");
+ }));
+ Ok(Transition::Action(action))
+ } else {
+ let state = try!(state_from_str(&path_str)
+ .map_err(|_| {
+ cx.span_err(expr.span, "invalid state");
+ }));
+
+ Ok(Transition::State(state))
+ }
+ },
+ _ => {
+ cx.span_err(expr.span, "expected Action and/or State");
+ Err(())
+ }
+ }
+ }
+}
+
+#[derive(Debug)]
+enum InputDefinition {
+ Specific(u8),
+ Range { start: u8, end: u8 }
+}
+
+impl InputDefinition {
+ fn from_pat(pat: &Pat, cx: &mut ExtCtxt) -> Result<InputDefinition, ()> {
+ Ok(match pat.node {
+ PatKind::Lit(ref lit_expr) => {
+ InputDefinition::Specific(try!(u8_lit_from_expr(&lit_expr, cx)))
+ },
+ PatKind::Range(ref start_expr, ref end_expr) => {
+ InputDefinition::Range {
+ start: try!(u8_lit_from_expr(start_expr, cx)),
+ end: try!(u8_lit_from_expr(end_expr, cx)),
+ }
+ },
+ _ => {
+ cx.span_err(pat.span, "expected literal or range expression");
+ return Err(())
+ }
+ })
+ }
+}
+
+#[derive(Debug)]
+struct InputMapping {
+ input: InputDefinition,
+ transition: Transition,
+}
+
+#[derive(Debug)]
+struct TableDefinition {
+ state: State,
+ mappings: Vec<InputMapping>,
+}
+
+fn parse_raw_definitions(
+ definitions: Vec<TableDefinitionExprs>,
+ cx: &mut ExtCtxt
+) -> Result<Vec<TableDefinition>, ()> {
+ let mut out = Vec::new();
+
+ for raw in definitions {
+ let TableDefinitionExprs { state_expr, mapping_arms } = raw;
+ let state = try!(state_from_expr(state_expr, cx));
+
+ let mut mappings = Vec::new();
+ for arm in mapping_arms {
+ mappings.push(try!(input_mapping_from_arm(arm, cx)));
+ }
+
+ out.push(TableDefinition {
+ state: state,
+ mappings: mappings,
+ })
+ }
+
+ Ok(out)
+}
+
+fn parse_table_definition<'a>(parser: &mut Parser<'a>) -> PResult<'a, TableDefinitionExprs> {
+ let state_expr = try!(parser.parse_expr());
+ try!(parser.expect(&Token::FatArrow));
+ let mappings = try!(parse_table_input_mappings(parser));
+
+ Ok(TableDefinitionExprs {
+ state_expr: state_expr,
+ mapping_arms: mappings
+ })
+}
+
+fn parse_table_definition_list<'a>(parser: &mut Parser<'a>)
+ -> PResult<'a, Vec<TableDefinitionExprs>>
+{
+ let mut definitions = Vec::new();
+ while parser.token != Token::Eof {
+ definitions.push(try!(parse_table_definition(parser)));
+ parser.eat(&Token::Comma);
+ }
+
+ Ok(definitions)
+}
+
+fn build_state_tables<T>(defs: T) -> [[u8; 256]; 8]
+ where T: AsRef<[TableDefinition]>
+{
+ let mut result = [[0u8; 256]; 8];
+
+ for def in defs.as_ref() {
+ let state = def.state;
+ let state = state as u8;
+ let transitions = &mut result[state as usize];
+
+ for mapping in &def.mappings {
+ let trans = mapping.transition.pack_u8();
+ match mapping.input {
+ InputDefinition::Specific(idx) => {
+ transitions[idx as usize] = trans;
+ },
+ InputDefinition::Range { start, end } => {
+ for idx in start..end {
+ transitions[idx as usize] = trans;
+ }
+ transitions[end as usize] = trans;
+ },
+ }
+ }
+ }
+
+ result
+}
+
+fn build_table_ast(cx: &mut ExtCtxt, sp: Span, table: [[u8; 256]; 8]) -> P<ast::Expr> {
+ let table = table.iter()
+ .map(|list| {
+ let exprs = list.iter()
+ .map(|num| cx.expr_u8(sp, *num))
+ .collect();
+ cx.expr_vec(sp, exprs)
+ })
+ .collect();
+
+ cx.expr_vec(sp, table)
+}
+
+fn expand_state_table<'cx>(
+ cx: &'cx mut ExtCtxt,
+ sp: Span,
+ args: &[TokenTree])
+ -> Box<MacResult + 'cx>
+{
+ macro_rules! ptry {
+ ($pres:expr) => {
+ match $pres {
+ Ok(val) => val,
+ Err(mut err) => {
+ err.emit();
+ return DummyResult::any(sp);
+ }
+ }
+ }
+ }
+
+ // Parse the lookup spec
+ let mut parser: Parser = cx.new_parser_from_tts(args);
+ let definitions = ptry!(parse_table_definition_list(&mut parser));
+ let definitions = match parse_raw_definitions(definitions, cx) {
+ Ok(definitions) => definitions,
+ Err(_) => return DummyResult::any(sp),
+ };
+
+ let table = build_state_tables(&definitions);
+ let ast = build_table_ast(cx, sp, table);
+
+ MacEager::expr(ast)
+}
diff --git a/codegen/src/ext.rs b/codegen/src/ext/vt.rs
index cef2267..3f5bcf3 100644
--- a/codegen/src/ext.rs
+++ b/codegen/src/ext/vt.rs
@@ -1,3 +1,4 @@
+//! Macro expansion for the virtual terminal parser state table
use std::fmt;
use syntex::Registry;
@@ -12,10 +13,13 @@ use syntex_syntax::parse::PResult;
use syntex_syntax::ptr::P;
use syntex_syntax::tokenstream::TokenTree;
-use definitions::{State, Action};
+#[path="../../../src/definitions.rs"]
+mod definitions;
+
+use self::definitions::{State, Action};
pub fn register(registry: &mut Registry) {
- registry.add_macro("state_table", expand_state_table);
+ registry.add_macro("vt_state_table", expand_state_table);
}
fn state_from_str<S>(s: &S) -> Result<State, ()>
@@ -37,6 +41,7 @@ fn state_from_str<S>(s: &S) -> Result<State, ()>
"State::Ground" => State::Ground,
"State::OscString" => State::OscString,
"State::SosPmApcString" => State::SosPmApcString,
+ "State::Utf8" => State::Utf8,
_ => return Err(())
})
}
@@ -60,6 +65,7 @@ fn action_from_str<S>(s: &S) -> Result<Action, ()>
"Action::Print" => Action::Print,
"Action::Put" => Action::Put,
"Action::Unhook" => Action::Unhook,
+ "Action::BeginUtf8" => Action::BeginUtf8,
_ => return Err(())
})
}
diff --git a/codegen/src/main.rs b/codegen/src/main.rs
index 64bddd9..5f8d153 100644
--- a/codegen/src/main.rs
+++ b/codegen/src/main.rs
@@ -1,18 +1,23 @@
+#![allow(dead_code)]
extern crate syntex;
extern crate syntex_syntax;
mod ext;
-#[path="../../src/definitions.rs"]
-pub mod definitions;
-
use std::path::Path;
fn main() {
+ // Expand VT parser state table
+ let mut registry = syntex::Registry::new();
+ ext::vt::register(&mut registry);
let src = &Path::new("../src/table.rs.in");
let dst = &Path::new("../src/table.rs");
+ registry.expand("vt_state_table", src, dst).expect("expand vt_stable_table ok");
+ // Expand UTF8 parser state table
let mut registry = syntex::Registry::new();
- ext::register(&mut registry);
- registry.expand("state_table", src, dst).expect("expand stable_table ok");
+ ext::utf8::register(&mut registry);
+ let src = &Path::new("../src/utf8/table.rs.in");
+ let dst = &Path::new("../src/utf8/table.rs");
+ registry.expand("utf8_state_table", src, dst).expect("expand utf8_stable_table ok");
}
diff --git a/examples/parselog.rs b/examples/parselog.rs
index 804c399..f4ae86a 100644
--- a/examples/parselog.rs
+++ b/examples/parselog.rs
@@ -9,7 +9,7 @@ use vtparse::{StateMachine, Parser};
struct Log;
impl Parser for Log {
- fn print(&mut self, _machine: &StateMachine, c: char) {
+ fn print(&mut self, c: char) {
println!("[print] {:?}", c);
}
fn execute(&mut self, _machine: &StateMachine, byte: u8) {
diff --git a/src/definitions.rs b/src/definitions.rs
index 5177ca6..ded49cf 100644
--- a/src/definitions.rs
+++ b/src/definitions.rs
@@ -15,7 +15,7 @@ pub enum State {
Ground = 12,
OscString = 13,
SosPmApcString = 14,
- Unused__ = 15,
+ Utf8 = 15,
}
#[derive(Debug, Clone, Copy)]
@@ -35,7 +35,7 @@ pub enum Action {
Print = 12,
Put = 13,
Unhook = 14,
- Unused__ = 15,
+ BeginUtf8 = 15,
}
/// Unpack a u8 into a State and Action
@@ -67,12 +67,12 @@ mod tests {
}
match unpack(0x0f) {
- (State::Unused__, Action::None) => (),
+ (State::Utf8, Action::None) => (),
_ => panic!("unpack failed"),
}
match unpack(0xff) {
- (State::Unused__, Action::Unused__) => (),
+ (State::Utf8, Action::BeginUtf8) => (),
_ => panic!("unpack failed"),
}
}
diff --git a/src/lib.rs b/src/lib.rs
index 73326bc..17e265e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,6 @@
mod table;
mod definitions;
+mod utf8;
pub use definitions::{Action, State, unpack};
@@ -27,6 +28,20 @@ impl State {
const MAX_INTERMEDIATES: usize = 2;
const MAX_PARAMS: usize = 16;
+struct VtUtf8Receiver<'a, P: Parser + 'a>(&'a mut P, &'a mut State);
+
+impl<'a, P: Parser> utf8::Receiver for VtUtf8Receiver<'a, P> {
+ fn codepoint(&mut self, c: char) {
+ self.0.print(c);
+ *self.1 = State::Ground;
+ }
+
+ fn invalid_sequence(&mut self) {
+ self.0.print('�');
+ *self.1 = State::Ground;
+ }
+}
+
/// ANSI VTE Parser
///
/// As described in http://vt100.net/emu/dec_ansi_parser
@@ -38,7 +53,8 @@ pub struct StateMachine {
intermediate_idx: usize,
params: [i64; MAX_PARAMS],
num_params: usize,
- ignoring: bool
+ ignoring: bool,
+ utf8_parser: utf8::Parser,
}
impl StateMachine {
@@ -50,6 +66,7 @@ impl StateMachine {
params: [0i64; MAX_PARAMS],
num_params: 0,
ignoring: false,
+ utf8_parser: utf8::Parser::new(),
}
}
@@ -62,6 +79,12 @@ impl StateMachine {
}
pub fn advance<P: Parser>(&mut self, parser: &mut P, byte: u8) {
+ // Utf8 characters are handled out-of-band.
+ if let State::Utf8 = self.state {
+ self.process_utf8(parser, byte);
+ return;
+ }
+
// Handle state changes in the anywhere state before evaluating changes
// for current state.
let mut change = STATE_CHANGE[State::Anywhere as usize][byte as usize];
@@ -76,13 +99,22 @@ impl StateMachine {
self.perform_state_change(parser, state, action, byte);
}
+ #[inline]
+ fn process_utf8<P>(&mut self, parser: &mut P, byte: u8)
+ where P: Parser
+ {
+ let mut receiver = VtUtf8Receiver(parser, &mut self.state);
+ let utf8_parser = &mut self.utf8_parser;
+ utf8_parser.advance(&mut receiver, byte);
+ }
+
fn perform_state_change<P>(&mut self, parser: &mut P, state: State, action: Action, byte: u8)
where P: Parser
{
macro_rules! maybe_action {
($action:expr, $arg:expr) => {
match $action {
- Action::None | Action::Unused__ => (),
+ Action::None => (),
action => {
self.perform_action(parser, action, $arg);
},
@@ -91,7 +123,7 @@ impl StateMachine {
}
match state {
- State::Anywhere | State::Unused__ => {
+ State::Anywhere => {
// Just run the action
self.perform_action(parser, action, byte);
},
@@ -114,7 +146,7 @@ impl StateMachine {
fn perform_action<P: Parser>(&mut self, parser: &mut P, action: Action, byte: u8) {
match action {
- Action::Print => parser.print(self, byte as char),
+ Action::Print => parser.print(byte as char),
Action::Execute => parser.execute(self, byte),
Action::Hook => parser.hook(self, byte),
Action::Put => parser.put(self, byte),
@@ -124,7 +156,7 @@ impl StateMachine {
Action::Unhook => parser.unhook(self, byte),
Action::CsiDispatch => parser.csi_dispatch(self, byte as char),
Action::EscDispatch => parser.esc_dispatch(self, byte),
- Action::Ignore | Action::None | Action::Unused__=> (),
+ Action::Ignore | Action::None => (),
Action::Collect => {
if self.intermediate_idx == MAX_INTERMEDIATES {
self.ignoring = true;
@@ -155,13 +187,16 @@ impl StateMachine {
self.intermediate_idx = 0;
self.num_params = 0;
self.ignoring = false;
- }
+ },
+ Action::BeginUtf8 => {
+ self.process_utf8(parser, byte);
+ },
}
}
}
pub trait Parser {
- fn print(&mut self, &StateMachine, c: char);
+ fn print(&mut self, c: char);
fn execute(&mut self, &StateMachine, byte: u8);
fn hook(&mut self, &StateMachine, byte: u8);
fn put(&mut self, &StateMachine, byte: u8);
diff --git a/src/table.rs b/src/table.rs
index 923c7eb..d2034b8 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -6,6 +6,9 @@ use definitions::Action;
pub static STATE_CHANGE: [[u8; 256]; 16] =
[
+ // Beginning of UTF-8 2 byte sequence
+ // Beginning of UTF-8 3 byte sequence
+ // Beginning of UTF-8 4 byte sequence
@@ -280,11 +283,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] =
80u8, 80u8, 80u8, 80u8, 80u8, 80u8, 0u8, 80u8, 0u8, 0u8, 0u8, 0u8, 0u8,
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ 0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8],
[112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8,
112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8,
112u8, 112u8, 112u8, 112u8, 0u8, 112u8, 0u8, 0u8, 112u8, 112u8, 112u8,
@@ -366,7 +371,7 @@ pub static ENTRY_ACTIONS: &'static [Action] =
Action::OscStart, // State::OscString
Action::None, // State::SosPmApcString
Action::None];
- // State::Unused__
+ // State::Utf8
pub static EXIT_ACTIONS: &'static [Action] =
&[Action::None, // State::Anywhere
@@ -384,4 +389,4 @@ pub static EXIT_ACTIONS: &'static [Action] =
Action::None, // State::Ground
Action::OscEnd, // State::OscString
Action::None, // State::SosPmApcString
- Action::None]; // State::Unused__
+ Action::None]; // State::Utf8
diff --git a/src/table.rs.in b/src/table.rs.in
index 7414c2a..f5a838d 100644
--- a/src/table.rs.in
+++ b/src/table.rs.in
@@ -3,7 +3,7 @@
use definitions::Action;
-pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! {
+pub static STATE_CHANGE: [[u8; 256]; 16] = vt_state_table! {
State::Anywhere => {
0x18 => (Action::Execute, State::Ground),
0x1a => (Action::Execute, State::Ground),
@@ -28,7 +28,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! {
0x20...0x7f => Action::Print,
0x80...0x8f => Action::Execute,
0x91...0x9a => Action::Execute,
- 0x9c => Action::Execute
+ 0x9c => Action::Execute,
+ // Beginning of UTF-8 2 byte sequence
+ 0xc2...0xdf => (State::Utf8, Action::BeginUtf8),
+ // Beginning of UTF-8 3 byte sequence
+ 0xe0...0xef => (State::Utf8, Action::BeginUtf8),
+ // Beginning of UTF-8 4 byte sequence
+ 0xf0...0xf4 => (State::Utf8, Action::BeginUtf8),
},
State::Escape => {
@@ -191,7 +197,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = &[
Action::None, // State::Ground
Action::OscStart, // State::OscString
Action::None, // State::SosPmApcString
- Action::None, // State::Unused__
+ Action::None, // State::Utf8
];
pub static EXIT_ACTIONS: &'static [Action] = &[
@@ -210,5 +216,5 @@ pub static EXIT_ACTIONS: &'static [Action] = &[
Action::None, // State::Ground
Action::OscEnd, // State::OscString
Action::None, // State::SosPmApcString
- Action::None, // State::Unused__
+ Action::None, // State::Utf8
];
diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs
new file mode 100644
index 0000000..3d099b1
--- /dev/null
+++ b/src/utf8/mod.rs
@@ -0,0 +1,91 @@
+//! A table-driven UTF-8 Parser
+//!
+//! This module implements a table-driven UTF-8 parser which should
+//! theoretically contain the minimal number of branches (1). The only branch is
+//! on the `Action` returned from unpacking a transition.
+use std::char;
+
+mod types;
+use self::types::{State, Action, unpack};
+
+mod table;
+use self::table::TRANSITIONS;
+
+/// Handles codepoint and invalid sequence events from the parser.
+pub trait Receiver {
+ /// Code point parsed
+ ///
+ /// Called with the codepoint
+ fn codepoint(&mut self, char);
+
+ /// Invalid sequence encountered
+ fn invalid_sequence(&mut self);
+}
+
+/// A parser for Utf8 Characters
+///
+/// Repeatedly call `advance` with bytes to emit Utf8 characters
+pub struct Parser {
+ point: u32,
+ state: State,
+}
+
+/// Continuation bytes are masked with this value.
+const CONTINUATION_MASK: u8 = 0b0011_1111;
+
+impl Parser {
+ /// Create a new Parser
+ pub fn new() -> Parser {
+ Parser {
+ point: 0,
+ state: State::Ground,
+ }
+ }
+
+ pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
+ where R: Receiver
+ {
+ let cur = self.state as usize;
+ let change = TRANSITIONS[cur][byte as usize];
+ let (state, action) = unsafe { unpack(change) };
+
+ self.perform_action(receiver, byte, action);
+ self.state = state;
+ }
+
+ fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
+ where R: Receiver
+ {
+ match action {
+ Action::InvalidSequence => {
+ self.point = 0;
+ receiver.invalid_sequence();
+ },
+ Action::EmitByte => {
+ receiver.codepoint(byte as char);
+ },
+ Action::SetByte1 => {
+ let point = self.point | ((byte & CONTINUATION_MASK) as u32);
+ let c = unsafe { char::from_u32_unchecked(point) };
+ self.point = 0;
+
+ receiver.codepoint(c);
+ },
+ Action::SetByte2 => {
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
+ },
+ Action::SetByte2Top => {
+ self.point |= ((byte & 0b0001_1111) as u32) << 6;
+ },
+ Action::SetByte3 => {
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
+ },
+ Action::SetByte3Top => {
+ self.point |= ((byte & 0b0000_1111) as u32) << 12;
+ },
+ Action::SetByte4 => {
+ self.point |= ((byte & 0b0000_0111) as u32) << 18;
+ },
+ }
+ }
+}
diff --git a/src/utf8/table.rs b/src/utf8/table.rs
new file mode 100644
index 0000000..5a1292b
--- /dev/null
+++ b/src/utf8/table.rs
@@ -0,0 +1,184 @@
+//! UTF-8 Parse Transition Table
+
+/// Transition table for parsing UTF-8. This is built from the grammar described
+/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and
+/// formatted below.
+///
+/// # UTF-8 Grammar
+///
+/// ```ignore
+/// UTF8-octets = *( UTF8-char )
+/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+/// UTF8-1 = %x00-7F
+/// UTF8-2 = %xC2-DF UTF8-tail
+/// UTF8-3 = %xE0 %xA0-BF UTF8-tail /
+/// %xE1-EC 2( UTF8-tail ) /
+/// %xED %x80-9F UTF8-tail /
+/// %xEE-EF 2( UTF8-tail )
+/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
+/// %xF1-F3 3( UTF8-tail ) /
+/// %xF4 %x80-8F 2( UTF8-tail )
+/// UTF8-tail = %x80-BF
+/// ```
+///
+/// Not specifying an action in this table is equivalent to specifying
+/// Action::InvalidSequence. Not specifying a state is equivalent to specifying
+/// State::Ground.
+pub static TRANSITIONS: [[u8; 256]; 8] =
+ [[16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8,
+ 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8,
+ 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 100u8, 98u8,
+ 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 101u8,
+ 98u8, 98u8, 118u8, 113u8, 113u8, 113u8, 119u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
diff --git a/src/utf8/table.rs.in b/src/utf8/table.rs.in
new file mode 100644
index 0000000..2acafe7
--- /dev/null
+++ b/src/utf8/table.rs.in
@@ -0,0 +1,60 @@
+//! UTF-8 Parse Transition Table
+
+/// Transition table for the UTF-8 parser: one row of 256 entries (one per
+/// possible input byte) for each of the 8 parser states. It is built from the
+/// grammar in RFC 3629 section 4 (<https://tools.ietf.org/html/rfc3629#section-4>).
+///
+/// # UTF-8 Grammar
+///
+/// ```ignore
+/// UTF8-octets = *( UTF8-char )
+/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+/// UTF8-1 = %x00-7F
+/// UTF8-2 = %xC2-DF UTF8-tail
+/// UTF8-3 = %xE0 %xA0-BF UTF8-tail /
+/// %xE1-EC 2( UTF8-tail ) /
+/// %xED %x80-9F UTF8-tail /
+/// %xEE-EF 2( UTF8-tail )
+/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
+/// %xF1-F3 3( UTF8-tail ) /
+/// %xF4 %x80-8F 2( UTF8-tail )
+/// UTF8-tail = %x80-BF
+/// ```
+///
+/// Not specifying an action in this table is equivalent to specifying
+/// `Action::InvalidSequence`. Not specifying a state is equivalent to specifying
+/// `State::Ground`.
+pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! {
+ State::Ground => {
+ 0x00...0x7f => (State::Ground, Action::EmitByte),
+ 0xc2...0xdf => (State::Tail1, Action::SetByte2Top),
+ 0xe0 => (State::U3_2_e0, Action::SetByte3Top),
+ 0xe1...0xec => (State::Tail2, Action::SetByte3Top),
+ 0xed => (State::U3_2_ed, Action::SetByte3Top),
+ 0xee...0xef => (State::Tail2, Action::SetByte3Top),
+ 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
+ 0xf1...0xf3 => (State::Tail3, Action::SetByte4),
+ 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
+ },
+ State::U3_2_e0 => {
+ 0xa0...0xbf => (State::Tail1, Action::SetByte2),
+ },
+ State::U3_2_ed => {
+ 0x80...0x9f => (State::Tail1, Action::SetByte2),
+ },
+ State::Utf8_4_3_f0 => {
+ 0x90...0xbf => (State::Tail2, Action::SetByte3),
+ },
+ State::Utf8_4_3_f4 => {
+ 0x80...0x8f => (State::Tail2, Action::SetByte3),
+ },
+ State::Tail3 => {
+ 0x80...0xbf => (State::Tail2, Action::SetByte3),
+ },
+ State::Tail2 => {
+ 0x80...0xbf => (State::Tail1, Action::SetByte2),
+ },
+ State::Tail1 => {
+ 0x80...0xbf => (State::Ground, Action::SetByte1),
+ },
+};
diff --git a/src/utf8/types.rs b/src/utf8/types.rs
new file mode 100644
index 0000000..4c604f4
--- /dev/null
+++ b/src/utf8/types.rs
@@ -0,0 +1,77 @@
+//! Types supporting the UTF-8 parser
+#![allow(non_camel_case_types)]
+use std::mem;
+
+/// States the parser can be in.
+///
+/// The lead bytes E0, ED, F0 and F4 each get a dedicated state because the byte
+/// that follows them is restricted to a narrower range than a regular tail byte.
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum State {
+ /// Ground state; no sequence is in progress, so any byte may arrive
+ Ground = 0,
+ /// 3 tail bytes are still expected
+ Tail3 = 1,
+ /// 2 tail bytes are still expected
+ Tail2 = 2,
+ /// 1 tail byte is still expected
+ Tail1 = 3,
+ /// Second byte of a UTF8-3 sequence starting with E0
+ U3_2_e0 = 4,
+ /// Second byte of a UTF8-3 sequence starting with ED
+ U3_2_ed = 5,
+ /// Second byte of a UTF8-4 sequence starting with F0
+ Utf8_4_3_f0 = 6,
+ /// Second byte of a UTF8-4 sequence starting with F4
+ Utf8_4_3_f4 = 7,
+}
+
+/// Action to take when receiving a byte. Discriminants must fit in 4 bits so `pack` can store them.
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum Action {
+ /// Unexpected byte; the sequence is invalid
+ InvalidSequence = 0,
+ /// Received a valid 7-bit ASCII byte which can be emitted directly.
+ EmitByte = 1,
+ /// Set the last (bottom) continuation byte
+ SetByte1 = 2,
+ /// Set the 2nd-from-last byte when it is a continuation byte
+ SetByte2 = 3,
+ /// Set the 2nd-from-last byte when it is the lead byte of a two byte sequence
+ SetByte2Top = 4,
+ /// Set the 3rd-from-last byte when it is a continuation byte
+ SetByte3 = 5,
+ /// Set the 3rd-from-last byte when it is the lead byte of a three byte sequence
+ SetByte3Top = 6,
+ /// Set the top (lead) byte of a four byte sequence.
+ SetByte4 = 7,
+}
+
+/// Pack a state and an action into a single `u8`.
+///
+/// The state occupies the bottom 4 bits and the action the top 4 bits.
+#[inline]
+#[allow(dead_code)]
+pub fn pack(state: State, action: Action) -> u8 {
+ ((action as u8) << 4) | (state as u8)
+}
+
+/// Convert a `u8` back into a state and an action.
+///
+/// # Safety
+///
+/// `val` must have been produced by the `pack` function in this module. For any
+/// other byte, either nibble may fall outside the declared discriminants of
+/// `State` or `Action`, and transmuting it to the enum is undefined behavior.
+#[inline]
+pub unsafe fn unpack(val: u8) -> (State, Action) {
+ (
+ // State is stored in the bottom 4 bits
+ mem::transmute(val & 0x0f),
+ // NOTE(review): both transmutes rely on the enums being one byte wide; no `#[repr(u8)]` is declared — confirm
+ // Action is stored in the top 4 bits
+ mem::transmute(val >> 4),
+ )
+}