diff options
-rw-r--r-- | codegen/src/ext/mod.rs | 2 | ||||
-rw-r--r-- | codegen/src/ext/utf8.rs | 386 | ||||
-rw-r--r-- | codegen/src/ext/vt.rs (renamed from codegen/src/ext.rs) | 10 | ||||
-rw-r--r-- | codegen/src/main.rs | 15 | ||||
-rw-r--r-- | examples/parselog.rs | 2 | ||||
-rw-r--r-- | src/definitions.rs | 8 | ||||
-rw-r--r-- | src/lib.rs | 49 | ||||
-rw-r--r-- | src/table.rs | 19 | ||||
-rw-r--r-- | src/table.rs.in | 14 | ||||
-rw-r--r-- | src/utf8/mod.rs | 91 | ||||
-rw-r--r-- | src/utf8/table.rs | 184 | ||||
-rw-r--r-- | src/utf8/table.rs.in | 60 | ||||
-rw-r--r-- | src/utf8/types.rs | 77 |
13 files changed, 887 insertions, 30 deletions
diff --git a/codegen/src/ext/mod.rs b/codegen/src/ext/mod.rs new file mode 100644 index 0000000..c28d9f7 --- /dev/null +++ b/codegen/src/ext/mod.rs @@ -0,0 +1,2 @@ +pub mod utf8; +pub mod vt; diff --git a/codegen/src/ext/utf8.rs b/codegen/src/ext/utf8.rs new file mode 100644 index 0000000..5b73081 --- /dev/null +++ b/codegen/src/ext/utf8.rs @@ -0,0 +1,386 @@ +//! Macro expansion for the utf8 parser state table +use std::fmt; + +use syntex::Registry; + +use syntex_syntax::ast::{self, ExprKind, Arm, Expr, PatKind, LitKind, Pat}; +use syntex_syntax::codemap::Span; +use syntex_syntax::ext::base::{ExtCtxt, MacEager, MacResult, DummyResult}; +use syntex_syntax::ext::build::AstBuilder; +use syntex_syntax::parse::token::{Token, DelimToken}; +use syntex_syntax::parse::parser::Parser; +use syntex_syntax::parse::PResult; +use syntex_syntax::ptr::P; +use syntex_syntax::tokenstream::TokenTree; + +#[path="../../../src/utf8/types.rs"] +mod types; + +use self::types::{State, Action, pack}; + +pub fn register(registry: &mut Registry) { + registry.add_macro("utf8_state_table", expand_state_table); +} + +fn state_from_str<S>(s: &S) -> Result<State, ()> + where S: AsRef<str> +{ + Ok(match s.as_ref() { + "State::Ground" => State::Ground, + "State::Tail3" => State::Tail3, + "State::Tail2" => State::Tail2, + "State::Tail1" => State::Tail1, + "State::U3_2_e0" => State::U3_2_e0, + "State::U3_2_ed" => State::U3_2_ed, + "State::Utf8_4_3_f0" => State::Utf8_4_3_f0, + "State::Utf8_4_3_f4" => State::Utf8_4_3_f4, + _ => return Err(()) + }) +} + +fn action_from_str<S>(s: &S) -> Result<Action, ()> + where S: AsRef<str> +{ + Ok(match s.as_ref() { + "Action::InvalidSequence" => Action::InvalidSequence, + "Action::EmitByte" => Action::EmitByte, + "Action::SetByte1" => Action::SetByte1, + "Action::SetByte2" => Action::SetByte2, + "Action::SetByte2Top" => Action::SetByte2Top, + "Action::SetByte3" => Action::SetByte3, + "Action::SetByte3Top" => Action::SetByte3Top, + "Action::SetByte4" => Action::SetByte4, + _ => return Err(()) + }) +} + +fn parse_table_input_mappings<'a>(parser: &mut Parser<'a>) -> PResult<'a, Vec<Arm>> { + // Must start on open brace + try!(parser.expect(&Token::OpenDelim(DelimToken::Brace))); + + let mut arms: Vec<Arm> = Vec::new(); + while parser.token != Token::CloseDelim(DelimToken::Brace) { + match parser.parse_arm() { + Ok(arm) => arms.push(arm), + Err(e) => { + // Recover by skipping to the end of the block. + return Err(e); + } + } + } + + // Consume the closing brace + parser.bump(); + Ok(arms) +} + +/// Expressions describing state transitions and actions +#[derive(Debug)] +struct TableDefinitionExprs { + state_expr: P<Expr>, + mapping_arms: Vec<Arm>, +} + +fn state_from_expr(expr: P<Expr>, cx: &mut ExtCtxt) -> Result<State, ()> { + let s = match expr.node { + ExprKind::Path(ref _qself, ref path) => { + path.to_string() + }, + _ => { + cx.span_err(expr.span, "expected State"); + return Err(()) + } + }; + + state_from_str(&s).map_err(|_| { + cx.span_err(expr.span, "expected State"); + () + }) +} + +fn u8_lit_from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<u8, ()> { + static MSG: &'static str = "expected u8 int literal"; + + match expr.node { + ExprKind::Lit(ref lit) => { + match lit.node { + LitKind::Int(val, _) => { + Ok(val as u8) + }, + _ => { + cx.span_err(lit.span, MSG); + return Err(()); + } + } + }, + _ => { + cx.span_err(expr.span, MSG); + return Err(()); + } + } +} + +fn input_mapping_from_arm(arm: Arm, cx: &mut ExtCtxt) -> Result<InputMapping, ()> { + let Arm { pats, body, .. } = arm; + + let input = try!(InputDefinition::from_pat(&pats[0], cx)); + let transition = try!(Transition::from_expr(&body, cx)); + + Ok(InputMapping { + input: input, + transition: transition, + }) +} + +/// What happens when certain input is received +#[derive(Copy, Clone)] +enum Transition { + State(State), + Action(Action), + StateAction(State, Action), +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Transition::State(state) => try!(write!(f, "State({:?})", state)), + Transition::Action(action) => try!(write!(f, "Action({:?})", action)), + Transition::StateAction(state, action) => { + try!(write!(f, "StateAction({:?}, {:?})", state, action)); + } + } + + write!(f, " -> {:?}", self.pack_u8()) + } +} + +impl Transition { + // State is stored in the top 4 bits + fn pack_u8(&self) -> u8 { + match *self { + Transition::State(state) => pack(state, Action::InvalidSequence), + Transition::Action(action) => pack(State::Ground, action), + Transition::StateAction(state, action) => pack(state, action), + } + } +} + +impl Transition { + fn from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<Transition, ()> { + match expr.node { + ExprKind::Tup(ref tup_exprs) => { + let mut action = None; + let mut state = None; + + for tup_expr in tup_exprs { + if let ExprKind::Path(_, ref path) = tup_expr.node { + let path_str = path.to_string(); + if path_str.starts_with('A') { + action = Some(try!(action_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid action"); + }))); + } else { + state = Some(try!(state_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid state"); + }))); + } + } + } + + match (action, state) { + (Some(action), Some(state)) => Ok(Transition::StateAction(state, action)), + (None, Some(state)) => Ok(Transition::State(state)), + (Some(action), None) => Ok(Transition::Action(action)), + _ => { + cx.span_err(expr.span, "expected Action and/or State"); + Err(()) + } + } + }, + ExprKind::Path(_, ref path) => { + // Path can be Action or State + let path_str = path.to_string(); + + if path_str.starts_with('A') { + let action = try!(action_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid action"); + })); + Ok(Transition::Action(action)) + } else { + let state = try!(state_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid state"); + })); + + Ok(Transition::State(state)) + } + }, + _ => { + cx.span_err(expr.span, "expected Action and/or State"); + Err(()) + } + } + } +} + +#[derive(Debug)] +enum InputDefinition { + Specific(u8), + Range { start: u8, end: u8 } +} + +impl InputDefinition { + fn from_pat(pat: &Pat, cx: &mut ExtCtxt) -> Result<InputDefinition, ()> { + Ok(match pat.node { + PatKind::Lit(ref lit_expr) => { + InputDefinition::Specific(try!(u8_lit_from_expr(&lit_expr, cx))) + }, + PatKind::Range(ref start_expr, ref end_expr) => { + InputDefinition::Range { + start: try!(u8_lit_from_expr(start_expr, cx)), + end: try!(u8_lit_from_expr(end_expr, cx)), + } + }, + _ => { + cx.span_err(pat.span, "expected literal or range expression"); + return Err(()) + } + }) + } +} + +#[derive(Debug)] +struct InputMapping { + input: InputDefinition, + transition: Transition, +} + +#[derive(Debug)] +struct TableDefinition { + state: State, + mappings: Vec<InputMapping>, +} + +fn parse_raw_definitions( + definitions: Vec<TableDefinitionExprs>, + cx: &mut ExtCtxt +) -> Result<Vec<TableDefinition>, ()> { + let mut out = Vec::new(); + + for raw in definitions { + let TableDefinitionExprs { state_expr, mapping_arms } = raw; + let state = try!(state_from_expr(state_expr, cx)); + + let mut mappings = Vec::new(); + for arm in mapping_arms { + mappings.push(try!(input_mapping_from_arm(arm, cx))); + } + + out.push(TableDefinition { + state: state, + mappings: mappings, + }) + } + + Ok(out) +} + +fn parse_table_definition<'a>(parser: &mut Parser<'a>) -> PResult<'a, TableDefinitionExprs> { + let state_expr = try!(parser.parse_expr()); + try!(parser.expect(&Token::FatArrow)); + let mappings = try!(parse_table_input_mappings(parser)); + + Ok(TableDefinitionExprs { + state_expr: state_expr, + mapping_arms: mappings + }) +} + +fn parse_table_definition_list<'a>(parser: &mut Parser<'a>) + -> PResult<'a, Vec<TableDefinitionExprs>> +{ + let mut definitions = Vec::new(); + while parser.token != Token::Eof { + definitions.push(try!(parse_table_definition(parser))); + parser.eat(&Token::Comma); + } + + Ok(definitions) +} + +fn build_state_tables<T>(defs: T) -> [[u8; 256]; 8] + where T: AsRef<[TableDefinition]> +{ + let mut result = [[0u8; 256]; 8]; + + for def in defs.as_ref() { + let state = def.state; + let state = state as u8; + let transitions = &mut result[state as usize]; + + for mapping in &def.mappings { + let trans = mapping.transition.pack_u8(); + match mapping.input { + InputDefinition::Specific(idx) => { + transitions[idx as usize] = trans; + }, + InputDefinition::Range { start, end } => { + for idx in start..end { + transitions[idx as usize] = trans; + } + transitions[end as usize] = trans; + }, + } + } + } + + result +} + +fn build_table_ast(cx: &mut ExtCtxt, sp: Span, table: [[u8; 256]; 8]) -> P<ast::Expr> { + let table = table.iter() + .map(|list| { + let exprs = list.iter() + .map(|num| cx.expr_u8(sp, *num)) + .collect(); + cx.expr_vec(sp, exprs) + }) + .collect(); + + cx.expr_vec(sp, table) +} + +fn expand_state_table<'cx>( + cx: &'cx mut ExtCtxt, + sp: Span, + args: &[TokenTree]) + -> Box<MacResult + 'cx> +{ + macro_rules! ptry { + ($pres:expr) => { + match $pres { + Ok(val) => val, + Err(mut err) => { + err.emit(); + return DummyResult::any(sp); + } + } + } + } + + // Parse the lookup spec + let mut parser: Parser = cx.new_parser_from_tts(args); + let definitions = ptry!(parse_table_definition_list(&mut parser)); + let definitions = match parse_raw_definitions(definitions, cx) { + Ok(definitions) => definitions, + Err(_) => return DummyResult::any(sp), + }; + + let table = build_state_tables(&definitions); + let ast = build_table_ast(cx, sp, table); + + MacEager::expr(ast) +} diff --git a/codegen/src/ext.rs b/codegen/src/ext/vt.rs index cef2267..3f5bcf3 100644 --- a/codegen/src/ext.rs +++ b/codegen/src/ext/vt.rs @@ -1,3 +1,4 @@ +//! Macro expansion for the virtual terminal parser state table use std::fmt; use syntex::Registry; @@ -12,10 +13,13 @@ use syntex_syntax::parse::PResult; use syntex_syntax::ptr::P; use syntex_syntax::tokenstream::TokenTree; -use definitions::{State, Action}; +#[path="../../../src/definitions.rs"] +mod definitions; + +use self::definitions::{State, Action}; pub fn register(registry: &mut Registry) { - registry.add_macro("state_table", expand_state_table); + registry.add_macro("vt_state_table", expand_state_table); } fn state_from_str<S>(s: &S) -> Result<State, ()> @@ -37,6 +41,7 @@ fn state_from_str<S>(s: &S) -> Result<State, ()> "State::Ground" => State::Ground, "State::OscString" => State::OscString, "State::SosPmApcString" => State::SosPmApcString, + "State::Utf8" => State::Utf8, _ => return Err(()) }) } @@ -60,6 +65,7 @@ fn action_from_str<S>(s: &S) -> Result<Action, ()> "Action::Print" => Action::Print, "Action::Put" => Action::Put, "Action::Unhook" => Action::Unhook, + "Action::BeginUtf8" => Action::BeginUtf8, _ => return Err(()) }) } diff --git a/codegen/src/main.rs b/codegen/src/main.rs index 64bddd9..5f8d153 100644 --- a/codegen/src/main.rs +++ b/codegen/src/main.rs @@ -1,18 +1,23 @@ +#![allow(dead_code)] extern crate syntex; extern crate syntex_syntax; mod ext; -#[path="../../src/definitions.rs"] -pub mod definitions; - use std::path::Path; fn main() { + // Expand VT parser state table + let mut registry = syntex::Registry::new(); + ext::vt::register(&mut registry); let src = &Path::new("../src/table.rs.in"); let dst = &Path::new("../src/table.rs"); + registry.expand("vt_state_table", src, dst).expect("expand vt_stable_table ok"); + // Expand UTF8 parser state table let mut registry = syntex::Registry::new(); - ext::register(&mut registry); - registry.expand("state_table", src, dst).expect("expand stable_table ok"); + ext::utf8::register(&mut registry); + let src = &Path::new("../src/utf8/table.rs.in"); + let dst = &Path::new("../src/utf8/table.rs"); + registry.expand("utf8_state_table", src, dst).expect("expand utf8_stable_table ok"); } diff --git a/examples/parselog.rs b/examples/parselog.rs index 804c399..f4ae86a 100644 --- a/examples/parselog.rs +++ b/examples/parselog.rs @@ -9,7 +9,7 @@ use vtparse::{StateMachine, Parser}; struct Log; impl Parser for Log { - fn print(&mut self, _machine: &StateMachine, c: char) { + fn print(&mut self, c: char) { println!("[print] {:?}", c); } fn execute(&mut self, _machine: &StateMachine, byte: u8) { diff --git a/src/definitions.rs b/src/definitions.rs index 5177ca6..ded49cf 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -15,7 +15,7 @@ pub enum State { Ground = 12, OscString = 13, SosPmApcString = 14, - Unused__ = 15, + Utf8 = 15, } #[derive(Debug, Clone, Copy)] @@ -35,7 +35,7 @@ pub enum Action { Print = 12, Put = 13, Unhook = 14, - Unused__ = 15, + BeginUtf8 = 15, } /// Unpack a u8 into a State and Action @@ -67,12 +67,12 @@ mod tests { } match unpack(0x0f) { - (State::Unused__, Action::None) => (), + (State::Utf8, Action::None) => (), _ => panic!("unpack failed"), } match unpack(0xff) { - (State::Unused__, Action::Unused__) => (), + (State::Utf8, Action::BeginUtf8) => (), _ => panic!("unpack failed"), } } @@ -1,5 +1,6 @@ mod table; mod definitions; +mod utf8; pub use definitions::{Action, State, unpack}; @@ -27,6 +28,20 @@ impl State { const MAX_INTERMEDIATES: usize = 2; const MAX_PARAMS: usize = 16; +struct VtUtf8Receiver<'a, P: Parser + 'a>(&'a mut P, &'a mut State); + +impl<'a, P: Parser> utf8::Receiver for VtUtf8Receiver<'a, P> { + fn codepoint(&mut self, c: char) { + self.0.print(c); + *self.1 = State::Ground; + } + + fn invalid_sequence(&mut self) { + self.0.print('�'); + *self.1 = State::Ground; + } +} + /// ANSI VTE Parser /// /// As described in http://vt100.net/emu/dec_ansi_parser @@ -38,7 +53,8 @@ pub struct StateMachine { intermediate_idx: usize, params: [i64; MAX_PARAMS], num_params: usize, - ignoring: bool + ignoring: bool, + utf8_parser: utf8::Parser, } impl StateMachine { @@ -50,6 +66,7 @@ impl StateMachine { params: [0i64; MAX_PARAMS], num_params: 0, ignoring: false, + utf8_parser: utf8::Parser::new(), } } @@ -62,6 +79,12 @@ impl StateMachine { } pub fn advance<P: Parser>(&mut self, parser: &mut P, byte: u8) { + // Utf8 characters are handled out-of-band. + if let State::Utf8 = self.state { + self.process_utf8(parser, byte); + return; + } + // Handle state changes in the anywhere state before evaluating changes // for current state. let mut change = STATE_CHANGE[State::Anywhere as usize][byte as usize]; @@ -76,13 +99,22 @@ impl StateMachine { self.perform_state_change(parser, state, action, byte); } + #[inline] + fn process_utf8<P>(&mut self, parser: &mut P, byte: u8) + where P: Parser + { + let mut receiver = VtUtf8Receiver(parser, &mut self.state); + let utf8_parser = &mut self.utf8_parser; + utf8_parser.advance(&mut receiver, byte); + } + fn perform_state_change<P>(&mut self, parser: &mut P, state: State, action: Action, byte: u8) where P: Parser { macro_rules! maybe_action { ($action:expr, $arg:expr) => { match $action { - Action::None | Action::Unused__ => (), + Action::None => (), action => { self.perform_action(parser, action, $arg); }, @@ -91,7 +123,7 @@ impl StateMachine { } match state { - State::Anywhere | State::Unused__ => { + State::Anywhere => { // Just run the action self.perform_action(parser, action, byte); }, @@ -114,7 +146,7 @@ impl StateMachine { fn perform_action<P: Parser>(&mut self, parser: &mut P, action: Action, byte: u8) { match action { - Action::Print => parser.print(self, byte as char), + Action::Print => parser.print(byte as char), Action::Execute => parser.execute(self, byte), Action::Hook => parser.hook(self, byte), Action::Put => parser.put(self, byte), @@ -124,7 +156,7 @@ impl StateMachine { Action::Unhook => parser.unhook(self, byte), Action::CsiDispatch => parser.csi_dispatch(self, byte as char), Action::EscDispatch => parser.esc_dispatch(self, byte), - Action::Ignore | Action::None | Action::Unused__=> (), + Action::Ignore | Action::None => (), Action::Collect => { if self.intermediate_idx == MAX_INTERMEDIATES { self.ignoring = true; @@ -155,13 +187,16 @@ impl StateMachine { self.intermediate_idx = 0; self.num_params = 0; self.ignoring = false; - } + }, + Action::BeginUtf8 => { + self.process_utf8(parser, byte); + }, } } } pub trait Parser { - fn print(&mut self, &StateMachine, c: char); + fn print(&mut self, c: char); fn execute(&mut self, &StateMachine, byte: u8); fn hook(&mut self, &StateMachine, byte: u8); fn put(&mut self, &StateMachine, byte: u8); diff --git a/src/table.rs b/src/table.rs index 923c7eb..d2034b8 100644 --- a/src/table.rs +++ b/src/table.rs @@ -6,6 +6,9 @@ use definitions::Action; pub static STATE_CHANGE: [[u8; 256]; 16] = [ + // Beginning of UTF-8 2 byte sequence + // Beginning of UTF-8 3 byte sequence + // Beginning of UTF-8 4 byte sequence @@ -280,11 +283,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = 80u8, 80u8, 80u8, 80u8, 80u8, 80u8, 0u8, 80u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + 0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8], [112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 0u8, 112u8, 0u8, 0u8, 112u8, 112u8, 112u8, @@ -366,7 +371,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = Action::OscStart, // State::OscString Action::None, // State::SosPmApcString Action::None]; - // State::Unused__ + // State::Utf8 pub static EXIT_ACTIONS: &'static [Action] = &[Action::None, // State::Anywhere @@ -384,4 +389,4 @@ pub static EXIT_ACTIONS: &'static [Action] = Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None]; // State::Unused__ + Action::None]; // State::Utf8 diff --git a/src/table.rs.in b/src/table.rs.in index 7414c2a..f5a838d 100644 --- a/src/table.rs.in +++ b/src/table.rs.in @@ -3,7 +3,7 @@ use definitions::Action; -pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { +pub static STATE_CHANGE: [[u8; 256]; 16] = vt_state_table! { State::Anywhere => { 0x18 => (Action::Execute, State::Ground), 0x1a => (Action::Execute, State::Ground), @@ -28,7 +28,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { 0x20...0x7f => Action::Print, 0x80...0x8f => Action::Execute, 0x91...0x9a => Action::Execute, - 0x9c => Action::Execute + 0x9c => Action::Execute, + // Beginning of UTF-8 2 byte sequence + 0xc2...0xdf => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 3 byte sequence + 0xe0...0xef => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 4 byte sequence + 0xf0...0xf4 => (State::Utf8, Action::BeginUtf8), }, State::Escape => { @@ -191,7 +197,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscStart, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; pub static EXIT_ACTIONS: &'static [Action] = &[ @@ -210,5 +216,5 @@ pub static EXIT_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs new file mode 100644 index 0000000..3d099b1 --- /dev/null +++ b/src/utf8/mod.rs @@ -0,0 +1,91 @@ +//! A table-driven UTF-8 Parser +//! +//! This module implements a table-driven UTF-8 parser which should +//! theoretically contain the minimal number of branches (1). The only branch is +//! on the `Action` returned from unpacking a transition. +use std::char; + +mod types; +use self::types::{State, Action, unpack}; + +mod table; +use self::table::TRANSITIONS; + +/// Handles codepoint and invalid sequence events from the parser. +pub trait Receiver { + /// Code point parsed + /// + /// Called with the codepoint + fn codepoint(&mut self, char); + + /// Invalid sequence encountered + fn invalid_sequence(&mut self); +} + +/// A parser for Utf8 Characters +/// +/// Repeatedly call `advance` with bytes to emit Utf8 characters +pub struct Parser { + point: u32, + state: State, +} + +/// Continuation bytes are masked with this value. +const CONTINUATION_MASK: u8 = 0b0011_1111; + +impl Parser { + /// Create a new Parser + pub fn new() -> Parser { + Parser { + point: 0, + state: State::Ground, + } + } + + pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) + where R: Receiver + { + let cur = self.state as usize; + let change = TRANSITIONS[cur][byte as usize]; + let (state, action) = unsafe { unpack(change) }; + + self.perform_action(receiver, byte, action); + self.state = state; + } + + fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) + where R: Receiver + { + match action { + Action::InvalidSequence => { + self.point = 0; + receiver.invalid_sequence(); + }, + Action::EmitByte => { + receiver.codepoint(byte as char); + }, + Action::SetByte1 => { + let point = self.point | ((byte & CONTINUATION_MASK) as u32); + let c = unsafe { char::from_u32_unchecked(point) }; + self.point = 0; + + receiver.codepoint(c); + }, + Action::SetByte2 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; + }, + Action::SetByte2Top => { + self.point |= ((byte & 0b0001_1111) as u32) << 6; + }, + Action::SetByte3 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; + }, + Action::SetByte3Top => { + self.point |= ((byte & 0b0000_1111) as u32) << 12; + }, + Action::SetByte4 => { + self.point |= ((byte & 0b0000_0111) as u32) << 18; + }, + } + } +} diff --git a/src/utf8/table.rs b/src/utf8/table.rs new file mode 100644 index 0000000..5a1292b --- /dev/null +++ b/src/utf8/table.rs @@ -0,0 +1,184 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. +/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// state::ground. +pub static TRANSITIONS: [[u8; 256]; 8] = + [[16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 100u8, 98u8, + 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 101u8, + 98u8, 98u8, 118u8, 113u8, 113u8, 113u8, 119u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]]; diff --git a/src/utf8/table.rs.in b/src/utf8/table.rs.in new file mode 100644 index 0000000..2acafe7 --- /dev/null +++ b/src/utf8/table.rs.in @@ -0,0 +1,60 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. +/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// state::ground. +pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! { + State::Ground => { + 0x00...0x7f => (State::Ground, Action::EmitByte), + 0xc2...0xdf => (State::Tail1, Action::SetByte2Top), + 0xe0 => (State::U3_2_e0, Action::SetByte3Top), + 0xe1...0xec => (State::Tail2, Action::SetByte3Top), + 0xed => (State::U3_2_ed, Action::SetByte3Top), + 0xee...0xef => (State::Tail2, Action::SetByte3Top), + 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), + 0xf1...0xf3 => (State::Tail3, Action::SetByte4), + 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), + }, + State::U3_2_e0 => { + 0xa0...0xbf => (State::Tail1, Action::SetByte2), + }, + State::U3_2_ed => { + 0x80...0x9f => (State::Tail1, Action::SetByte2), + }, + State::Utf8_4_3_f0 => { + 0x90...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Utf8_4_3_f4 => { + 0x80...0x8f => (State::Tail2, Action::SetByte3), + }, + State::Tail3 => { + 0x80...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Tail2 => { + 0x80...0xbf => (State::Tail1, Action::SetByte2), + }, + State::Tail1 => { + 0x80...0xbf => (State::Ground, Action::SetByte1), + }, +}; diff --git a/src/utf8/types.rs b/src/utf8/types.rs new file mode 100644 index 0000000..4c604f4 --- /dev/null +++ b/src/utf8/types.rs @@ -0,0 +1,77 @@ +//! Types supporting the UTF-8 parser +#![allow(non_camel_case_types)] +use std::mem; + +/// States the parser can be in. +/// +/// There is a state for each initial input of the 3 and 4 byte sequences since +/// the following bytes are subject to different conditions than a tail byte. +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum State { + /// Ground state; expect anything + Ground = 0, + /// 3 tail bytes + Tail3 = 1, + /// 2 tail bytes + Tail2 = 2, + /// 1 tail byte + Tail1 = 3, + /// UTF8-3 starting with E0 + U3_2_e0 = 4, + /// UTF8-3 starting with ED + U3_2_ed = 5, + /// UTF8-4 starting with F0 + Utf8_4_3_f0 = 6, + /// UTF8-4 starting with F4 + Utf8_4_3_f4 = 7, +} + +/// Action to take when receiving a byte +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum Action { + /// Unexpected byte; sequence is invalid + InvalidSequence = 0, + /// Received valid 7-bit ASCII byte which can be directly emitted. + EmitByte = 1, + /// Set the bottom continuation byte + SetByte1 = 2, + /// Set the 2nd-from-last continuation byte + SetByte2 = 3, + /// Set the 2nd-from-last byte which is part of a two byte sequence + SetByte2Top = 4, + /// Set the 3rd-from-last continuation byte + SetByte3 = 5, + /// Set the 3rd-from-last byte which is part of a three byte sequence + SetByte3Top = 6, + /// Set the top byte of a four byte sequence. + SetByte4 = 7, +} + +/// Convert a state and action to a u8 +/// +/// State will be the bottom 4 bits and action the top 4 +#[inline] +#[allow(dead_code)] +pub fn pack(state: State, action: Action) -> u8 { + ((action as u8) << 4) | (state as u8) +} + +/// Convert a u8 to a state and action +/// +/// # Unsafety +/// +/// If this function is called with a byte that wasn't encoded with the `pack` +/// function in this module, there is no guarantee that a valid state and action +/// can be produced. +#[inline] +pub unsafe fn unpack(val: u8) -> (State, Action) { + ( + // State is stored in bottom 4 bits + mem::transmute(val & 0x0f), + + // Action is stored in top 4 bits + mem::transmute(val >> 4), + ) +} |