diff options
author | Joe Wilm <joe@jwilm.com> | 2016-09-17 15:51:45 -0700 |
---|---|---|
committer | Joe Wilm <joe@jwilm.com> | 2016-09-17 17:03:20 -0700 |
commit | cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch) | |
tree | 26603abf607d21eefd3b9a6ac79a36dab63b5781 | |
parent | 930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff) | |
download | r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2 r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip |
Add support for UTF-8
This adds a table-driven UTF-8 parser which only has a single branch for
the entire parser. UTF-8 support is essentially bolted onto the VTE
parser. It is not the most elegant approach, but it does prevent the transition tables
from blowing up.
Instead of refactoring the syntax extension to handle both table
definitions, I've opted to copy/paste now for both simplicity's sake and
because I can't see a clear path to a minimal shared solution.
-rw-r--r-- | codegen/src/ext/mod.rs | 2 | ||||
-rw-r--r-- | codegen/src/ext/utf8.rs | 386 | ||||
-rw-r--r-- | codegen/src/ext/vt.rs (renamed from codegen/src/ext.rs) | 10 | ||||
-rw-r--r-- | codegen/src/main.rs | 15 | ||||
-rw-r--r-- | examples/parselog.rs | 2 | ||||
-rw-r--r-- | src/definitions.rs | 8 | ||||
-rw-r--r-- | src/lib.rs | 49 | ||||
-rw-r--r-- | src/table.rs | 19 | ||||
-rw-r--r-- | src/table.rs.in | 14 | ||||
-rw-r--r-- | src/utf8/mod.rs | 91 | ||||
-rw-r--r-- | src/utf8/table.rs | 184 | ||||
-rw-r--r-- | src/utf8/table.rs.in | 60 | ||||
-rw-r--r-- | src/utf8/types.rs | 77 |
13 files changed, 887 insertions, 30 deletions
diff --git a/codegen/src/ext/mod.rs b/codegen/src/ext/mod.rs new file mode 100644 index 0000000..c28d9f7 --- /dev/null +++ b/codegen/src/ext/mod.rs @@ -0,0 +1,2 @@ +pub mod utf8; +pub mod vt; diff --git a/codegen/src/ext/utf8.rs b/codegen/src/ext/utf8.rs new file mode 100644 index 0000000..5b73081 --- /dev/null +++ b/codegen/src/ext/utf8.rs @@ -0,0 +1,386 @@ +//! Macro expansion for the utf8 parser state table +use std::fmt; + +use syntex::Registry; + +use syntex_syntax::ast::{self, ExprKind, Arm, Expr, PatKind, LitKind, Pat}; +use syntex_syntax::codemap::Span; +use syntex_syntax::ext::base::{ExtCtxt, MacEager, MacResult, DummyResult}; +use syntex_syntax::ext::build::AstBuilder; +use syntex_syntax::parse::token::{Token, DelimToken}; +use syntex_syntax::parse::parser::Parser; +use syntex_syntax::parse::PResult; +use syntex_syntax::ptr::P; +use syntex_syntax::tokenstream::TokenTree; + +#[path="../../../src/utf8/types.rs"] +mod types; + +use self::types::{State, Action, pack}; + +pub fn register(registry: &mut Registry) { + registry.add_macro("utf8_state_table", expand_state_table); +} + +fn state_from_str<S>(s: &S) -> Result<State, ()> + where S: AsRef<str> +{ + Ok(match s.as_ref() { + "State::Ground" => State::Ground, + "State::Tail3" => State::Tail3, + "State::Tail2" => State::Tail2, + "State::Tail1" => State::Tail1, + "State::U3_2_e0" => State::U3_2_e0, + "State::U3_2_ed" => State::U3_2_ed, + "State::Utf8_4_3_f0" => State::Utf8_4_3_f0, + "State::Utf8_4_3_f4" => State::Utf8_4_3_f4, + _ => return Err(()) + }) +} + +fn action_from_str<S>(s: &S) -> Result<Action, ()> + where S: AsRef<str> +{ + Ok(match s.as_ref() { + "Action::InvalidSequence" => Action::InvalidSequence, + "Action::EmitByte" => Action::EmitByte, + "Action::SetByte1" => Action::SetByte1, + "Action::SetByte2" => Action::SetByte2, + "Action::SetByte2Top" => Action::SetByte2Top, + "Action::SetByte3" => Action::SetByte3, + "Action::SetByte3Top" => Action::SetByte3Top, + "Action::SetByte4" => 
Action::SetByte4, + _ => return Err(()) + }) +} + +fn parse_table_input_mappings<'a>(parser: &mut Parser<'a>) -> PResult<'a, Vec<Arm>> { + // Must start on open brace + try!(parser.expect(&Token::OpenDelim(DelimToken::Brace))); + + let mut arms: Vec<Arm> = Vec::new(); + while parser.token != Token::CloseDelim(DelimToken::Brace) { + match parser.parse_arm() { + Ok(arm) => arms.push(arm), + Err(e) => { + // Recover by skipping to the end of the block. + return Err(e); + } + } + } + + // Consume the closing brace + parser.bump(); + Ok(arms) +} + +/// Expressions describing state transitions and actions +#[derive(Debug)] +struct TableDefinitionExprs { + state_expr: P<Expr>, + mapping_arms: Vec<Arm>, +} + +fn state_from_expr(expr: P<Expr>, cx: &mut ExtCtxt) -> Result<State, ()> { + let s = match expr.node { + ExprKind::Path(ref _qself, ref path) => { + path.to_string() + }, + _ => { + cx.span_err(expr.span, "expected State"); + return Err(()) + } + }; + + state_from_str(&s).map_err(|_| { + cx.span_err(expr.span, "expected State"); + () + }) +} + +fn u8_lit_from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<u8, ()> { + static MSG: &'static str = "expected u8 int literal"; + + match expr.node { + ExprKind::Lit(ref lit) => { + match lit.node { + LitKind::Int(val, _) => { + Ok(val as u8) + }, + _ => { + cx.span_err(lit.span, MSG); + return Err(()); + } + } + }, + _ => { + cx.span_err(expr.span, MSG); + return Err(()); + } + } +} + +fn input_mapping_from_arm(arm: Arm, cx: &mut ExtCtxt) -> Result<InputMapping, ()> { + let Arm { pats, body, .. 
} = arm; + + let input = try!(InputDefinition::from_pat(&pats[0], cx)); + let transition = try!(Transition::from_expr(&body, cx)); + + Ok(InputMapping { + input: input, + transition: transition, + }) +} + +/// What happens when certain input is received +#[derive(Copy, Clone)] +enum Transition { + State(State), + Action(Action), + StateAction(State, Action), +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Transition::State(state) => try!(write!(f, "State({:?})", state)), + Transition::Action(action) => try!(write!(f, "Action({:?})", action)), + Transition::StateAction(state, action) => { + try!(write!(f, "StateAction({:?}, {:?})", state, action)); + } + } + + write!(f, " -> {:?}", self.pack_u8()) + } +} + +impl Transition { + // State is stored in the top 4 bits + fn pack_u8(&self) -> u8 { + match *self { + Transition::State(state) => pack(state, Action::InvalidSequence), + Transition::Action(action) => pack(State::Ground, action), + Transition::StateAction(state, action) => pack(state, action), + } + } +} + +impl Transition { + fn from_expr(expr: &Expr, cx: &mut ExtCtxt) -> Result<Transition, ()> { + match expr.node { + ExprKind::Tup(ref tup_exprs) => { + let mut action = None; + let mut state = None; + + for tup_expr in tup_exprs { + if let ExprKind::Path(_, ref path) = tup_expr.node { + let path_str = path.to_string(); + if path_str.starts_with('A') { + action = Some(try!(action_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid action"); + }))); + } else { + state = Some(try!(state_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid state"); + }))); + } + } + } + + match (action, state) { + (Some(action), Some(state)) => Ok(Transition::StateAction(state, action)), + (None, Some(state)) => Ok(Transition::State(state)), + (Some(action), None) => Ok(Transition::Action(action)), + _ => { + cx.span_err(expr.span, "expected Action and/or State"); + Err(()) + } + } 
+ }, + ExprKind::Path(_, ref path) => { + // Path can be Action or State + let path_str = path.to_string(); + + if path_str.starts_with('A') { + let action = try!(action_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid action"); + })); + Ok(Transition::Action(action)) + } else { + let state = try!(state_from_str(&path_str) + .map_err(|_| { + cx.span_err(expr.span, "invalid state"); + })); + + Ok(Transition::State(state)) + } + }, + _ => { + cx.span_err(expr.span, "expected Action and/or State"); + Err(()) + } + } + } +} + +#[derive(Debug)] +enum InputDefinition { + Specific(u8), + Range { start: u8, end: u8 } +} + +impl InputDefinition { + fn from_pat(pat: &Pat, cx: &mut ExtCtxt) -> Result<InputDefinition, ()> { + Ok(match pat.node { + PatKind::Lit(ref lit_expr) => { + InputDefinition::Specific(try!(u8_lit_from_expr(&lit_expr, cx))) + }, + PatKind::Range(ref start_expr, ref end_expr) => { + InputDefinition::Range { + start: try!(u8_lit_from_expr(start_expr, cx)), + end: try!(u8_lit_from_expr(end_expr, cx)), + } + }, + _ => { + cx.span_err(pat.span, "expected literal or range expression"); + return Err(()) + } + }) + } +} + +#[derive(Debug)] +struct InputMapping { + input: InputDefinition, + transition: Transition, +} + +#[derive(Debug)] +struct TableDefinition { + state: State, + mappings: Vec<InputMapping>, +} + +fn parse_raw_definitions( + definitions: Vec<TableDefinitionExprs>, + cx: &mut ExtCtxt +) -> Result<Vec<TableDefinition>, ()> { + let mut out = Vec::new(); + + for raw in definitions { + let TableDefinitionExprs { state_expr, mapping_arms } = raw; + let state = try!(state_from_expr(state_expr, cx)); + + let mut mappings = Vec::new(); + for arm in mapping_arms { + mappings.push(try!(input_mapping_from_arm(arm, cx))); + } + + out.push(TableDefinition { + state: state, + mappings: mappings, + }) + } + + Ok(out) +} + +fn parse_table_definition<'a>(parser: &mut Parser<'a>) -> PResult<'a, TableDefinitionExprs> { + let state_expr = 
try!(parser.parse_expr()); + try!(parser.expect(&Token::FatArrow)); + let mappings = try!(parse_table_input_mappings(parser)); + + Ok(TableDefinitionExprs { + state_expr: state_expr, + mapping_arms: mappings + }) +} + +fn parse_table_definition_list<'a>(parser: &mut Parser<'a>) + -> PResult<'a, Vec<TableDefinitionExprs>> +{ + let mut definitions = Vec::new(); + while parser.token != Token::Eof { + definitions.push(try!(parse_table_definition(parser))); + parser.eat(&Token::Comma); + } + + Ok(definitions) +} + +fn build_state_tables<T>(defs: T) -> [[u8; 256]; 8] + where T: AsRef<[TableDefinition]> +{ + let mut result = [[0u8; 256]; 8]; + + for def in defs.as_ref() { + let state = def.state; + let state = state as u8; + let transitions = &mut result[state as usize]; + + for mapping in &def.mappings { + let trans = mapping.transition.pack_u8(); + match mapping.input { + InputDefinition::Specific(idx) => { + transitions[idx as usize] = trans; + }, + InputDefinition::Range { start, end } => { + for idx in start..end { + transitions[idx as usize] = trans; + } + transitions[end as usize] = trans; + }, + } + } + } + + result +} + +fn build_table_ast(cx: &mut ExtCtxt, sp: Span, table: [[u8; 256]; 8]) -> P<ast::Expr> { + let table = table.iter() + .map(|list| { + let exprs = list.iter() + .map(|num| cx.expr_u8(sp, *num)) + .collect(); + cx.expr_vec(sp, exprs) + }) + .collect(); + + cx.expr_vec(sp, table) +} + +fn expand_state_table<'cx>( + cx: &'cx mut ExtCtxt, + sp: Span, + args: &[TokenTree]) + -> Box<MacResult + 'cx> +{ + macro_rules! 
ptry { + ($pres:expr) => { + match $pres { + Ok(val) => val, + Err(mut err) => { + err.emit(); + return DummyResult::any(sp); + } + } + } + } + + // Parse the lookup spec + let mut parser: Parser = cx.new_parser_from_tts(args); + let definitions = ptry!(parse_table_definition_list(&mut parser)); + let definitions = match parse_raw_definitions(definitions, cx) { + Ok(definitions) => definitions, + Err(_) => return DummyResult::any(sp), + }; + + let table = build_state_tables(&definitions); + let ast = build_table_ast(cx, sp, table); + + MacEager::expr(ast) +} diff --git a/codegen/src/ext.rs b/codegen/src/ext/vt.rs index cef2267..3f5bcf3 100644 --- a/codegen/src/ext.rs +++ b/codegen/src/ext/vt.rs @@ -1,3 +1,4 @@ +//! Macro expansion for the virtual terminal parser state table use std::fmt; use syntex::Registry; @@ -12,10 +13,13 @@ use syntex_syntax::parse::PResult; use syntex_syntax::ptr::P; use syntex_syntax::tokenstream::TokenTree; -use definitions::{State, Action}; +#[path="../../../src/definitions.rs"] +mod definitions; + +use self::definitions::{State, Action}; pub fn register(registry: &mut Registry) { - registry.add_macro("state_table", expand_state_table); + registry.add_macro("vt_state_table", expand_state_table); } fn state_from_str<S>(s: &S) -> Result<State, ()> @@ -37,6 +41,7 @@ fn state_from_str<S>(s: &S) -> Result<State, ()> "State::Ground" => State::Ground, "State::OscString" => State::OscString, "State::SosPmApcString" => State::SosPmApcString, + "State::Utf8" => State::Utf8, _ => return Err(()) }) } @@ -60,6 +65,7 @@ fn action_from_str<S>(s: &S) -> Result<Action, ()> "Action::Print" => Action::Print, "Action::Put" => Action::Put, "Action::Unhook" => Action::Unhook, + "Action::BeginUtf8" => Action::BeginUtf8, _ => return Err(()) }) } diff --git a/codegen/src/main.rs b/codegen/src/main.rs index 64bddd9..5f8d153 100644 --- a/codegen/src/main.rs +++ b/codegen/src/main.rs @@ -1,18 +1,23 @@ +#![allow(dead_code)] extern crate syntex; extern crate 
syntex_syntax; mod ext; -#[path="../../src/definitions.rs"] -pub mod definitions; - use std::path::Path; fn main() { + // Expand VT parser state table + let mut registry = syntex::Registry::new(); + ext::vt::register(&mut registry); let src = &Path::new("../src/table.rs.in"); let dst = &Path::new("../src/table.rs"); + registry.expand("vt_state_table", src, dst).expect("expand vt_stable_table ok"); + // Expand UTF8 parser state table let mut registry = syntex::Registry::new(); - ext::register(&mut registry); - registry.expand("state_table", src, dst).expect("expand stable_table ok"); + ext::utf8::register(&mut registry); + let src = &Path::new("../src/utf8/table.rs.in"); + let dst = &Path::new("../src/utf8/table.rs"); + registry.expand("utf8_state_table", src, dst).expect("expand utf8_stable_table ok"); } diff --git a/examples/parselog.rs b/examples/parselog.rs index 804c399..f4ae86a 100644 --- a/examples/parselog.rs +++ b/examples/parselog.rs @@ -9,7 +9,7 @@ use vtparse::{StateMachine, Parser}; struct Log; impl Parser for Log { - fn print(&mut self, _machine: &StateMachine, c: char) { + fn print(&mut self, c: char) { println!("[print] {:?}", c); } fn execute(&mut self, _machine: &StateMachine, byte: u8) { diff --git a/src/definitions.rs b/src/definitions.rs index 5177ca6..ded49cf 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -15,7 +15,7 @@ pub enum State { Ground = 12, OscString = 13, SosPmApcString = 14, - Unused__ = 15, + Utf8 = 15, } #[derive(Debug, Clone, Copy)] @@ -35,7 +35,7 @@ pub enum Action { Print = 12, Put = 13, Unhook = 14, - Unused__ = 15, + BeginUtf8 = 15, } /// Unpack a u8 into a State and Action @@ -67,12 +67,12 @@ mod tests { } match unpack(0x0f) { - (State::Unused__, Action::None) => (), + (State::Utf8, Action::None) => (), _ => panic!("unpack failed"), } match unpack(0xff) { - (State::Unused__, Action::Unused__) => (), + (State::Utf8, Action::BeginUtf8) => (), _ => panic!("unpack failed"), } } @@ -1,5 +1,6 @@ mod table; mod 
definitions; +mod utf8; pub use definitions::{Action, State, unpack}; @@ -27,6 +28,20 @@ impl State { const MAX_INTERMEDIATES: usize = 2; const MAX_PARAMS: usize = 16; +struct VtUtf8Receiver<'a, P: Parser + 'a>(&'a mut P, &'a mut State); + +impl<'a, P: Parser> utf8::Receiver for VtUtf8Receiver<'a, P> { + fn codepoint(&mut self, c: char) { + self.0.print(c); + *self.1 = State::Ground; + } + + fn invalid_sequence(&mut self) { + self.0.print('�'); + *self.1 = State::Ground; + } +} + /// ANSI VTE Parser /// /// As described in http://vt100.net/emu/dec_ansi_parser @@ -38,7 +53,8 @@ pub struct StateMachine { intermediate_idx: usize, params: [i64; MAX_PARAMS], num_params: usize, - ignoring: bool + ignoring: bool, + utf8_parser: utf8::Parser, } impl StateMachine { @@ -50,6 +66,7 @@ impl StateMachine { params: [0i64; MAX_PARAMS], num_params: 0, ignoring: false, + utf8_parser: utf8::Parser::new(), } } @@ -62,6 +79,12 @@ impl StateMachine { } pub fn advance<P: Parser>(&mut self, parser: &mut P, byte: u8) { + // Utf8 characters are handled out-of-band. + if let State::Utf8 = self.state { + self.process_utf8(parser, byte); + return; + } + // Handle state changes in the anywhere state before evaluating changes // for current state. let mut change = STATE_CHANGE[State::Anywhere as usize][byte as usize]; @@ -76,13 +99,22 @@ impl StateMachine { self.perform_state_change(parser, state, action, byte); } + #[inline] + fn process_utf8<P>(&mut self, parser: &mut P, byte: u8) + where P: Parser + { + let mut receiver = VtUtf8Receiver(parser, &mut self.state); + let utf8_parser = &mut self.utf8_parser; + utf8_parser.advance(&mut receiver, byte); + } + fn perform_state_change<P>(&mut self, parser: &mut P, state: State, action: Action, byte: u8) where P: Parser { macro_rules! 
maybe_action { ($action:expr, $arg:expr) => { match $action { - Action::None | Action::Unused__ => (), + Action::None => (), action => { self.perform_action(parser, action, $arg); }, @@ -91,7 +123,7 @@ impl StateMachine { } match state { - State::Anywhere | State::Unused__ => { + State::Anywhere => { // Just run the action self.perform_action(parser, action, byte); }, @@ -114,7 +146,7 @@ impl StateMachine { fn perform_action<P: Parser>(&mut self, parser: &mut P, action: Action, byte: u8) { match action { - Action::Print => parser.print(self, byte as char), + Action::Print => parser.print(byte as char), Action::Execute => parser.execute(self, byte), Action::Hook => parser.hook(self, byte), Action::Put => parser.put(self, byte), @@ -124,7 +156,7 @@ impl StateMachine { Action::Unhook => parser.unhook(self, byte), Action::CsiDispatch => parser.csi_dispatch(self, byte as char), Action::EscDispatch => parser.esc_dispatch(self, byte), - Action::Ignore | Action::None | Action::Unused__=> (), + Action::Ignore | Action::None => (), Action::Collect => { if self.intermediate_idx == MAX_INTERMEDIATES { self.ignoring = true; @@ -155,13 +187,16 @@ impl StateMachine { self.intermediate_idx = 0; self.num_params = 0; self.ignoring = false; - } + }, + Action::BeginUtf8 => { + self.process_utf8(parser, byte); + }, } } } pub trait Parser { - fn print(&mut self, &StateMachine, c: char); + fn print(&mut self, c: char); fn execute(&mut self, &StateMachine, byte: u8); fn hook(&mut self, &StateMachine, byte: u8); fn put(&mut self, &StateMachine, byte: u8); diff --git a/src/table.rs b/src/table.rs index 923c7eb..d2034b8 100644 --- a/src/table.rs +++ b/src/table.rs @@ -6,6 +6,9 @@ use definitions::Action; pub static STATE_CHANGE: [[u8; 256]; 16] = [ + // Beginning of UTF-8 2 byte sequence + // Beginning of UTF-8 3 byte sequence + // Beginning of UTF-8 4 byte sequence @@ -280,11 +283,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = 80u8, 80u8, 80u8, 80u8, 80u8, 80u8, 0u8, 80u8, 0u8, 0u8, 0u8, 
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + 0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8], [112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 0u8, 112u8, 0u8, 0u8, 112u8, 112u8, 112u8, @@ -366,7 +371,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = Action::OscStart, // State::OscString Action::None, // State::SosPmApcString Action::None]; - // State::Unused__ + // State::Utf8 pub static EXIT_ACTIONS: &'static [Action] = &[Action::None, // State::Anywhere @@ -384,4 +389,4 @@ pub static EXIT_ACTIONS: &'static [Action] = Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None]; // State::Unused__ + Action::None]; // State::Utf8 diff --git a/src/table.rs.in b/src/table.rs.in index 7414c2a..f5a838d 100644 --- a/src/table.rs.in +++ b/src/table.rs.in @@ -3,7 +3,7 @@ use definitions::Action; -pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { +pub static STATE_CHANGE: [[u8; 256]; 16] = vt_state_table! 
{ State::Anywhere => { 0x18 => (Action::Execute, State::Ground), 0x1a => (Action::Execute, State::Ground), @@ -28,7 +28,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { 0x20...0x7f => Action::Print, 0x80...0x8f => Action::Execute, 0x91...0x9a => Action::Execute, - 0x9c => Action::Execute + 0x9c => Action::Execute, + // Beginning of UTF-8 2 byte sequence + 0xc2...0xdf => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 3 byte sequence + 0xe0...0xef => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 4 byte sequence + 0xf0...0xf4 => (State::Utf8, Action::BeginUtf8), }, State::Escape => { @@ -191,7 +197,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscStart, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; pub static EXIT_ACTIONS: &'static [Action] = &[ @@ -210,5 +216,5 @@ pub static EXIT_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs new file mode 100644 index 0000000..3d099b1 --- /dev/null +++ b/src/utf8/mod.rs @@ -0,0 +1,91 @@ +//! A table-driven UTF-8 Parser +//! +//! This module implements a table-driven UTF-8 parser which should +//! theoretically contain the minimal number of branches (1). The only branch is +//! on the `Action` returned from unpacking a transition. +use std::char; + +mod types; +use self::types::{State, Action, unpack}; + +mod table; +use self::table::TRANSITIONS; + +/// Handles codepoint and invalid sequence events from the parser. 
+pub trait Receiver { + /// Code point parsed + /// + /// Called with the codepoint + fn codepoint(&mut self, char); + + /// Invalid sequence encountered + fn invalid_sequence(&mut self); +} + +/// A parser for Utf8 Characters +/// +/// Repeatedly call `advance` with bytes to emit Utf8 characters +pub struct Parser { + point: u32, + state: State, +} + +/// Continuation bytes are masked with this value. +const CONTINUATION_MASK: u8 = 0b0011_1111; + +impl Parser { + /// Create a new Parser + pub fn new() -> Parser { + Parser { + point: 0, + state: State::Ground, + } + } + + pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) + where R: Receiver + { + let cur = self.state as usize; + let change = TRANSITIONS[cur][byte as usize]; + let (state, action) = unsafe { unpack(change) }; + + self.perform_action(receiver, byte, action); + self.state = state; + } + + fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) + where R: Receiver + { + match action { + Action::InvalidSequence => { + self.point = 0; + receiver.invalid_sequence(); + }, + Action::EmitByte => { + receiver.codepoint(byte as char); + }, + Action::SetByte1 => { + let point = self.point | ((byte & CONTINUATION_MASK) as u32); + let c = unsafe { char::from_u32_unchecked(point) }; + self.point = 0; + + receiver.codepoint(c); + }, + Action::SetByte2 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; + }, + Action::SetByte2Top => { + self.point |= ((byte & 0b0001_1111) as u32) << 6; + }, + Action::SetByte3 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; + }, + Action::SetByte3Top => { + self.point |= ((byte & 0b0000_1111) as u32) << 12; + }, + Action::SetByte4 => { + self.point |= ((byte & 0b0000_0111) as u32) << 18; + }, + } + } +} diff --git a/src/utf8/table.rs b/src/utf8/table.rs new file mode 100644 index 0000000..5a1292b --- /dev/null +++ b/src/utf8/table.rs @@ -0,0 +1,184 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. 
This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. +/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// state::ground. +pub static TRANSITIONS: [[u8; 256]; 8] = + [[16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 
67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 100u8, 98u8, + 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 101u8, + 98u8, 98u8, 118u8, 113u8, 113u8, 113u8, 119u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 
0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 
32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 
82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]]; diff --git a/src/utf8/table.rs.in b/src/utf8/table.rs.in new file mode 100644 index 
0000000..2acafe7 --- /dev/null +++ b/src/utf8/table.rs.in @@ -0,0 +1,60 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. +/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// State::Ground. +pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! { + State::Ground => { + 0x00...0x7f => (State::Ground, Action::EmitByte), + 0xc2...0xdf => (State::Tail1, Action::SetByte2Top), + 0xe0 => (State::U3_2_e0, Action::SetByte3Top), + 0xe1...0xec => (State::Tail2, Action::SetByte3Top), + 0xed => (State::U3_2_ed, Action::SetByte3Top), + 0xee...0xef => (State::Tail2, Action::SetByte3Top), + 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), + 0xf1...0xf3 => (State::Tail3, Action::SetByte4), + 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), + }, + State::U3_2_e0 => { + 0xa0...0xbf => (State::Tail1, Action::SetByte2), + }, + State::U3_2_ed => { + 0x80...0x9f => (State::Tail1, Action::SetByte2), + }, + State::Utf8_4_3_f0 => { + 0x90...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Utf8_4_3_f4 => { + 0x80...0x8f => (State::Tail2, Action::SetByte3), + }, + State::Tail3 => { + 0x80...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Tail2 => { + 0x80...0xbf => (State::Tail1, Action::SetByte2), + }, + State::Tail1 => { + 0x80...0xbf => (State::Ground, 
// Types supporting the UTF-8 parser.
use std::mem;

/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
///
/// The enum is `#[repr(u8)]` so that every variant is exactly one byte wide
/// with the discriminant listed below; `pack` and `unpack` rely on this when
/// moving values through a `u8`.
// The variant names mirror the grammar productions, hence the non-camel-case
// allowance; `dead_code` is kept from the original definition.
#[allow(dead_code, non_camel_case_types)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(u8)]
pub enum State {
    /// Ground state; expect anything
    Ground = 0,
    /// 3 tail bytes
    Tail3 = 1,
    /// 2 tail bytes
    Tail2 = 2,
    /// 1 tail byte
    Tail1 = 3,
    /// UTF8-3 starting with E0
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED
    U3_2_ed = 5,
    /// UTF8-4 starting with F0
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4
    Utf8_4_3_f4 = 7,
}

/// Action to take when receiving a byte.
///
/// Like `State`, this enum is `#[repr(u8)]` so that it can be round-tripped
/// through the upper four bits of a packed `u8`.
#[allow(dead_code)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[repr(u8)]
pub enum Action {
    /// Unexpected byte; sequence is invalid
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom (last) continuation byte
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte
    SetByte2 = 3,
    /// Set the 2nd-from-last byte when it is the leading byte of a two byte
    /// sequence
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte
    SetByte3 = 5,
    /// Set the 3rd-from-last byte when it is the leading byte of a three byte
    /// sequence
    SetByte3Top = 6,
    /// Set the top (leading) byte of a four byte sequence.
    SetByte4 = 7,
}

/// Convert a state and action to a u8
///
/// State will be the bottom 4 bits and action the top 4. The result can be
/// turned back into the original pair with `unpack`.
#[inline]
#[allow(dead_code)]
pub fn pack(state: State, action: Action) -> u8 {
    ((action as u8) << 4) | (state as u8)
}

/// Convert a u8 to a state and action
///
/// Returns the `State` stored in the bottom 4 bits of `val` and the `Action`
/// stored in the top 4 bits.
///
/// # Safety
///
/// `val` must have been produced by the `pack` function in this module. For
/// any other byte there is no guarantee that the low or high nibble names an
/// existing variant, and converting such a value is undefined behavior. Debug
/// builds check this precondition and panic instead.
#[inline]
pub unsafe fn unpack(val: u8) -> (State, Action) {
    // State is stored in bottom 4 bits
    let state_bits = val & 0x0f;
    // Action is stored in top 4 bits
    let action_bits = val >> 4;

    // Debug-only guards: release builds stay branch-free.
    debug_assert!(
        state_bits <= State::Utf8_4_3_f4 as u8,
        "byte was not produced by pack: invalid state bits"
    );
    debug_assert!(
        action_bits <= Action::SetByte4 as u8,
        "byte was not produced by pack: invalid action bits"
    );

    // SAFETY: both enums are `#[repr(u8)]`, so a `u8` holding a valid
    // discriminant has the same layout as the enum; the caller guarantees
    // that `val` came from `pack`, which only emits valid discriminants.
    (
        mem::transmute::<u8, State>(state_bits),
        mem::transmute::<u8, Action>(action_bits),
    )
}