//! A table-driven UTF-8 Parser //! //! This module implements a table-driven UTF-8 parser which should //! theoretically contain the minimal number of branches (1). The only branch is //! on the `Action` returned from unpacking a transition. #![no_std] use core::char; mod types; use self::types::{State, Action, unpack}; mod table; use self::table::TRANSITIONS; /// Handles codepoint and invalid sequence events from the parser. pub trait Receiver { /// Called whenever a codepoint is parsed successfully fn codepoint(&mut self, char); /// Called when an invalid_sequence is detected fn invalid_sequence(&mut self); } /// A parser for Utf8 Characters /// /// Repeatedly call `advance` with bytes to emit Utf8 characters pub struct Parser { point: u32, state: State, } /// Continuation bytes are masked with this value. const CONTINUATION_MASK: u8 = 0b0011_1111; impl Parser { /// Create a new Parser pub fn new() -> Parser { Parser { point: 0, state: State::Ground, } } /// Advance the parser /// /// The provider receiver will be called whenever a codepoint is completed or an invalid /// sequence is detected. pub fn advance(&mut self, receiver: &mut R, byte: u8) where R: Receiver { let cur = self.state as usize; let change = TRANSITIONS[cur][byte as usize]; let (state, action) = unsafe { unpack(change) }; self.perform_action(receiver, byte, action); self.state = state; } fn perform_action(&mut self, receiver: &mut R, byte: u8, action: Action) where R: Receiver { match action { Action::InvalidSequence => { self.point = 0; receiver.invalid_sequence(); }, Action::EmitByte => { receiver.codepoint(byte as char); }, Action::SetByte1 => { let point = self.point | ((byte & CONTINUATION_MASK) as u32); let c = unsafe { char::from_u32_unchecked(point) }; self.point = 0; receiver.codepoint(c); }, Action::SetByte2 => { self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; }, Action::SetByte2Top => { self.point |= ((byte & 0b0001_1111) as u32) << 6; }, Action::SetByte3 => { self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; }, Action::SetByte3Top => { self.point |= ((byte & 0b0000_1111) as u32) << 12; }, Action::SetByte4 => { self.point |= ((byte & 0b0000_0111) as u32) << 18; }, } } } #[cfg(test)] #[macro_use] extern crate std; #[cfg(test)] mod tests { use std::io::Read; use std::fs::File; use std::string::String; use Receiver; use Parser; impl Receiver for String { fn codepoint(&mut self, c: char) { self.push(c); } fn invalid_sequence(&mut self) { } } #[test] fn utf8parse_test() { let mut buffer = String::new(); let mut file = File::open("src/UTF-8-demo.txt").unwrap(); let mut parser = Parser::new(); // read the file to a buffer file.read_to_string(&mut buffer).expect("Reading file to string"); // standard library implementation let expected = String::from_utf8(buffer.as_bytes().to_vec()).unwrap(); // utf8parse implementation let mut actual = String::new(); for byte in buffer.as_bytes().to_vec() { parser.advance(&mut actual, byte) } assert_eq!(actual, expected); } }