diff options
author | Joe Wilm <joe@jwilm.com> | 2016-09-17 17:02:29 -0700 |
---|---|---|
committer | Joe Wilm <joe@jwilm.com> | 2016-09-17 17:03:25 -0700 |
commit | 917080a5c27b3310daab135f9bfdbc531cb54186 (patch) | |
tree | 29e73dbde735185a6edbf0e7d3b1c354cf6a75b5 /utf8parse/src/lib.rs | |
parent | 85388ab070fbc41c8cce3ffbfbcc0d1d917109e0 (diff) | |
download | r-alacritty-vte-917080a5c27b3310daab135f9bfdbc531cb54186.tar.gz r-alacritty-vte-917080a5c27b3310daab135f9bfdbc531cb54186.tar.bz2 r-alacritty-vte-917080a5c27b3310daab135f9bfdbc531cb54186.zip |
Move utf8 parsing into separate crate
Diffstat (limited to 'utf8parse/src/lib.rs')
-rw-r--r-- | utf8parse/src/lib.rs | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/utf8parse/src/lib.rs b/utf8parse/src/lib.rs new file mode 100644 index 0000000..9585642 --- /dev/null +++ b/utf8parse/src/lib.rs @@ -0,0 +1,93 @@ +//! A table-driven UTF-8 Parser +//! +//! This module implements a table-driven UTF-8 parser which should +//! theoretically contain the minimal number of branches (1). The only branch is +//! on the `Action` returned from unpacking a transition. +use std::char; + +mod types; +use self::types::{State, Action, unpack}; + +mod table; +use self::table::TRANSITIONS; + +/// Handles codepoint and invalid sequence events from the parser. +pub trait Receiver { + /// Called whenever a codepoint is parsed successfully + fn codepoint(&mut self, char); + + /// Called when an invalid_sequence is detected + fn invalid_sequence(&mut self); +} + +/// A parser for Utf8 Characters +/// +/// Repeatedly call `advance` with bytes to emit Utf8 characters +pub struct Parser { + point: u32, + state: State, +} + +/// Continuation bytes are masked with this value. +const CONTINUATION_MASK: u8 = 0b0011_1111; + +impl Parser { + /// Create a new Parser + pub fn new() -> Parser { + Parser { + point: 0, + state: State::Ground, + } + } + + /// Advance the parser + /// + /// The provider receiver will be called whenever a codepoint is completed or an invalid + /// sequence is detected. + pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) + where R: Receiver + { + let cur = self.state as usize; + let change = TRANSITIONS[cur][byte as usize]; + let (state, action) = unsafe { unpack(change) }; + + self.perform_action(receiver, byte, action); + self.state = state; + } + + fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) + where R: Receiver + { + match action { + Action::InvalidSequence => { + self.point = 0; + receiver.invalid_sequence(); + }, + Action::EmitByte => { + receiver.codepoint(byte as char); + }, + Action::SetByte1 => { + let point = self.point | ((byte & CONTINUATION_MASK) as u32); + let c = unsafe { char::from_u32_unchecked(point) }; + self.point = 0; + + receiver.codepoint(c); + }, + Action::SetByte2 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; + }, + Action::SetByte2Top => { + self.point |= ((byte & 0b0001_1111) as u32) << 6; + }, + Action::SetByte3 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; + }, + Action::SetByte3Top => { + self.point |= ((byte & 0b0000_1111) as u32) << 12; + }, + Action::SetByte4 => { + self.point |= ((byte & 0b0000_0111) as u32) << 18; + }, + } + } +} |