diff options
author | Christian Duerr <contact@christianduerr.com> | 2019-12-10 19:16:01 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-12-10 19:16:01 +0100 |
commit | 9d37aa7a71801f3569d2a2a55dc82c37935f205a (patch) | |
tree | fd20b01398034934957c0d311209103482836771 /utf8parse/src/types.rs | |
parent | ea940fcb74abce67b927788e4f9f64fc63073d37 (diff) | |
download | r-alacritty-vte-9d37aa7a71801f3569d2a2a55dc82c37935f205a.tar.gz r-alacritty-vte-9d37aa7a71801f3569d2a2a55dc82c37935f205a.tar.bz2 r-alacritty-vte-9d37aa7a71801f3569d2a2a55dc82c37935f205a.zip |
Remove table generation
This completely removes the `codegen` project, which relied on outdated
libraries to parse DSLs to build the utf8 and vte state tables, to make
the library easier to maintain.
The utf8 table could be completely removed in favor of a `match`
statement, which also led to a performance improvement with the utf8
parser.
The vte table did not benefit from `match` statements at all and instead
had significantly worse performance with them. To replace the old
code generation for vte, the `generate_state_changes` crate has been
created instead, which uses the language's proc_macro feature to create
a `const fn` which will generate the table at compile time.
Diffstat (limited to 'utf8parse/src/types.rs')
-rw-r--r-- | utf8parse/src/types.rs | 121 |
1 files changed, 72 insertions, 49 deletions
diff --git a/utf8parse/src/types.rs b/utf8parse/src/types.rs index 93607fb..5a70b3c 100644 --- a/utf8parse/src/types.rs +++ b/utf8parse/src/types.rs @@ -1,12 +1,31 @@ //! Types supporting the UTF-8 parser -#![allow(non_camel_case_types)] -use core::mem; + +/// Action to take when receiving a byte +#[derive(Debug, Copy, Clone)] +pub enum Action { + /// Unexpected byte; sequence is invalid + InvalidSequence = 0, + /// Received valid 7-bit ASCII byte which can be directly emitted. + EmitByte = 1, + /// Set the bottom continuation byte + SetByte1 = 2, + /// Set the 2nd-from-last continuation byte + SetByte2 = 3, + /// Set the 2nd-from-last byte which is part of a two byte sequence + SetByte2Top = 4, + /// Set the 3rd-from-last continuation byte + SetByte3 = 5, + /// Set the 3rd-from-last byte which is part of a three byte sequence + SetByte3Top = 6, + /// Set the top byte of a four byte sequence. + SetByte4 = 7, +} /// States the parser can be in. /// /// There is a state for each initial input of the 3 and 4 byte sequences since /// the following bytes are subject to different conditions than a tail byte. -#[allow(dead_code)] +#[allow(non_camel_case_types)] #[derive(Debug, Copy, Clone)] pub enum State { /// Ground state; expect anything @@ -33,50 +52,54 @@ impl Default for State { } } -/// Action to take when receiving a byte -#[allow(dead_code)] -#[derive(Debug, Copy, Clone)] -pub enum Action { - /// Unexpected byte; sequence is invalid - InvalidSequence = 0, - /// Received valid 7-bit ASCII byte which can be directly emitted. - EmitByte = 1, - /// Set the bottom continuation byte - SetByte1 = 2, - /// Set the 2nd-from-last continuation byte - SetByte2 = 3, - /// Set the 2nd-from-last byte which is part of a two byte sequence - SetByte2Top = 4, - /// Set the 3rd-from-last continuation byte - SetByte3 = 5, - /// Set the 3rd-from-last byte which is part of a three byte sequence - SetByte3Top = 6, - /// Set the top byte of a four byte sequence. 
- SetByte4 = 7, -} - -/// Convert a state and action to a u8 -/// -/// State will be the bottom 4 bits and action the top 4 -#[inline] -#[allow(dead_code)] -pub fn pack(state: State, action: Action) -> u8 { - ((action as u8) << 4) | (state as u8) -} - -/// Convert a u8 to a state and action -/// -/// # Unsafety -/// -/// If this function is called with a byte that wasn't encoded with the `pack` -/// function in this module, there is no guarantee that a valid state and action -/// can be produced. -#[inline] -pub unsafe fn unpack(val: u8) -> (State, Action) { - ( - // State is stored in bottom 4 bits - mem::transmute(val & 0x0f), - // Action is stored in top 4 bits - mem::transmute(val >> 4), - ) +impl State { + /// Advance the parser state. + /// + /// This takes the current state and input byte into consideration, to determine the next state + /// and any action that should be taken. + #[inline] + pub fn advance(&self, byte: u8) -> (State, Action) { + match self { + State::Ground => match byte { + 0x00..=0x7f => (State::Ground, Action::EmitByte), + 0xc2..=0xdf => (State::Tail1, Action::SetByte2Top), + 0xe0 => (State::U3_2_e0, Action::SetByte3Top), + 0xe1..=0xec => (State::Tail2, Action::SetByte3Top), + 0xed => (State::U3_2_ed, Action::SetByte3Top), + 0xee..=0xef => (State::Tail2, Action::SetByte3Top), + 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), + 0xf1..=0xf3 => (State::Tail3, Action::SetByte4), + 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), + _ => (State::Ground, Action::InvalidSequence), + }, + State::U3_2_e0 => match byte { + 0xa0..=0xbf => (State::Tail1, Action::SetByte2), + _ => (State::Ground, Action::InvalidSequence), + }, + State::U3_2_ed => match byte { + 0x80..=0x9f => (State::Tail1, Action::SetByte2), + _ => (State::Ground, Action::InvalidSequence), + }, + State::Utf8_4_3_f0 => match byte { + 0x90..=0xbf => (State::Tail2, Action::SetByte3), + _ => (State::Ground, Action::InvalidSequence), + }, + State::Utf8_4_3_f4 => match byte { + 0x80..=0x8f 
=> (State::Tail2, Action::SetByte3), + _ => (State::Ground, Action::InvalidSequence), + }, + State::Tail3 => match byte { + 0x80..=0xbf => (State::Tail2, Action::SetByte3), + _ => (State::Ground, Action::InvalidSequence), + }, + State::Tail2 => match byte { + 0x80..=0xbf => (State::Tail1, Action::SetByte2), + _ => (State::Ground, Action::InvalidSequence), + }, + State::Tail1 => match byte { + 0x80..=0xbf => (State::Ground, Action::SetByte1), + _ => (State::Ground, Action::InvalidSequence), + }, + } + } } |