diff options
author | Joe Wilm <joe@jwilm.com> | 2016-09-17 15:51:45 -0700 |
---|---|---|
committer | Joe Wilm <joe@jwilm.com> | 2016-09-17 17:03:20 -0700 |
commit | cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch) | |
tree | 26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/types.rs | |
parent | 930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff) | |
download | r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2 r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip |
Add support for UTF-8
This adds a table-driven UTF-8 parser which only has a single branch for
the entire parser. UTF-8 support is essentially bolted onto the VTE
parser. Not the most elegant, but it does prevent the transition tables
from blowing up.
Instead of refactoring the syntax extension to handle both table
definitions, I've opted to copy/paste for now, both for simplicity's sake and
because I can't see a clear path to a minimal shared solution.
Diffstat (limited to 'src/utf8/types.rs')
-rw-r--r-- | src/utf8/types.rs | 77 |
1 files changed, 77 insertions, 0 deletions
//! Types supporting the UTF-8 parser
#![allow(non_camel_case_types)]
use std::mem;

/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
///
/// The type is `#[repr(u8)]` with explicit discriminants because `unpack`
/// rebuilds a `State` directly from the low nibble of a packed byte; without a
/// fixed representation that transmute would rely on unspecified layout.
// NOTE(review): `dead_code` is presumably allowed because some variants are
// only ever produced through `unpack` -- confirm against the transition table.
#[allow(dead_code)]
#[repr(u8)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything
    Ground = 0,
    /// 3 tail bytes
    Tail3 = 1,
    /// 2 tail bytes
    Tail2 = 2,
    /// 1 tail byte
    Tail1 = 3,
    /// UTF8-3 starting with E0
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED
    U3_2_ed = 5,
    /// UTF8-4 starting with F0
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4
    Utf8_4_3_f4 = 7,
}

/// Action to take when receiving a byte
///
/// Like `State`, this is `#[repr(u8)]` with explicit discriminants so that
/// `unpack` can rebuild an `Action` from the high nibble of a packed byte.
// NOTE(review): `dead_code` is presumably allowed because some variants are
// only ever produced through `unpack` -- confirm against the transition table.
#[allow(dead_code)]
#[repr(u8)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom continuation byte
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte
    SetByte2 = 3,
    /// Set the 2nd-from-last byte which is part of a two byte sequence
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte
    SetByte3 = 5,
    /// Set the 3rd-from-last byte which is part of a three byte sequence
    SetByte3Top = 6,
    /// Set the top byte of a four byte sequence.
    SetByte4 = 7,
}

/// Convert a state and action to a u8
///
/// State will be the bottom 4 bits and action the top 4. Every discriminant of
/// both enums is at most 7, so neither nibble can overflow into the other.
#[inline]
#[allow(dead_code)]
pub fn pack(state: State, action: Action) -> u8 {
    ((action as u8) << 4) | (state as u8)
}

/// Convert a u8 to a state and action
///
/// This is the inverse of `pack`: the state is read from the bottom 4 bits and
/// the action from the top 4 bits.
///
/// # Safety
///
/// `val` must have been produced by the `pack` function in this module, or
/// equivalently both of its nibbles must be in the range `0..=7`. Any other
/// value does not correspond to a declared variant, and turning it into a
/// `State` or `Action` is undefined behaviour. Debug builds check this with an
/// assertion; release builds perform no check.
#[inline]
pub unsafe fn unpack(val: u8) -> (State, Action) {
    let state_bits = val & 0x0f;
    let action_bits = val >> 4;

    // Catch contract violations early in debug builds instead of silently
    // materialising an invalid enum value.
    debug_assert!(
        state_bits <= (State::Utf8_4_3_f4 as u8) && action_bits <= (Action::SetByte4 as u8),
        "unpack called with a byte that was not produced by pack: {:#04x}",
        val
    );

    (
        // State is stored in bottom 4 bits
        mem::transmute::<u8, State>(state_bits),

        // Action is stored in top 4 bits
        mem::transmute::<u8, Action>(action_bits),
    )
}