aboutsummaryrefslogtreecommitdiff
path: root/src/utf8/mod.rs
diff options
context:
space:
mode:
authorJoe Wilm <joe@jwilm.com>2016-09-17 15:51:45 -0700
committerJoe Wilm <joe@jwilm.com>2016-09-17 17:03:20 -0700
commitcffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch)
tree26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/mod.rs
parent930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff)
downloadr-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip
Add support for UTF-8
This adds a table-driven UTF-8 parser which only has a single branch for the entire parser. UTF-8 support is essentially bolted onto the VTE parser. Not the most elegant, but it does prevent the transition tables from blowing up. Instead of refactoring the syntax extension to handle both table definitions, I've opted to copy/paste now, both for simplicity's sake and because I can't see a clear path to a minimal shared solution.
Diffstat (limited to 'src/utf8/mod.rs')
-rw-r--r--src/utf8/mod.rs91
1 files changed, 91 insertions, 0 deletions
diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs
new file mode 100644
index 0000000..3d099b1
--- /dev/null
+++ b/src/utf8/mod.rs
@@ -0,0 +1,91 @@
+//! A table-driven UTF-8 Parser
+//!
+//! This module implements a table-driven UTF-8 parser which should
+//! theoretically contain the minimal number of branches (1). The only branch is
+//! on the `Action` returned from unpacking a transition.
+use std::char;
+
+mod types;
+use self::types::{State, Action, unpack};
+
+mod table;
+use self::table::TRANSITIONS;
+
+/// Callback sink for `Parser`: receives decoded code points and invalid-sequence events.
+pub trait Receiver {
+ /// A complete code point was parsed.
+ ///
+ /// Called by the parser with the decoded `char`.
+ fn codepoint(&mut self, char);
+
+ /// An invalid byte sequence was encountered; no further detail is passed.
+ fn invalid_sequence(&mut self);
+}
+
+/// An incremental, byte-at-a-time parser for UTF-8 characters.
+///
+/// Repeatedly call `advance` with bytes; results are reported through a `Receiver`.
+pub struct Parser {
+ point: u32, // code point bits accumulated so far; zeroed after a multi-byte emit or an invalid sequence
+ state: State, // current state; cast to `usize` to select a row of `TRANSITIONS`
+}
+
+/// Mask keeping the low six bits of a byte; applied to continuation bytes before merging them.
+const CONTINUATION_MASK: u8 = 0b0011_1111;
+
+impl Parser {
+ /// Create a new `Parser` in the `Ground` state with an empty accumulator.
+ pub fn new() -> Parser {
+ Parser {
+ point: 0,
+ state: State::Ground,
+ }
+ }
+ /// Advance the parser by one `byte`, reporting any resulting event to `receiver`.
+ pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
+ where R: Receiver
+ {
+ let cur = self.state as usize; // row index: current state
+ let change = TRANSITIONS[cur][byte as usize]; // column index: input byte
+ let (state, action) = unsafe { unpack(change) }; // NOTE(review): presumably sound only if every table entry packs a valid State/Action - confirm in types.rs/table.rs
+
+ self.perform_action(receiver, byte, action); // the action runs before the new state is stored
+ self.state = state;
+ }
+ /// Apply `action` for `byte`: merge bits into `self.point` and/or notify `receiver`.
+ fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
+ where R: Receiver
+ {
+ match action { // the single branch mentioned in the module docs
+ Action::InvalidSequence => { // drop any partially assembled code point, then notify
+ self.point = 0;
+ receiver.invalid_sequence();
+ },
+ Action::EmitByte => { // byte is emitted as-is; presumably only reached for ASCII - confirm in table
+ receiver.codepoint(byte as char);
+ },
+ Action::SetByte1 => { // merge the low six bits into bits 0..=5, emit, reset accumulator
+ let point = self.point | ((byte & CONTINUATION_MASK) as u32);
+ let c = unsafe { char::from_u32_unchecked(point) }; // NOTE(review): relies on the table only reaching this action with a valid scalar value - confirm
+ self.point = 0;
+
+ receiver.codepoint(c);
+ },
+ Action::SetByte2 => { // low six bits -> bits 6..=11
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
+ },
+ Action::SetByte2Top => { // low five bits -> bits 6..=10 (presumably a 2-byte lead byte)
+ self.point |= ((byte & 0b0001_1111) as u32) << 6;
+ },
+ Action::SetByte3 => { // low six bits -> bits 12..=17
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
+ },
+ Action::SetByte3Top => { // low four bits -> bits 12..=15 (presumably a 3-byte lead byte)
+ self.point |= ((byte & 0b0000_1111) as u32) << 12;
+ },
+ Action::SetByte4 => { // low three bits -> bits 18..=20 (presumably a 4-byte lead byte)
+ self.point |= ((byte & 0b0000_0111) as u32) << 18;
+ },
+ }
+ }
+}