1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
//! A table-driven UTF-8 Parser
//!
//! This module implements a table-driven UTF-8 parser which should
//! theoretically contain the minimal number of branches (1). The only branch is
//! on the `Action` returned from unpacking a transition.
#![no_std]
use core::char;
mod table;
mod types;
use table::TRANSITIONS;
use types::{unpack, Action, State};
/// Sink for the events produced by [`Parser::advance`].
///
/// Implementors are told about every successfully decoded character and about
/// every invalid byte sequence the parser encounters.
pub trait Receiver {
/// Called whenever a codepoint is parsed successfully.
///
/// The argument is the decoded character.
fn codepoint(&mut self, _: char);
/// Called when an invalid sequence is detected.
///
/// No information about the offending bytes is passed along.
fn invalid_sequence(&mut self);
}
/// An incremental parser for UTF-8 encoded characters.
///
/// Repeatedly call `advance` with one byte at a time; decoded characters and
/// invalid sequences are reported through the `Receiver` passed to `advance`.
#[derive(Default)]
pub struct Parser {
// Bits of the code point assembled so far; reset to 0 once a character is
// emitted or an invalid sequence is reported.
point: u32,
// Current state of the state machine; selects the row of the transition table.
state: State,
}
/// Continuation bytes are masked with this value.
///
/// A UTF-8 continuation byte has the form `0b10xx_xxxx`; the mask keeps only
/// the low six payload bits (`x`) and discards the two marker bits.
const CONTINUATION_MASK: u8 = 0b0011_1111;
impl Parser {
    /// Create a new `Parser`.
    ///
    /// The parser starts in `State::Ground` with no code point bits collected.
    pub fn new() -> Parser {
        Parser { state: State::Ground, point: 0 }
    }

    /// Advance the parser by a single byte.
    ///
    /// The transition for the current state and `byte` is looked up in the
    /// transition table, its action is performed, and the resulting state is
    /// stored. The provided `receiver` is called at most once per byte: when a
    /// codepoint is completed or when an invalid sequence is detected.
    pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
    where
        R: Receiver,
    {
        let packed = TRANSITIONS[self.state as usize][usize::from(byte)];

        // NOTE(review): `unpack` is unsafe; presumably every table entry encodes a
        // valid (State, Action) pair, which would make this sound. Confirm against
        // the `types` and `table` modules.
        let (next_state, action) = unsafe { unpack(packed) };

        // The action runs before the state is replaced.
        self.perform_action(receiver, byte, action);
        self.state = next_state;
    }

    /// Apply `action` for `byte`.
    ///
    /// Depending on the action this either merges payload bits of `byte` into
    /// the pending code point, emits a finished character, or reports an invalid
    /// sequence to `receiver`.
    fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
    where
        R: Receiver,
    {
        // Bits of `byte` selected by `mask`, moved up to bit offset `shift`.
        let bits = |mask: u8, shift: u32| u32::from(byte & mask) << shift;

        match action {
            // Drop whatever was collected so far and notify the receiver.
            Action::InvalidSequence => {
                self.point = 0;
                receiver.invalid_sequence();
            },
            // The byte is passed through unchanged as a character.
            Action::EmitByte => receiver.codepoint(byte as char),
            // Final byte: merge the low six bits, emit, and start over.
            Action::SetByte1 => {
                let scalar = self.point | bits(CONTINUATION_MASK, 0);
                self.point = 0;

                // NOTE(review): soundness relies on the transition table only
                // reaching this action when `scalar` is a valid Unicode scalar
                // value; the table is not visible here, so confirm.
                let c = unsafe { char::from_u32_unchecked(scalar) };
                receiver.codepoint(c);
            },
            // Six payload bits into bits 6..=11.
            Action::SetByte2 => self.point |= bits(CONTINUATION_MASK, 6),
            // Five payload bits into bits 6..=10.
            Action::SetByte2Top => self.point |= bits(0b0001_1111, 6),
            // Six payload bits into bits 12..=17.
            Action::SetByte3 => self.point |= bits(CONTINUATION_MASK, 12),
            // Four payload bits into bits 12..=15.
            Action::SetByte3Top => self.point |= bits(0b0000_1111, 12),
            // Three payload bits into bits 18..=20.
            Action::SetByte4 => self.point |= bits(0b0000_0111, 18),
        }
    }
}
|