{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE OverloadedStrings #-} module Language.Fiddle.Tokenizer where import Data.Char (isDigit) import Data.Text (Text) import qualified Data.Text import Language.Fiddle.Types import Text.Parsec import qualified Text.Parsec data T = KWAssertPos | TokIdent !Text | KWAt | KWBits | KWEnum | TokComment !Text | TokDocComment !Text | KWLocation | KWObject | KWObjtype | KWOption | KWPackage | KWReg | KWReserved | KWRo | KWWo | KWRw | TokLitNum !Text | TokColon | TokComma | TokEq | TokLBrace | TokLBracket | TokLParen | TokRBrace | TokRBracket | TokRParen | TokSemi deriving (Eq, Ord, Show, Read) data Token a = Token !T a deriving (Eq, Ord, Show, Functor) parseToken :: (Monad m) => ParsecT Text u m (Token SourceSpan) parseToken = spaces *> tok parseToken' <* spaces where tok tp = do p1 <- getPosition t <- tp Token t . SourceSpan p1 <$> getPosition parseAlNumTok :: Text -> T parseAlNumTok str = case str of "at" -> KWAt "bits" -> KWBits "enum" -> KWEnum "location" -> KWLocation "object" -> KWObject "objtype" -> KWObjtype "option" -> KWOption "package" -> KWPackage "reg" -> KWReg "ro" -> KWRo "wo" -> KWWo "rw" -> KWRw "reserved" -> KWReserved "assert_pos" -> KWAssertPos (Data.Text.head -> h) | isDigit h -> TokLitNum str ident -> TokIdent ident parseComment = try ( do string "//" TokComment . Data.Text.pack <$> manyTill anyChar (char '\n') ) <|> try ( do string "/**" TokDocComment . Data.Text.pack <$> manyTill anyChar (try $ string "*/") ) parseSymbol = choice [ char ':' $> TokColon, char ',' $> TokComma, char '=' $> TokEq, char '{' $> TokLBrace, char '[' $> TokLBracket, char '(' $> TokLParen, char '}' $> TokRBrace, char ']' $> TokRBracket, char ')' $> TokRParen, char ';' $> TokSemi ] where a $> b = a >> return b parseToken' = fmap (parseAlNumTok . Data.Text.pack) (many1 (alphaNum <|> char '_')) <|> parseComment <|> parseSymbol tokenize :: String -> Text -> Either ParseError [Token SourceSpan] tokenize = Text.Parsec.runParser (many parseToken <* eof) () tokenType :: Token SourceSpan -> T tokenType (Token t _) = t