{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}

module Language.Fiddle.Tokenizer where

import Data.Char (isDigit)
import Data.Text (Text)
import qualified Data.Text
import Language.Fiddle.Types
import Text.Parsec

-- | Raw token kinds produced by the tokenizer.
data T
  = KWAssertPos
  | KWAt
  | KWBits
  | KWEnum
  | KWInstance
  | KWLocation
  | KWOption
  | KWPackage
  | KWReg
  | KWReserved
  | KWRo
  | KWRw
  | KWStruct
  | KWType
  | KWUnion
  | KWWo
  | KWImport
  | KWUsing
  | TokColon
  | TokComma
  | TokDot
  | TokComment !Text
  | TokDocComment !Text
  | TokEq
  | TokIdent !Text
  | TokLBrace
  | TokLBracket
  | TokLParen
  | TokLitNum !Text
  | TokRBrace
  | TokRBracket
  | TokRParen
  | TokSemi
  | TokString !Text
  | TokDirectiveStart -- [[
  | TokDirectiveEnd -- ]]
  deriving (Eq, Ord, Show, Read)

-- | The source text of a keyword, identifier or number literal, if any.
textOf :: T -> Maybe Text
textOf t =
  case t of
    KWAssertPos -> Just "assert_pos"
    KWAt -> Just "at"
    KWBits -> Just "bits"
    KWEnum -> Just "enum"
    KWInstance -> Just "instance"
    KWLocation -> Just "location"
    KWOption -> Just "option"
    KWPackage -> Just "package"
    KWReg -> Just "reg"
    KWReserved -> Just "reserved"
    KWRo -> Just "ro"
    KWRw -> Just "rw"
    KWStruct -> Just "struct"
    KWType -> Just "type"
    KWUnion -> Just "union"
    KWWo -> Just "wo"
    KWImport -> Just "import"
    KWUsing -> Just "using"
    TokIdent i -> Just i
    TokLitNum n -> Just n
    _ -> Nothing

-- | A token kind annotated with position information (typically 'SourceSpan').
data Token a = Token !T a
  deriving (Eq, Ord, Show, Functor)

-- | Parse a single token, skipping surrounding whitespace and recording the
-- source span it covers.
parseToken :: (Monad m) => ParsecT Text u m (Token SourceSpan)
parseToken = spaces *> tok parseToken' <* spaces
  where
    tok tp = do
      p1 <- getPosition
      t <- tp
      Token t . SourceSpan p1 <$> getPosition

-- | Classify an alphanumeric run as a keyword, a number literal (if it starts
-- with a digit), or an identifier.
parseAlNumTok :: Text -> T
parseAlNumTok str =
  case str of
    "at" -> KWAt
    "bits" -> KWBits
    "enum" -> KWEnum
    "location" -> KWLocation
    "instance" -> KWInstance
    "type" -> KWType
    "option" -> KWOption
    "package" -> KWPackage
    "import" -> KWImport
    "reg" -> KWReg
    "ro" -> KWRo
    "wo" -> KWWo
    "rw" -> KWRw
    "reserved" -> KWReserved
    "union" -> KWUnion
    "using" -> KWUsing
    "struct" -> KWStruct
    "assert_pos" -> KWAssertPos
    (Data.Text.head -> h) | isDigit h -> TokLitNum str
    ident -> TokIdent ident

-- | Parse a double-quoted string literal. Backslash escapes are kept verbatim.
parseString :: (Monad m) => ParsecT Text u m T
parseString = fmap (TokString . Data.Text.pack . concat) $ do
  _ <- char '"'
  manyTill
    ( do
        c <- anyChar
        if c == '\\'
          then do
            c2 <- anyChar
            return [c, c2]
          else return [c]
    )
    (char '"')

-- | Parse a line comment (@//@) or a doc comment (@/** ... */@).
parseComment :: (Monad m) => ParsecT Text u m T
parseComment =
  try
    ( do
        _ <- string "//"
        TokComment . Data.Text.pack <$> manyTill anyChar (char '\n')
    )
    <|> try
      ( do
          _ <- string "/**"
          TokDocComment . Data.Text.pack <$> manyTill anyChar (try $ string "*/")
      )

-- | Parse punctuation and the directive brackets @[[@ / @]]@.
parseSymbol :: (Monad m) => ParsecT Text u m T
parseSymbol =
  choice
    [ try (string "[[" $> TokDirectiveStart),
      try (string "]]" $> TokDirectiveEnd),
      char ':' $> TokColon,
      char ',' $> TokComma,
      char '.' $> TokDot,
      char '=' $> TokEq,
      char '{' $> TokLBrace,
      char '[' $> TokLBracket,
      char '(' $> TokLParen,
      char '}' $> TokRBrace,
      char ']' $> TokRBracket,
      char ')' $> TokRParen,
      char ';' $> TokSemi
    ]
  where
    a $> b = a >> return b

parseToken' :: (Monad m) => ParsecT Text u m T
parseToken' =
  fmap (parseAlNumTok . Data.Text.pack) (many1 (alphaNum <|> char '_'))
    <|> parseString
    <|> parseComment
    <|> parseSymbol

-- | Tokenize a full input. The 'String' argument names the source for error
-- messages.
tokenize :: String -> Text -> Either ParseError [Token SourceSpan]
tokenize = Text.Parsec.runParser (many parseToken <* eof) ()

tokenType :: Token SourceSpan -> T
tokenType (Token t _) = t
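
-- A minimal usage sketch: strip plain line comments from the token stream
-- before handing it to a parser. The name 'tokenizeNoComments' is a
-- hypothetical helper added for illustration, not an established part of this
-- module's API; doc comments are kept, on the assumption that a later stage
-- may attach them to declarations.
tokenizeNoComments :: String -> Text -> Either ParseError [Token SourceSpan]
tokenizeNoComments sourceName contents =
  filter (not . isPlainComment . tokenType) <$> tokenize sourceName contents
  where
    isPlainComment (TokComment _) = True
    isPlainComment _ = False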