{-# LANGUAGE DeriveFunctor #-}

-- Provenance: commit 47c776413ed4e11839ad6838575d0077ddd496a3
-- (Josh Rahm, 2022-12-17), "fiddle: have a basic tokenizer working."

-- | Lexer for Fiddle source text.
--
-- 'tokenize' turns a 'Text' into a list of 'Token's, each annotated with the
-- 'SourceSpan' it was read from. Comments are not discarded; they are emitted
-- as 'CommentTok' / 'DocCommentTok' tokens.
module Language.Fiddle.Tokenizer where

import Data.Char (isDigit)
import Data.Text (Text)
import Language.Fiddle.Types
import Text.Parsec
import qualified Text.Parsec

-- | The kind of a lexeme, without any position information.
--
-- The constructor order is significant for the derived 'Ord' instance and
-- must not be rearranged casually.
data T
  = -- | The word @assert_pos@.
    KWAssertPos
  | -- | Any alphanumeric/underscore word that is neither a keyword nor
    -- starts with a digit.
    Ident !String
  | KWAt
  | KWBittype
  | KWEnum
  | -- | Body of a @//@ comment, without the leading @//@ or the line end.
    CommentTok !String
  | -- | Body of a @/** ... */@ comment, without the delimiters.
    DocCommentTok !String
  | KWLocation
  | KWObject
  | KWObjtype
  | KWOption
  | KWPackage
  | KWReg
  | KWRo
  | KWWo
  | -- | Any alphanumeric/underscore word whose first character is a digit.
    -- The text is kept verbatim; it is not validated as a number here.
    LitNum !String
  | TokColon
  | TokComma
  | TokEq
  | TokLBrace
  | TokLBracket
  | TokLParen
  | TokRBrace
  | TokRBracket
  | TokRParen
  | TokSemi
  deriving (Eq, Ord, Show, Read)

-- | A lexeme kind paired with an annotation (a 'SourceSpan' when produced by
-- 'parseToken').
data Token a = Token !T a
  deriving (Eq, Ord, Show, Functor)

-- | Parse a single token, skipping any whitespace before and after it.
--
-- The attached 'SourceSpan' is built from the position just before the
-- lexeme and the position just after it, so surrounding whitespace is not
-- part of the span.
parseToken :: (Monad m) => ParsecT Text u m (Token SourceSpan)
parseToken = spaces *> located lexeme <* spaces
  where
    -- Run a lexeme parser and wrap its result with start/end positions.
    located p = do
      start <- getPosition
      kind <- p
      Token kind . SourceSpan start <$> getPosition

    -- Words are tried first; none of the alternatives consume input on a
    -- mismatch at their first character, so plain '<|>' is sufficient.
    lexeme =
      (classifyWord <$> many1 (alphaNum <|> char '_'))
        <|> comment
        <|> symbol

    -- Map a maximal run of alphanumerics/underscores to its token kind.
    classifyWord word =
      case word of
        "assert_pos" -> KWAssertPos
        "at" -> KWAt
        "bittype" -> KWBittype
        "enum" -> KWEnum
        "location" -> KWLocation
        "object" -> KWObject
        "objtype" -> KWObjtype
        "option" -> KWOption
        "package" -> KWPackage
        "reg" -> KWReg
        "ro" -> KWRo
        "wo" -> KWWo
        (c : _) | isDigit c -> LitNum word
        _ -> Ident word

    comment = lineComment <|> docComment

    -- A line comment runs to the next newline, or to end of input so that a
    -- final line without a trailing newline still lexes.
    lineComment =
      try (string "//") *> (CommentTok <$> manyTill anyChar lineEnd)

    lineEnd = (() <$ char '\n') <|> eof

    -- Only the opening delimiter is wrapped in 'try': once "/**" has been
    -- seen, an unterminated comment is reported where the input ends.
    docComment =
      try (string "/**")
        *> (DocCommentTok <$> manyTill anyChar (try (string "*/")))

    symbol =
      choice
        [ TokColon <$ char ':',
          TokComma <$ char ',',
          TokEq <$ char '=',
          TokLBrace <$ char '{',
          TokLBracket <$ char '[',
          TokLParen <$ char '(',
          TokRBrace <$ char '}',
          TokRBracket <$ char ']',
          TokRParen <$ char ')',
          TokSemi <$ char ';'
        ]

-- | Tokenize an entire input.
--
-- The first argument is the source name used in error messages and
-- positions; the second is the text to lex. Returns 'Left' with a
-- 'ParseError' if any part of the input cannot be lexed.
--
-- Leading whitespace is skipped up front rather than left to 'parseToken':
-- otherwise 'many' would see a parser that consumed whitespace and then
-- failed, and would reject input consisting solely of whitespace.
tokenize :: String -> Text -> Either ParseError [Token SourceSpan]
tokenize = Text.Parsec.runParser (spaces *> many parseToken <* eof) ()