summaryrefslogtreecommitdiff
path: root/src/Language/Fiddle/Tokenizer.hs
diff options
context:
space:
mode:
author Josh Rahm <joshuarahm@gmail.com> 2022-12-17 01:28:06 -0700
committer Josh Rahm <joshuarahm@gmail.com> 2022-12-17 01:28:06 -0700
commit 47c776413ed4e11839ad6838575d0077ddd496a3 (patch)
tree dafe4a828cf103986062e7f00609109aa17380e6 /src/Language/Fiddle/Tokenizer.hs
parent bf66c00aa9ee8a7f8058e396db167324076331b2 (diff)
downloadfiddle-47c776413ed4e11839ad6838575d0077ddd496a3.tar.gz
fiddle-47c776413ed4e11839ad6838575d0077ddd496a3.tar.bz2
fiddle-47c776413ed4e11839ad6838575d0077ddd496a3.zip
fiddle: have a basic tokenizer working.
Diffstat (limited to 'src/Language/Fiddle/Tokenizer.hs')
-rw-r--r-- src/Language/Fiddle/Tokenizer.hs 128
1 files changed, 96 insertions, 32 deletions
diff --git a/src/Language/Fiddle/Tokenizer.hs b/src/Language/Fiddle/Tokenizer.hs
index 9931523..d3239fd 100644
--- a/src/Language/Fiddle/Tokenizer.hs
+++ b/src/Language/Fiddle/Tokenizer.hs
@@ -1,37 +1,101 @@
+{-# LANGUAGE DeriveFunctor #-}
module Language.Fiddle.Tokenizer where
+import Data.Char (isDigit)
+import Data.Text (Text)
import Language.Fiddle.Types
+import Text.Parsec
+import qualified Text.Parsec
data T
- = KW_assert_pos
- | KW_at
- | KW_bittype
- | KW_enum
- | KW_location
- | KW_object
- | KW_objtype
- | KW_option
- | KW_package
- | KW_reg
- | KW_ro
- | KW_wo
- | Tok_colon
- | Tok_comma
- | Tok_comment
- | Tok_docComment
- | Tok_eq
- | Tok_ident String
- | Tok_lbrace
- | Tok_lbracket
- | Tok_litnum String
- | Tok_lparen
- | Tok_package
- | Tok_rbrace
- | Tok_rbracket
- | Tok_rparen
- | Tok_semi
-
-data Token = Token T SourceSpan
-
-tokenize :: String -> Text -> [Token]
-tokenize srcName txt = undefined
+ = KWAssertPos -- ^ Keyword token (replaces the former @KW_assert_pos@).
+ | Ident !String -- ^ Identifier; carries the identifier text.
+ | KWAt -- ^ Keyword @at@.
+ | KWBittype -- ^ Keyword @bittype@.
+ | KWEnum -- ^ Keyword @enum@.
+ | CommentTok !String -- ^ Line comment (@//@); carries the text after the marker.
+ | DocCommentTok !String -- ^ Doc comment (@/** ... */@); carries the text between the delimiters.
+ | KWLocation -- ^ Keyword @location@.
+ | KWObject -- ^ Keyword @object@.
+ | KWObjtype -- ^ Keyword @objtype@.
+ | KWOption -- ^ Keyword @option@.
+ | KWPackage -- ^ Keyword @package@.
+ | KWReg -- ^ Keyword @reg@.
+ | KWRo -- ^ Keyword @ro@.
+ | KWWo -- ^ Keyword @wo@.
+ | LitNum !String -- ^ Numeric literal; the raw text of a word that starts with a digit.
+ | TokColon -- ^ Symbol @:@.
+ | TokComma -- ^ Symbol @,@.
+ | TokEq -- ^ Symbol @=@.
+ | TokLBrace -- ^ Symbol @{@.
+ | TokLBracket -- ^ Symbol @[@.
+ | TokLParen -- ^ Symbol @(@.
+ | TokRBrace -- ^ Symbol @}@.
+ | TokRBracket -- ^ Symbol @]@.
+ | TokRParen -- ^ Symbol @)@.
+ | TokSemi -- ^ Symbol @;@.
+ deriving (Eq, Ord, Show, Read) -- the derived Ord follows the constructor order above
+
+data Token a = Token !T a -- ^ A token kind paired with an annotation of type @a@ (a 'SourceSpan' when produced by 'parseToken').
+ deriving (Eq, Ord, Show, Functor) -- Functor (via DeriveFunctor) maps over the annotation and leaves the token kind alone
+
+-- | Parse a single token together with the source span it covers.
+--
+-- Whitespace before and after the token is skipped; the recorded span starts
+-- after the leading whitespace and ends right after the token itself.
+parseToken :: (Monad m) => ParsecT Text u m (Token SourceSpan)
+parseToken = spaces *> tok parseToken' <* spaces
+  where
+    -- Run a token parser and pair its result with the positions taken
+    -- immediately before and after it.
+    tok tp = do
+      p1 <- getPosition
+      t <- tp
+      Token t . SourceSpan p1 <$> getPosition
+
+    -- Classify a maximal run of alphanumerics/underscores: reserved words
+    -- become keyword tokens, a word starting with a digit is a numeric
+    -- literal (kept as raw text), and anything else is an identifier.
+    parseAlNumTok str =
+      case str of
+        -- Previously unmapped, so KWAssertPos could never be produced; the
+        -- spelling follows the constructor's former name, KW_assert_pos.
+        "assert_pos" -> KWAssertPos
+        "at" -> KWAt
+        "bittype" -> KWBittype
+        "enum" -> KWEnum
+        "location" -> KWLocation
+        "object" -> KWObject
+        "objtype" -> KWObjtype
+        "option" -> KWOption
+        "package" -> KWPackage
+        "reg" -> KWReg
+        "ro" -> KWRo
+        "wo" -> KWWo
+        (h : _) | isDigit h -> LitNum str
+        ident -> Ident ident
+
+    -- A line comment ends at the next newline or at end of input, so a final
+    -- line without a trailing newline still tokenizes.
+    lineEnd = (() <$ char '\n') <|> eof
+
+    -- Only the opening delimiter is wrapped in 'try'. Once it has matched we
+    -- are committed to a comment, so an unterminated doc comment is reported
+    -- where the input runs out rather than as an unexpected '/'.
+    parseComment =
+      (try (string "//") *> (CommentTok <$> manyTill anyChar lineEnd))
+        <|> ( try (string "/**")
+                *> (DocCommentTok <$> manyTill anyChar (try (string "*/")))
+            )
+
+    -- Single-character punctuation tokens.
+    parseSymbol =
+      choice
+        [ TokColon <$ char ':',
+          TokComma <$ char ',',
+          TokEq <$ char '=',
+          TokLBrace <$ char '{',
+          TokLBracket <$ char '[',
+          TokLParen <$ char '(',
+          TokRBrace <$ char '}',
+          TokRBracket <$ char ']',
+          TokRParen <$ char ')',
+          TokSemi <$ char ';'
+        ]
+
+    -- Words (keywords, identifiers, numbers) first, then comments, then
+    -- punctuation. The word parser fails without consuming input on
+    -- anything else, so the alternatives after it still get a chance.
+    parseToken' =
+      fmap parseAlNumTok (many1 (alphaNum <|> char '_'))
+        <|> parseComment
+        <|> parseSymbol
+
+-- | Tokenize a complete source text.
+--
+-- The first argument is the source name Parsec uses in positions and error
+-- messages; the second is the input. The whole input must be consumed,
+-- otherwise a 'ParseError' is returned.
+--
+-- Leading whitespace is skipped once up front. Without that, an input made
+-- only of whitespace fails: 'parseToken' consumes the whitespace before
+-- discovering there is no token, and 'many' treats a failure after consumed
+-- input as an error rather than as the end of the token list.
+tokenize :: String -> Text -> Either ParseError [Token SourceSpan]
+tokenize = Text.Parsec.runParser (spaces *> many parseToken <* eof) ()