| 156 | | -------------------------------------------------------------------------------- |
| 157 | | |
| 158 | | newtype Token = Token (SourcePos, Symbol) |
| 159 | | |
| 160 | | instance Show Token where |
| 161 | | show (Token (pos, symbol)) = show symbol |
| 162 | | |
| 163 | | data Symbol |
| 164 | | = Word String |
| 165 | | | SingleQuoted String |
| 166 | | | DotString String |
| 167 | | | QuestionString String |
| 168 | | | Num Int |
| 169 | | | Equals |
| 170 | | | LeftRoundBracket |
| 171 | | | RightRoundBracket |
| 172 | | | LeftSquareBracket |
| 173 | | | RightSquareBracket |
| 174 | | | Comma |
| 175 | | | Colon |
| 176 | | | SemiColon |
| 177 | | | Newline |
| 178 | | | BackSlash |
| 179 | | | Exclamation |
| 180 | | | Dash |
| 181 | | | GreaterThan |
| 182 | | | Space |
| 183 | | | Bad String |
| 184 | | deriving (Eq) |
| 185 | | |
| 186 | | -- slighlty pretty printing of Symbols via Show. The |
| 187 | | -- prettiness is due to the fact that the Parsec parser |
| 188 | | -- uses show to print Symbols as part of error messages |
| 189 | | instance Show Symbol where |
| 190 | | show (Word s) = "word: " ++ s |
| 191 | | show (SingleQuoted s) = "single quoted string: " ++ s |
| 192 | | show (DotString s) = ".-string: " ++ s |
| 193 | | show (QuestionString s) = "?-string: " ++ s |
| 194 | | show (Num n) = "number: " ++ show n |
| 195 | | show Equals = "equals sign: '='" |
| 196 | | show LeftRoundBracket = "bracket: '('" |
| 197 | | show RightRoundBracket = "bracket: ')'" |
| 198 | | show LeftSquareBracket = "bracket: '['" |
| 199 | | show RightSquareBracket = "bracket: ']'" |
| 200 | | show Comma = "comma: ','" |
| 201 | | show Colon = "colon: ':'" |
| 202 | | show SemiColon = "semi-colon: ';'" |
| 203 | | show Newline = "newline: '\\n'" |
| 204 | | show BackSlash = "backslash: '\'" |
| 205 | | show Exclamation = "exclamation sign: '!'" |
| 206 | | show Dash = "dash '-'" |
| 207 | | show GreaterThan = "greater than sign: '>'" |
| 208 | | show Space = "space" |
| 209 | | show (Bad str) = str |
| 210 | | |
| 211 | | -- turn a stream of characters into a stream of tokens |
| 212 | | lexer :: String -> String -> [Token] |
| 213 | | lexer filename input |
| 214 | | = lexWork (newPos filename 1 1) input |
| 215 | | |
| 216 | | lexWork :: SourcePos -> String -> [Token] |
| 217 | | lexWork pos [] = [] |
| 218 | | lexWork pos (x:xs) |
| 219 | | | x == '=' = simpleToken Equals nextCol |
| 220 | | | x == '(' = simpleToken LeftRoundBracket nextCol |
| 221 | | | x == ')' = simpleToken RightRoundBracket nextCol |
| 222 | | | x == '[' = simpleToken LeftSquareBracket nextCol |
| 223 | | | x == ']' = simpleToken RightSquareBracket nextCol |
| 224 | | | x == ',' = simpleToken Comma nextCol |
| 225 | | | x == ':' = simpleToken Colon nextCol |
| 226 | | | x == ';' = simpleToken SemiColon nextCol |
| 227 | | | x == '\\' = simpleToken BackSlash nextCol |
| 228 | | | x == '!' = simpleToken Exclamation nextCol |
| 229 | | | x == '-' = simpleToken Dash nextCol |
| 230 | | | x == '>' = simpleToken GreaterThan nextCol |
| 231 | | | x == '\n' = simpleToken Newline nextLine |
| 232 | | -- source location does not need to be accurate within a comment |
| 233 | | | x == '#' = lexWork pos (dropWhile (/= '\n') xs) |
| 234 | | | x == '.' = Token (pos, DotString xs) : lexWork pos (dropWhile (/= '\n') xs) |
| 235 | | | x == '?' = Token (pos, QuestionString xs) : lexWork pos (dropWhile (/= '\n') xs) |
| 236 | | | isWhiteSpace x = simpleToken Space nextCol |
| 237 | | | isDigit x = let (num, rest) = span isDigit (x:xs) |
| 238 | | nextPos = incSourceColumn pos (length num) |
| 239 | | in Token (pos, Num $ read num) : lexWork nextPos rest |
| 240 | | | isAlpha x |
| 241 | | = let (restWord, rest) = span isWordChar xs |
| 242 | | word = x:restWord |
| 243 | | nextPos = incSourceColumn pos (length word) |
| 244 | | in Token (pos, Word word) : lexWork nextPos rest |
| 245 | | -- quoted strings need special care - escaped quotes can appear |
| 246 | | -- within the string, and the string might not be terminated |
| 247 | | -- by a quote in the case of a lexical error |
| 248 | | | x == '\'' |
| 249 | | = let (thisString, rest) = lexTailQuotedString xs |
| 250 | | nextPos = incSourceColumn pos (length thisString + 2) |
| 251 | | in if null rest |
| 252 | | then [Token (pos, Bad $ "ill-quoted input: " ++ x:thisString)] |
| 253 | | else Token (pos, SingleQuoted thisString) : lexWork nextPos (tail rest) |
| 254 | | | otherwise = simpleToken (Bad $ "symbol: " ++ show x) nextCol |
| 255 | | where |
| 256 | | simpleToken :: Symbol -> (SourcePos -> SourcePos) -> [Token] |
| 257 | | simpleToken tok srcPosUpdate |
| 258 | | = Token (pos, tok) : lexWork (srcPosUpdate pos) xs |
| 259 | | isWhiteSpace :: Char -> Bool |
| 260 | | isWhiteSpace c = c `elem` " \t\r\f\v\xA0" |
| 261 | | isWordChar :: Char -> Bool |
| 262 | | isWordChar c = isAlpha c || isDigit c |
| 263 | | nextLine :: SourcePos -> SourcePos |
| 264 | | nextLine pos = incSourceLine (setSourceColumn pos 1) 1 |
| 265 | | nextCol :: SourcePos -> SourcePos |
| 266 | | nextCol pos = incSourceColumn pos 1 |
| 267 | | |
| 268 | | -- lex the rest of a string following the first quote mark |
| 269 | | -- must skip escaped quotes, and escaped backslashes |
| 270 | | -- include the final quote mark in the result |
| 271 | | lexTailQuotedString :: String -> (String, String) |
| 272 | | lexTailQuotedString [] = ([], []) |
| 273 | | lexTailQuotedString str@('\'':xs) = ([], str) |
| 274 | | -- a backslash then a quote is escaped |
| 275 | | lexTailQuotedString ('\\':'\'':xs) |
| 276 | | = let (string, rest) = lexTailQuotedString xs in ('\'':string, rest) |
| 277 | | -- a backslash then a backslash is an escaped backslash |
| 278 | | lexTailQuotedString ('\\':'\\':xs) |
| 279 | | = let (string, rest) = lexTailQuotedString xs in ('\\':'\\':string, rest) |
| 280 | | lexTailQuotedString (x:xs) |
| 281 | | = let (string, rest) = lexTailQuotedString xs in (x:string, rest) |