| 1 | {-# OPTIONS_GHC -fglasgow-exts #-} |
|---|
| 2 | module Pugs.Parser.Program ( |
|---|
| 3 | -- Before you would import Pugs.Parser, now you import Pugs.Parser.Program. |
|---|
| 4 | parseProgram, |
|---|
| 5 | ) where |
|---|
| 6 | import Pugs.Internals |
|---|
| 7 | import Pugs.AST |
|---|
| 8 | import Pugs.Types (isSigilChar) |
|---|
| 9 | |
|---|
| 10 | import Pugs.Parser |
|---|
| 11 | import Pugs.Rule |
|---|
| 12 | import Text.ParserCombinators.Parsec.Error (showErrorMessages, errorMessages) |
|---|
| 13 | import qualified Data.Map as Map |
|---|
| 14 | |
|---|
| 15 | parseProgram :: Env -> FilePath -> String -> Env |
|---|
| 16 | parseProgram env path str = runRule env ruleProgram path progWithEOL |
|---|
| 17 | where |
|---|
| 18 | prog = decodeProgram str |
|---|
| 19 | progWithEOL |
|---|
| 20 | | [] <- prog = "\n" |
|---|
| 21 | | last prog == '\n' = prog |
|---|
| 22 | | otherwise = prog ++ "\n" |
|---|
| 23 | |
|---|
| 24 | -- Based on: <http://hackage.haskell.org/trac/haskell-prime/wiki/SourceEncodingDetection> |
|---|
| 25 | data EncodedSource |
|---|
| 26 | = UTF8 !String |
|---|
| 27 | | UTF16 !Endian !String |
|---|
| 28 | | UTF32 !Endian !String |
|---|
| 29 | -- ... | UserDefined ... |
|---|
| 30 | |
|---|
| 31 | data Endian = LittleEndian | BigEndian |
|---|
| 32 | |
|---|
| 33 | decodeProgram :: String -> String |
|---|
| 34 | decodeProgram str = case detectSourceEncoding str of |
|---|
| 35 | UTF8 xs -> decodeUTF8 (removeCRLF xs) |
|---|
| 36 | UTF16 LittleEndian xs -> removeCRLF (decodeUTF16LE xs) |
|---|
| 37 | UTF16 BigEndian xs -> removeCRLF (decodeUTF16BE xs) |
|---|
| 38 | UTF32 LittleEndian xs -> removeCRLF (decodeUTF32LE xs) |
|---|
| 39 | UTF32 BigEndian xs -> removeCRLF (decodeUTF32BE xs) |
|---|
| 40 | where |
|---|
| 41 | removeCRLF ('\r':'\n':xs) = '\n':removeCRLF xs |
|---|
| 42 | removeCRLF (x:xs) = x:removeCRLF xs |
|---|
| 43 | removeCRLF [] = [] |
|---|
| 44 | decodeUTF16BE (a:b:c:d:xs) |
|---|
| 45 | | a >= '\xD8', a <= '\xDB' -- High surrogate |
|---|
| 46 | , c >= '\xDC', c <= '\xDF' -- Low surrogate |
|---|
| 47 | = let rest = decodeUTF16BE xs |
|---|
| 48 | hi = (ord a - 0xD8) * 0x100 + ord b |
|---|
| 49 | lo = (ord c - 0xDC) * 0x100 + ord d |
|---|
| 50 | in seq rest (chr (0x10000 + hi * 0x400 + lo) : rest) |
|---|
| 51 | decodeUTF16BE (a:b:xs) = let rest = decodeUTF16BE xs |
|---|
| 52 | in seq rest (chr (ord a * 0x100 + ord b) : rest) |
|---|
| 53 | decodeUTF16BE _ = [] |
|---|
| 54 | decodeUTF16LE (a:b:c:d:xs) |
|---|
| 55 | | b >= '\xD8', b <= '\xDB' -- High surrogate |
|---|
| 56 | , d >= '\xDC', d <= '\xDF' -- Low surrogate |
|---|
| 57 | = let rest = decodeUTF16LE xs |
|---|
| 58 | hi = (ord b - 0xD8) * 0x100 + ord a |
|---|
| 59 | lo = (ord d - 0xDC) * 0x100 + ord c |
|---|
| 60 | in seq rest (chr (0x10000 + hi * 0x400 + lo) : rest) |
|---|
| 61 | decodeUTF16LE (a:b:xs) = let rest = decodeUTF16LE xs |
|---|
| 62 | in seq rest (chr (ord b * 0x100 + ord a) : rest) |
|---|
| 63 | decodeUTF16LE _ = [] |
|---|
| 64 | decodeUTF32BE (a:b:c:d:xs) = let rest = decodeUTF32BE xs |
|---|
| 65 | in seq rest (chr (ord a * 0x1000000 + ord b * 0x10000 + ord c * 0x100 + ord d) : rest) |
|---|
| 66 | decodeUTF32BE _ = [] |
|---|
| 67 | decodeUTF32LE (a:b:c:d:xs) = let rest = decodeUTF32LE xs |
|---|
| 68 | in seq rest (chr (ord d * 0x1000000 + ord c * 0x10000 + ord b * 0x100 + ord a) : rest) |
|---|
| 69 | decodeUTF32LE _ = [] |
|---|
| 70 | |
|---|
| 71 | detectSourceEncoding :: String -> EncodedSource |
|---|
| 72 | detectSourceEncoding bytes = case bytes of |
|---|
| 73 | [] -> UTF8 [] |
|---|
| 74 | ['\x00'] -> invalidNulls |
|---|
| 75 | xs@[_] -> UTF8 xs |
|---|
| 76 | ['\xFF', '\xFE'] -> UTF16 LittleEndian [] |
|---|
| 77 | ('\xFE':'\xFF':xs) -> UTF16 BigEndian xs |
|---|
| 78 | ['\x00', '\x00'] -> invalidNulls |
|---|
| 79 | xs@['\x00', _] -> UTF16 BigEndian xs |
|---|
| 80 | xs@[_, '\x00'] -> UTF16 LittleEndian xs |
|---|
| 81 | xs@[_, _] -> UTF8 xs |
|---|
| 82 | ['\x00', '\x00', '\x00'] -> invalidNulls |
|---|
| 83 | xs@[_, _, _] -> UTF8 xs |
|---|
| 84 | ('\xEF':'\xBB':'\xBF':xs) -> UTF8 xs |
|---|
| 85 | ('\x00':'\x00':'\xFE':'\xFF':xs) -> UTF32 BigEndian xs |
|---|
| 86 | ('\xFF':'\xFE':'\x00':'\x00':xs) -> UTF32 LittleEndian xs |
|---|
| 87 | ('\xFF':'\xFE':xs) -> UTF16 LittleEndian xs |
|---|
| 88 | ('\x00':'\x00':'\x00':'\x00':_) -> invalidNulls |
|---|
| 89 | xs@('\x00':'\x00':'\x00':_) -> UTF32 BigEndian xs |
|---|
| 90 | xs@(_:'\x00':'\x00':'\x00':_) -> UTF32 LittleEndian xs |
|---|
| 91 | ('\x00':'\x00':_) -> invalidNulls |
|---|
| 92 | xs@('\x00':_) -> UTF16 BigEndian xs |
|---|
| 93 | xs@(_:'\x00':_) -> UTF16 LittleEndian xs |
|---|
| 94 | xs -> UTF8 xs |
|---|
| 95 | where |
|---|
| 96 | invalidNulls = error "(invalid nulls)" |
|---|
| 97 | |
|---|
| 98 | makeState :: Env -> RuleState |
|---|
| 99 | makeState env = MkState |
|---|
| 100 | { s_env = env |
|---|
| 101 | , s_parseProgram = parseProgram |
|---|
| 102 | , s_dynParsers = MkDynParsersEmpty |
|---|
| 103 | , s_bracketLevel = StatementBracket |
|---|
| 104 | -- , s_char = ' ' |
|---|
| 105 | -- , s_name = nullID |
|---|
| 106 | -- , s_pos = 0 |
|---|
| 107 | , s_wsLine = 0 |
|---|
| 108 | , s_wsColumn = 0 |
|---|
| 109 | , s_closureTraits = [id] |
|---|
| 110 | -- , s_freeVars = Set.empty |
|---|
| 111 | , s_knownVars = Map.map (const topMPad) (padEntries (envLexical env)) |
|---|
| 112 | , s_outerVars = Map.empty |
|---|
| 113 | , s_protoPad = emptyPad |
|---|
| 114 | } |
|---|
| 115 | |
|---|
| 116 | -- ^ A fake 'top' MPad for s_knownVars above to refer to things outside the eval scope. |
|---|
| 117 | {-# NOINLINE topMPad #-} |
|---|
| 118 | topMPad :: MPad |
|---|
| 119 | topMPad = unsafePerformIO $ do |
|---|
| 120 | tvar <- newTVarIO emptyPad |
|---|
| 121 | return $ MkMPad (addressOf tvar) tvar |
|---|
| 122 | |
|---|
| 123 | -- XXX - Pending clarification about those 3 -- are they routine-implicit or block-implicit? |
|---|
| 124 | {- |
|---|
| 125 | protoPad :: Pad |
|---|
| 126 | protoPad = mkPad |
|---|
| 127 | [ (cast "$_", PELexical |
|---|
| 128 | , (cast "$/", |
|---|
| 129 | , (cast "$!", |
|---|
| 130 | ] |
|---|
| 131 | -} |
|---|
| 132 | |
|---|
| 133 | runRule :: Env -> RuleParser Env -> FilePath -> String -> Env |
|---|
| 134 | runRule env p name str = |
|---|
| 135 | case ( runParser p (makeState env) name str ) of |
|---|
| 136 | Left err -> env { envBody = Val $ VError (VStr msg) [mkPos pos pos] } |
|---|
| 137 | where |
|---|
| 138 | msg = concat (intersperse "\n" (map filterUnexpected $ lines (showErr err))) |
|---|
| 139 | pos = errorPos err |
|---|
| 140 | cur = case takeSameClassWords (dropUntilPos pos str) of |
|---|
| 141 | "" -> "end of input" |
|---|
| 142 | xs -> show xs |
|---|
| 143 | filterUnexpected ('!':_) = "Unexpected " ++ cur |
|---|
| 144 | filterUnexpected line = line |
|---|
| 145 | Right env' -> env' |
|---|
| 146 | |
|---|
| 147 | takeSameClassWords :: String -> String |
|---|
| 148 | takeSameClassWords "" = "" |
|---|
| 149 | takeSameClassWords (x:xs) |
|---|
| 150 | | isSigilChar x = x : takeSameClassWords xs |
|---|
| 151 | | otherwise = case charClassOf x of |
|---|
| 152 | SpaceClass -> x : takeSameClassWords xs |
|---|
| 153 | cls -> x : takeWhile ((== cls) . charClassOf) xs |
|---|
| 154 | |
|---|
| 155 | dropUntilPos :: SourcePos -> String -> String |
|---|
| 156 | dropUntilPos pos str |
|---|
| 157 | | (curline:_) <- drop (ln - 1) (lines str) = drop (col - 1) curline |
|---|
| 158 | | otherwise = "" |
|---|
| 159 | where |
|---|
| 160 | col = sourceColumn pos |
|---|
| 161 | ln = sourceLine pos |
|---|
| 162 | |
|---|
| 163 | showErr :: ParseError -> String |
|---|
| 164 | showErr err = |
|---|
| 165 | showErrorMessages "or" "unknown parse error" |
|---|
| 166 | "expecting" "!" "end of input" |
|---|
| 167 | (errorMessages err) |
|---|
| 168 | |
|---|
| 169 | -- Lexical units -------------------------------------------------- |
|---|
| 170 | |
|---|
| 171 | ruleProgram :: RuleParser Env |
|---|
| 172 | ruleProgram = rule "program" $ do |
|---|
| 173 | env <- getRuleEnv |
|---|
| 174 | |
|---|
| 175 | topPad <- genParamEntries SubRoutine [defaultArrayParam] |
|---|
| 176 | modify $ \s -> s{ s_protoPad = topPad } |
|---|
| 177 | |
|---|
| 178 | block <- ruleBlockBody `finallyM` eof |
|---|
| 179 | main <- retVerbatimBlock SubPrim Nothing False $ |
|---|
| 180 | block{ bi_body = mergeStmts emptyExp $ bi_body block } |
|---|
| 181 | |
|---|
| 182 | -- We are still in the compile time. |
|---|
| 183 | modify $ \s -> s{ s_env = (s_env s){ envCompPad = Just (error "no comp pad") } } |
|---|
| 184 | |
|---|
| 185 | -- Force a reclose-pad evaluation here by way of unsafeEvalExp. |
|---|
| 186 | main'@(Val (VCode vc)) <- unsafeEvalExp $ Syn "" [unwrap main] |
|---|
| 187 | |
|---|
| 188 | -- S04: CHECK {...}* at compile time, ALAP |
|---|
| 189 | -- $_() for @*CHECK |
|---|
| 190 | rv <- unsafeEvalExp $ Syn "for" |
|---|
| 191 | [ _Var "@*CHECK" |
|---|
| 192 | , Syn "sub" |
|---|
| 193 | [ Val . VCode $ mkPrim |
|---|
| 194 | { subBody = App (_Var "$_") Nothing [] |
|---|
| 195 | , subParams = [defaultScalarParam] |
|---|
| 196 | , subInnerPad = defaultScalarPad |
|---|
| 197 | } |
|---|
| 198 | ] |
|---|
| 199 | ] |
|---|
| 200 | |
|---|
| 201 | -- If there was a exit() in a CHECK block, we have to exit. |
|---|
| 202 | possiblyExit rv |
|---|
| 203 | |
|---|
| 204 | env' <- getRuleEnv |
|---|
| 205 | return $ env' |
|---|
| 206 | { envBody = App (Syn "block" [main']) Nothing (replicate (length $ subParams vc) (_Var "$_")) -- _Var "@*ARGS"] |
|---|
| 207 | , envPackage = envPackage env |
|---|
| 208 | , envCompPad = Nothing |
|---|
| 209 | } |
|---|