never executed always true always false
    1 {-# LANGUAGE OverloadedStrings #-}
    2 module Cheapskate.Inlines (
    3         parseInlines
    4       , pHtmlTag
    5       , pReference
    6       , pLinkLabel)
    7 where
    8 import Cheapskate.ParserCombinators
    9 import Cheapskate.Util
   10 import Cheapskate.Types
   11 import Data.Char hiding (Space)
   12 import qualified Data.Sequence as Seq
   13 import Data.Sequence (singleton, (<|), viewl, ViewL(..))
   14 import Prelude hiding (takeWhile)
   15 import Control.Applicative
   16 import Control.Monad
   17 import Data.Text (Text)
   18 import qualified Data.Text as T
   19 import qualified Data.Set as Set
   20 
   21 -- Returns tag type and whole tag.
   22 pHtmlTag :: Parser (HtmlTagType, Text)
   23 pHtmlTag = do
   24   _ <- char '<'
   25   -- do not end the tag with a > character in a quoted attribute.
   26   closing <- (char '/' >> return True) <|> return False
   27   tagname <- takeWhile1 (\c -> isAsciiAlphaNum c || c == '?' || c == '!')
   28   let tagname' = T.toLower tagname
   29   let attr = do ss <- takeWhile isSpace
   30                 x <- satisfy isLetter
   31                 xs <- takeWhile (\c -> isAsciiAlphaNum c || c == ':')
   32                 skip (=='=')
   33                 v <- pQuoted '"' <|> pQuoted '\'' <|> takeWhile1 isAlphaNum
   34                       <|> return ""
   35                 return $ ss <> T.singleton x <> xs <> "=" <> v
   36   attrs <- T.concat <$> many attr
   37   final <- takeWhile (\c -> isSpace c || c == '/')
   38   _ <- char '>'
   39   let tagtype = if closing
   40                    then Closing tagname'
   41                    else case T.stripSuffix "/" final of
   42                          Just _  -> SelfClosing tagname'
   43                          Nothing -> Opening tagname'
   44   return (tagtype,
   45           T.pack ('<' : ['/' | closing]) <> tagname <> attrs <> final <> ">")
   46 
   47 -- Parses a quoted attribute value.
   48 pQuoted :: Char -> Parser Text
   49 pQuoted c = do
   50   skip (== c)
   51   contents <- takeTill (== c)
   52   skip (== c)
   53   return (T.singleton c <> contents <> T.singleton c)
   54 
   55 -- Parses an HTML comment. This isn't really correct to spec, but should
   56 -- do for now.
   57 pHtmlComment :: Parser Text
   58 pHtmlComment = do
   59   _ <- string "<!--"
   60   rest <- manyTill anyChar (string "-->")
   61   return $ "<!--" <> T.pack rest <> "-->"
   62 
   63 -- A link label [like this].  Note the precedence:  code backticks have
   64 -- precedence over label bracket markers, which have precedence over
   65 -- *, _, and other inline formatting markers.
   66 -- So, 2 below contains a link while 1 does not:
   67 -- 1. [a link `with a ](/url)` character
   68 -- 2. [a link *with emphasized ](/url) text*
   69 pLinkLabel :: Parser Text
   70 pLinkLabel = char '[' *> (T.concat <$>
   71   (manyTill (regChunk <|> pEscaped <|> bracketed <|> codeChunk) (char ']')))
   72   where regChunk = takeWhile1 (\c -> c /='`' && c /='[' && c /=']' && c /='\\')
   73         codeChunk = snd <$> pCode'
   74         bracketed = inBrackets <$> pLinkLabel
   75         inBrackets t = "[" <> t <> "]"
   76 
   77 -- A URL in a link or reference.  This may optionally be contained
   78 -- in `<..>`; otherwise whitespace and unbalanced right parentheses
   79 -- aren't allowed.  Newlines aren't allowed in any case.
   80 pLinkUrl :: Parser Text
   81 pLinkUrl = do
   82   inPointy <- (char '<' >> return True) <|> return False
   83   if inPointy
   84      then T.pack <$> manyTill
   85            (pSatisfy (\c -> c /='\r' && c /='\n')) (char '>')
   86      else T.concat <$> many (regChunk <|> parenChunk)
   87     where regChunk = takeWhile1 (notInClass " \n()\\") <|> pEscaped
   88           parenChunk = parenthesize . T.concat <$> (char '(' *>
   89                          manyTill (regChunk <|> parenChunk) (char ')'))
   90           parenthesize x = "(" <> x <> ")"
   91 
   92 -- A link title, single or double quoted or in parentheses.
   93 -- Note that Markdown.pl doesn't allow the parenthesized form in
   94 -- inline links -- only in references -- but this restriction seems
   95 -- arbitrary, so we remove it here.
   96 pLinkTitle :: Parser Text
   97 pLinkTitle = do
   98   c <- satisfy (\c -> c == '"' || c == '\'' || c == '(')
   99   next <- peekChar
  100   case next of
  101        Nothing                 -> mzero
  102        Just x
  103          | isWhitespace x      -> mzero
  104          | x == ')'            -> mzero
  105          | otherwise           -> return ()
  106   let ender = if c == '(' then ')' else c
  107   let pEnder = char ender <* nfb (skip isAlphaNum)
  108   let regChunk = takeWhile1 (\x -> x /= ender && x /= '\\') <|> pEscaped
  109   let nestedChunk = (\x -> T.singleton c <> x <> T.singleton ender)
  110                       <$> pLinkTitle
  111   T.concat <$> manyTill (regChunk <|> nestedChunk) pEnder
  112 
  113 -- A link reference is a square-bracketed link label, a colon,
  114 -- optional space or newline, a URL, optional space or newline,
  115 -- and an optional link title.  (Note:  we assume the input is
  116 -- pre-stripped, with no leading/trailing spaces.)
  117 pReference :: Parser (Text, Text, Text)
  118 pReference = do
  119   lab <- pLinkLabel
  120   _ <- char ':'
  121   scanSpnl
  122   url <- pLinkUrl
  123   tit <- option T.empty $ scanSpnl >> pLinkTitle
  124   endOfInput
  125   return (lab, url, tit)
  126 
  127 -- Parses an escaped character and returns a Text.
  128 pEscaped :: Parser Text
  129 pEscaped = T.singleton <$> (skip (=='\\') *> satisfy isEscapable)
  130 
  131 -- Parses a (possibly escaped) character satisfying the predicate.
  132 pSatisfy :: (Char -> Bool) -> Parser Char
  133 pSatisfy p =
  134   satisfy (\c -> c /= '\\' && p c)
  135    <|> (char '\\' *> satisfy (\c -> isEscapable c && p c))
  136 
  137 -- Parse a text into inlines, resolving reference links
  138 -- using the reference map.
  139 parseInlines :: ReferenceMap -> Text -> Inlines
  140 parseInlines refmap t =
  141   case parse (msum <$> many (pInline refmap) <* endOfInput) t of
  142        Left e   -> error ("parseInlines: " ++ show e) -- should not happen
  143        Right r  -> r
  144 
  145 pInline :: ReferenceMap -> Parser Inlines
  146 pInline refmap =
  147            pAsciiStr
  148        <|> pSpace
  149        <|> pEnclosure '*' refmap  -- strong/emph
  150        <|> (notAfter isAlphaNum *> pEnclosure '_' refmap)
  151        <|> pCode
  152        <|> pLink refmap
  153        <|> pImage refmap
  154        <|> pRawHtml
  155        <|> pAutolink
  156        <|> pEntity
  157        <|> pSym
  158 
  159 -- Parse spaces or newlines, and determine whether
  160 -- we have a regular space, a line break (two spaces before
  161 -- a newline), or a soft break (newline without two spaces
  162 -- before).
  163 pSpace :: Parser Inlines
  164 pSpace = do
  165   ss <- takeWhile1 isWhitespace
  166   return $ singleton
  167          $ if T.any (=='\n') ss
  168               then if "  " `T.isPrefixOf` ss
  169                    then LineBreak
  170                    else SoftBreak
  171               else Space
  172 
  173 isAsciiAlphaNum :: Char -> Bool
  174 isAsciiAlphaNum c =
  175   (c >= 'a' && c <= 'z') ||
  176   (c >= 'A' && c <= 'Z') ||
  177   (c >= '0' && c <= '9')
  178 
  179 pAsciiStr :: Parser Inlines
  180 pAsciiStr = do
  181   t <- takeWhile1 isAsciiAlphaNum
  182   mbc <- peekChar
  183   case mbc of
  184        Just ':' -> if t `Set.member` schemeSet
  185                       then pUri t
  186                       else return $ singleton $ Str t
  187        _        -> return $ singleton $ Str t
  188 
  189 -- Catch all -- parse an escaped character, an escaped
  190 -- newline, or any remaining symbol character.
  191 pSym :: Parser Inlines
  192 pSym = do
  193   c <- anyChar
  194   let ch = singleton . Str . T.singleton
  195   if c == '\\'
  196      then ch <$> satisfy isEscapable
  197           <|> singleton LineBreak <$ satisfy (=='\n')
  198           <|> return (ch '\\')
  199      else return (ch c)
  200 
  201 -- http://www.iana.org/assignments/uri-schemes.html plus
  202 -- the unofficial schemes coap, doi, javascript.
  203 schemes :: [Text]
  204 schemes = [ -- unofficial
  205             "coap","doi","javascript"
  206            -- official
  207            ,"aaa","aaas","about","acap"
  208            ,"cap","cid","crid","data","dav","dict","dns","file","ftp"
  209            ,"geo","go","gopher","h323","http","https","iax","icap","im"
  210            ,"imap","info","ipp","iris","iris.beep","iris.xpc","iris.xpcs"
  211            ,"iris.lwz","ldap","mailto","mid","msrp","msrps","mtqp"
  212            ,"mupdate","news","nfs","ni","nih","nntp","opaquelocktoken","pop"
  213            ,"pres","rtsp","service","session","shttp","sieve","sip","sips"
  214            ,"sms","snmp","soap.beep","soap.beeps","tag","tel","telnet","tftp"
  215            ,"thismessage","tn3270","tip","tv","urn","vemmi","ws","wss"
  216            ,"xcon","xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r"
  217            ,"z39.50s"
  218            -- provisional
  219            ,"adiumxtra","afp","afs","aim","apt","attachment","aw"
  220            ,"beshare","bitcoin","bolo","callto","chrome","chrome-extension"
  221            ,"com-eventbrite-attendee","content","cvs","dlna-playsingle"
  222            ,"dlna-playcontainer","dtn","dvb","ed2k","facetime","feed"
  223            ,"finger","fish","gg","git","gizmoproject","gtalk"
  224            ,"hcp","icon","ipn","irc","irc6","ircs","itms","jar"
  225            ,"jms","keyparc","lastfm","ldaps","magnet","maps","market"
  226            ,"message","mms","ms-help","msnim","mumble","mvn","notes"
  227            ,"oid","palm","paparazzi","platform","proxy","psyc","query"
  228            ,"res","resource","rmi","rsync","rtmp","secondlife","sftp"
  229            ,"sgn","skype","smb","soldat","spotify","ssh","steam","svn"
  230            ,"teamspeak","things","udp","unreal","ut2004","ventrilo"
  231            ,"view-source","webcal","wtai","wyciwyg","xfire","xri"
  232            ,"ymsgr" ]
  233 
  234 -- Make them a set for more efficient lookup.
  235 schemeSet :: Set.Set Text
  236 schemeSet = Set.fromList $ schemes ++ map T.toUpper schemes
  237 
  238 -- Parse a URI, using heuristics to avoid capturing final punctuation.
  239 pUri :: Text -> Parser Inlines
  240 pUri scheme = do
  241   _ <- char ':'
  242   x <- scan (OpenParens 0) uriScanner
  243   guard $ not $ T.null x
  244   let (rawuri, endingpunct) =
  245         case T.last x of
  246              c | c `elem` (".;?!:," :: String) ->
  247                (scheme <> ":" <> T.init x, singleton (Str (T.singleton c)))
  248              _ -> (scheme <> ":" <> x, mempty)
  249   return $ autoLink rawuri <> endingpunct
  250 
  251 -- Scan non-ascii characters and ascii characters allowed in a URI.
  252 -- We allow punctuation except when followed by a space, since
  253 -- we don't want the trailing '.' in 'http://google.com.'
  254 -- We want to allow
  255 -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
  256 -- as a URL, while NOT picking up the closing paren in
  257 -- (http://wikipedia.org)
  258 -- So we include balanced parens in the URL.
  259 
  260 data OpenParens = OpenParens Int
  261 
  262 uriScanner :: OpenParens -> Char -> Maybe OpenParens
  263 uriScanner _ ' '  = Nothing
  264 uriScanner _ '\n' = Nothing
  265 uriScanner (OpenParens n) '(' = Just (OpenParens (n + 1))
  266 uriScanner (OpenParens n) ')'
  267   | n > 0 = Just (OpenParens (n - 1))
  268   | otherwise = Nothing
  269 uriScanner st '+' = Just st
  270 uriScanner st '/' = Just st
  271 uriScanner _ c | isSpace c = Nothing
  272 uriScanner st _ = Just st
  273 
  274 -- Parses material enclosed in *s, **s, _s, or __s.
  275 -- Designed to avoid backtracking.
  276 pEnclosure :: Char -> ReferenceMap -> Parser Inlines
  277 pEnclosure c refmap = do
  278   cs <- takeWhile1 (== c)
  279   (Str cs <|) <$> pSpace
  280    <|> case T.length cs of
  281             3  -> pThree c refmap
  282             2  -> pTwo c refmap mempty
  283             1  -> pOne c refmap mempty
  284             _  -> return (singleton $ Str cs)
  285 
  286 -- singleton sequence or empty if contents are empty
  287 single :: (Inlines -> Inline) -> Inlines -> Inlines
  288 single constructor ils = if Seq.null ils
  289                             then mempty
  290                             else singleton (constructor ils)
  291 
  292 -- parse inlines til you hit a c, and emit Emph.
  293 -- if you never hit a c, emit '*' + inlines parsed.
  294 pOne :: Char -> ReferenceMap -> Inlines -> Parser Inlines
  295 pOne c refmap prefix = do
  296   contents <- msum <$> many ( (nfbChar c >> pInline refmap)
  297                              <|> (string (T.pack [c,c]) >>
  298                                   nfbChar c >> pTwo c refmap mempty) )
  299   (char c >> return (single Emph $ prefix <> contents))
  300     <|> return (singleton (Str (T.singleton c)) <> (prefix <> contents))
  301 
  302 -- parse inlines til you hit two c's, and emit Strong.
  303 -- if you never do hit two c's, emit '**' plus + inlines parsed.
  304 pTwo :: Char -> ReferenceMap -> Inlines -> Parser Inlines
  305 pTwo c refmap prefix = do
  306   let ender = string $ T.pack [c,c]
  307   contents <- msum <$> many (nfb ender >> pInline refmap)
  308   (ender >> return (single Strong $ prefix <> contents))
  309     <|> return (singleton (Str $ T.pack [c,c]) <> (prefix <> contents))
  310 
  311 -- parse inlines til you hit one c or a sequence of two c's.
  312 -- If one c, emit Emph and then parse pTwo.
  313 -- if two c's, emit Strong and then parse pOne.
  314 pThree :: Char -> ReferenceMap -> Parser Inlines
  315 pThree c refmap = do
  316   contents <- msum <$> (many (nfbChar c >> pInline refmap))
  317   (string (T.pack [c,c]) >> (pOne c refmap (single Strong contents)))
  318    <|> (char c >> (pTwo c refmap (single Emph contents)))
  319    <|> return (singleton (Str $ T.pack [c,c,c]) <> contents)
  320 
  321 -- Inline code span.
  322 pCode :: Parser Inlines
  323 pCode = fst <$> pCode'
  324 
  325 -- this is factored out because it needed in pLinkLabel.
  326 pCode' :: Parser (Inlines, Text)
  327 pCode' = do
  328   ticks <- takeWhile1 (== '`')
  329   let end = string ticks >> nfb (char '`')
  330   let nonBacktickSpan = takeWhile1 (/= '`')
  331   let backtickSpan = takeWhile1 (== '`')
  332   contents <- T.concat <$> manyTill (nonBacktickSpan <|> backtickSpan) end
  333   return (singleton . Code . T.strip $ contents, ticks <> contents <> ticks)
  334 
  335 pLink :: ReferenceMap -> Parser Inlines
  336 pLink refmap = do
  337   lab <- pLinkLabel
  338   let lab' = parseInlines refmap lab
  339   pInlineLink lab' <|> pReferenceLink refmap lab lab'
  340     -- fallback without backtracking if it's not a link:
  341     <|> return (singleton (Str "[") <> lab' <> singleton (Str "]"))
  342 
  343 -- An inline link: [label](/url "optional title")
  344 pInlineLink :: Inlines -> Parser Inlines
  345 pInlineLink lab = do
  346   _ <- char '('
  347   scanSpaces
  348   url <- pLinkUrl
  349   tit <- option "" $ scanSpnl *> pLinkTitle <* scanSpaces
  350   _ <- char ')'
  351   return $ singleton $ Link lab (Url url) tit
  352 
  353 -- A reference link: [label], [foo][label], or [label][].
  354 pReferenceLink :: ReferenceMap -> Text -> Inlines -> Parser Inlines
  355 pReferenceLink _ rawlab lab = do
  356   ref <- option rawlab $ scanSpnl >> pLinkLabel
  357   return $ singleton $ Link lab (Ref ref) ""
  358 
  359 
  360 -- An image:  ! followed by a link.
  361 pImage :: ReferenceMap -> Parser Inlines
  362 pImage refmap = do
  363   _ <- char '!'
  364   (linkToImage <$> pLink refmap) <|> return (singleton (Str "!"))
  365 
  366 linkToImage :: Inlines -> Inlines
  367 linkToImage ils =
  368   case viewl ils of
  369         (Link lab (Url url) tit :< x)
  370           | Seq.null x -> singleton (Image lab url tit)
  371         _ -> singleton (Str "!") <> ils
  372 
  373 -- An entity.  We store these in a special inline element.
  374 -- This ensures that entities in the input come out as
  375 -- entities in the output. Alternatively we could simply
  376 -- convert them to characters and store them as Str inlines.
  377 pEntity :: Parser Inlines
  378 pEntity = do
  379   _ <- char '&'
  380   res <- pCharEntity <|> pDecEntity <|> pHexEntity
  381   _ <- char ';'
  382   return $ singleton $ Entity $ "&" <> res <> ";"
  383 
  384 pCharEntity :: Parser Text
  385 pCharEntity = takeWhile1 (\c -> isAscii c && isLetter c)
  386 
  387 pDecEntity :: Parser Text
  388 pDecEntity = do
  389   _ <- char '#'
  390   res <- takeWhile1 isDigit
  391   return $ "#" <> res
  392 
  393 pHexEntity :: Parser Text
  394 pHexEntity = do
  395   _ <- char '#'
  396   x <- char 'X' <|> char 'x'
  397   res <- takeWhile1 isHexDigit
  398   return $ "#" <> T.singleton x <> res
  399 
  400 -- Raw HTML tag or comment.
  401 pRawHtml :: Parser Inlines
  402 pRawHtml = singleton . RawHtml <$> (snd <$> pHtmlTag <|> pHtmlComment)
  403 
  404 -- A link like this: <http://whatever.com> or <me@mydomain.edu>.
  405 -- Markdown.pl does email obfuscation; we don't bother with that here.
  406 pAutolink :: Parser Inlines
  407 pAutolink = do
  408   skip (=='<')
  409   s <- takeWhile1 (\c -> c /= ':' && c /= '@')
  410   rest <- takeWhile1 (\c -> c /='>' && c /= ' ')
  411   skip (=='>')
  412   case True of
  413        _ | "@" `T.isPrefixOf` rest -> return $ emailLink (s <> rest)
  414          | s `Set.member` schemeSet -> return $ autoLink (s <> rest)
  415          | otherwise   -> fail "Unknown contents of <>"
  416 
  417 autoLink :: Text -> Inlines
  418 autoLink t = singleton $ Link (toInlines t) (Url t) (T.empty)
  419   where toInlines t' = case parse pToInlines t' of
  420                          Right r   -> r
  421                          Left e    -> error $ "autolink: " ++ show e
  422         pToInlines = mconcat <$> many strOrEntity
  423         strOrEntity = ((singleton . Str) <$> takeWhile1 (/='&'))
  424                    <|> pEntity
  425                    <|> ((singleton . Str) <$> string "&")
  426 
  427 emailLink :: Text -> Inlines
  428 emailLink t = singleton $ Link (singleton $ Str t)
  429                                (Url $ "mailto:" <> t) (T.empty)