never executed always true always false
1 {-# LANGUAGE OverloadedStrings #-}
2 module Cheapskate.Inlines (
3 parseInlines
4 , pHtmlTag
5 , pReference
6 , pLinkLabel)
7 where
8 import Cheapskate.ParserCombinators
9 import Cheapskate.Util
10 import Cheapskate.Types
11 import Data.Char hiding (Space)
12 import qualified Data.Sequence as Seq
13 import Data.Sequence (singleton, (<|), viewl, ViewL(..))
14 import Prelude hiding (takeWhile)
15 import Control.Applicative
16 import Control.Monad
17 import Data.Text (Text)
18 import qualified Data.Text as T
19 import qualified Data.Set as Set
20
21 -- Returns tag type and whole tag.
22 pHtmlTag :: Parser (HtmlTagType, Text)
23 pHtmlTag = do
24 _ <- char '<'
25 -- do not end the tag with a > character in a quoted attribute.
26 closing <- (char '/' >> return True) <|> return False
27 tagname <- takeWhile1 (\c -> isAsciiAlphaNum c || c == '?' || c == '!')
28 let tagname' = T.toLower tagname
29 let attr = do ss <- takeWhile isSpace
30 x <- satisfy isLetter
31 xs <- takeWhile (\c -> isAsciiAlphaNum c || c == ':')
32 skip (=='=')
33 v <- pQuoted '"' <|> pQuoted '\'' <|> takeWhile1 isAlphaNum
34 <|> return ""
35 return $ ss <> T.singleton x <> xs <> "=" <> v
36 attrs <- T.concat <$> many attr
37 final <- takeWhile (\c -> isSpace c || c == '/')
38 _ <- char '>'
39 let tagtype = if closing
40 then Closing tagname'
41 else case T.stripSuffix "/" final of
42 Just _ -> SelfClosing tagname'
43 Nothing -> Opening tagname'
44 return (tagtype,
45 T.pack ('<' : ['/' | closing]) <> tagname <> attrs <> final <> ">")
46
47 -- Parses a quoted attribute value.
48 pQuoted :: Char -> Parser Text
49 pQuoted c = do
50 skip (== c)
51 contents <- takeTill (== c)
52 skip (== c)
53 return (T.singleton c <> contents <> T.singleton c)
54
55 -- Parses an HTML comment. This isn't really correct to spec, but should
56 -- do for now.
57 pHtmlComment :: Parser Text
58 pHtmlComment = do
59 _ <- string "<!--"
60 rest <- manyTill anyChar (string "-->")
61 return $ "<!--" <> T.pack rest <> "-->"
62
63 -- A link label [like this]. Note the precedence: code backticks have
64 -- precedence over label bracket markers, which have precedence over
65 -- *, _, and other inline formatting markers.
66 -- So, 2 below contains a link while 1 does not:
67 -- 1. [a link `with a ](/url)` character
68 -- 2. [a link *with emphasized ](/url) text*
69 pLinkLabel :: Parser Text
70 pLinkLabel = char '[' *> (T.concat <$>
71 (manyTill (regChunk <|> pEscaped <|> bracketed <|> codeChunk) (char ']')))
72 where regChunk = takeWhile1 (\c -> c /='`' && c /='[' && c /=']' && c /='\\')
73 codeChunk = snd <$> pCode'
74 bracketed = inBrackets <$> pLinkLabel
75 inBrackets t = "[" <> t <> "]"
76
77 -- A URL in a link or reference. This may optionally be contained
78 -- in `<..>`; otherwise whitespace and unbalanced right parentheses
79 -- aren't allowed. Newlines aren't allowed in any case.
80 pLinkUrl :: Parser Text
81 pLinkUrl = do
82 inPointy <- (char '<' >> return True) <|> return False
83 if inPointy
84 then T.pack <$> manyTill
85 (pSatisfy (\c -> c /='\r' && c /='\n')) (char '>')
86 else T.concat <$> many (regChunk <|> parenChunk)
87 where regChunk = takeWhile1 (notInClass " \n()\\") <|> pEscaped
88 parenChunk = parenthesize . T.concat <$> (char '(' *>
89 manyTill (regChunk <|> parenChunk) (char ')'))
90 parenthesize x = "(" <> x <> ")"
91
92 -- A link title, single or double quoted or in parentheses.
93 -- Note that Markdown.pl doesn't allow the parenthesized form in
94 -- inline links -- only in references -- but this restriction seems
95 -- arbitrary, so we remove it here.
96 pLinkTitle :: Parser Text
97 pLinkTitle = do
98 c <- satisfy (\c -> c == '"' || c == '\'' || c == '(')
99 next <- peekChar
100 case next of
101 Nothing -> mzero
102 Just x
103 | isWhitespace x -> mzero
104 | x == ')' -> mzero
105 | otherwise -> return ()
106 let ender = if c == '(' then ')' else c
107 let pEnder = char ender <* nfb (skip isAlphaNum)
108 let regChunk = takeWhile1 (\x -> x /= ender && x /= '\\') <|> pEscaped
109 let nestedChunk = (\x -> T.singleton c <> x <> T.singleton ender)
110 <$> pLinkTitle
111 T.concat <$> manyTill (regChunk <|> nestedChunk) pEnder
112
113 -- A link reference is a square-bracketed link label, a colon,
114 -- optional space or newline, a URL, optional space or newline,
115 -- and an optional link title. (Note: we assume the input is
116 -- pre-stripped, with no leading/trailing spaces.)
117 pReference :: Parser (Text, Text, Text)
118 pReference = do
119 lab <- pLinkLabel
120 _ <- char ':'
121 scanSpnl
122 url <- pLinkUrl
123 tit <- option T.empty $ scanSpnl >> pLinkTitle
124 endOfInput
125 return (lab, url, tit)
126
127 -- Parses an escaped character and returns a Text.
128 pEscaped :: Parser Text
129 pEscaped = T.singleton <$> (skip (=='\\') *> satisfy isEscapable)
130
131 -- Parses a (possibly escaped) character satisfying the predicate.
132 pSatisfy :: (Char -> Bool) -> Parser Char
133 pSatisfy p =
134 satisfy (\c -> c /= '\\' && p c)
135 <|> (char '\\' *> satisfy (\c -> isEscapable c && p c))
136
137 -- Parse a text into inlines, resolving reference links
138 -- using the reference map.
139 parseInlines :: ReferenceMap -> Text -> Inlines
140 parseInlines refmap t =
141 case parse (msum <$> many (pInline refmap) <* endOfInput) t of
142 Left e -> error ("parseInlines: " ++ show e) -- should not happen
143 Right r -> r
144
145 pInline :: ReferenceMap -> Parser Inlines
146 pInline refmap =
147 pAsciiStr
148 <|> pSpace
149 <|> pEnclosure '*' refmap -- strong/emph
150 <|> (notAfter isAlphaNum *> pEnclosure '_' refmap)
151 <|> pCode
152 <|> pLink refmap
153 <|> pImage refmap
154 <|> pRawHtml
155 <|> pAutolink
156 <|> pEntity
157 <|> pSym
158
159 -- Parse spaces or newlines, and determine whether
160 -- we have a regular space, a line break (two spaces before
161 -- a newline), or a soft break (newline without two spaces
162 -- before).
163 pSpace :: Parser Inlines
164 pSpace = do
165 ss <- takeWhile1 isWhitespace
166 return $ singleton
167 $ if T.any (=='\n') ss
168 then if " " `T.isPrefixOf` ss
169 then LineBreak
170 else SoftBreak
171 else Space
172
173 isAsciiAlphaNum :: Char -> Bool
174 isAsciiAlphaNum c =
175 (c >= 'a' && c <= 'z') ||
176 (c >= 'A' && c <= 'Z') ||
177 (c >= '0' && c <= '9')
178
179 pAsciiStr :: Parser Inlines
180 pAsciiStr = do
181 t <- takeWhile1 isAsciiAlphaNum
182 mbc <- peekChar
183 case mbc of
184 Just ':' -> if t `Set.member` schemeSet
185 then pUri t
186 else return $ singleton $ Str t
187 _ -> return $ singleton $ Str t
188
189 -- Catch all -- parse an escaped character, an escaped
190 -- newline, or any remaining symbol character.
191 pSym :: Parser Inlines
192 pSym = do
193 c <- anyChar
194 let ch = singleton . Str . T.singleton
195 if c == '\\'
196 then ch <$> satisfy isEscapable
197 <|> singleton LineBreak <$ satisfy (=='\n')
198 <|> return (ch '\\')
199 else return (ch c)
200
201 -- http://www.iana.org/assignments/uri-schemes.html plus
202 -- the unofficial schemes coap, doi, javascript.
203 schemes :: [Text]
204 schemes = [ -- unofficial
205 "coap","doi","javascript"
206 -- official
207 ,"aaa","aaas","about","acap"
208 ,"cap","cid","crid","data","dav","dict","dns","file","ftp"
209 ,"geo","go","gopher","h323","http","https","iax","icap","im"
210 ,"imap","info","ipp","iris","iris.beep","iris.xpc","iris.xpcs"
211 ,"iris.lwz","ldap","mailto","mid","msrp","msrps","mtqp"
212 ,"mupdate","news","nfs","ni","nih","nntp","opaquelocktoken","pop"
213 ,"pres","rtsp","service","session","shttp","sieve","sip","sips"
214 ,"sms","snmp","soap.beep","soap.beeps","tag","tel","telnet","tftp"
215 ,"thismessage","tn3270","tip","tv","urn","vemmi","ws","wss"
216 ,"xcon","xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r"
217 ,"z39.50s"
218 -- provisional
219 ,"adiumxtra","afp","afs","aim","apt","attachment","aw"
220 ,"beshare","bitcoin","bolo","callto","chrome","chrome-extension"
221 ,"com-eventbrite-attendee","content","cvs","dlna-playsingle"
222 ,"dlna-playcontainer","dtn","dvb","ed2k","facetime","feed"
223 ,"finger","fish","gg","git","gizmoproject","gtalk"
224 ,"hcp","icon","ipn","irc","irc6","ircs","itms","jar"
225 ,"jms","keyparc","lastfm","ldaps","magnet","maps","market"
226 ,"message","mms","ms-help","msnim","mumble","mvn","notes"
227 ,"oid","palm","paparazzi","platform","proxy","psyc","query"
228 ,"res","resource","rmi","rsync","rtmp","secondlife","sftp"
229 ,"sgn","skype","smb","soldat","spotify","ssh","steam","svn"
230 ,"teamspeak","things","udp","unreal","ut2004","ventrilo"
231 ,"view-source","webcal","wtai","wyciwyg","xfire","xri"
232 ,"ymsgr" ]
233
234 -- Make them a set for more efficient lookup.
235 schemeSet :: Set.Set Text
236 schemeSet = Set.fromList $ schemes ++ map T.toUpper schemes
237
238 -- Parse a URI, using heuristics to avoid capturing final punctuation.
239 pUri :: Text -> Parser Inlines
240 pUri scheme = do
241 _ <- char ':'
242 x <- scan (OpenParens 0) uriScanner
243 guard $ not $ T.null x
244 let (rawuri, endingpunct) =
245 case T.last x of
246 c | c `elem` (".;?!:," :: String) ->
247 (scheme <> ":" <> T.init x, singleton (Str (T.singleton c)))
248 _ -> (scheme <> ":" <> x, mempty)
249 return $ autoLink rawuri <> endingpunct
250
251 -- Scan non-ascii characters and ascii characters allowed in a URI.
252 -- We allow punctuation except when followed by a space, since
253 -- we don't want the trailing '.' in 'http://google.com.'
254 -- We want to allow
255 -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
256 -- as a URL, while NOT picking up the closing paren in
257 -- (http://wikipedia.org)
258 -- So we include balanced parens in the URL.
259
260 data OpenParens = OpenParens Int
261
262 uriScanner :: OpenParens -> Char -> Maybe OpenParens
263 uriScanner _ ' ' = Nothing
264 uriScanner _ '\n' = Nothing
265 uriScanner (OpenParens n) '(' = Just (OpenParens (n + 1))
266 uriScanner (OpenParens n) ')'
267 | n > 0 = Just (OpenParens (n - 1))
268 | otherwise = Nothing
269 uriScanner st '+' = Just st
270 uriScanner st '/' = Just st
271 uriScanner _ c | isSpace c = Nothing
272 uriScanner st _ = Just st
273
274 -- Parses material enclosed in *s, **s, _s, or __s.
275 -- Designed to avoid backtracking.
276 pEnclosure :: Char -> ReferenceMap -> Parser Inlines
277 pEnclosure c refmap = do
278 cs <- takeWhile1 (== c)
279 (Str cs <|) <$> pSpace
280 <|> case T.length cs of
281 3 -> pThree c refmap
282 2 -> pTwo c refmap mempty
283 1 -> pOne c refmap mempty
284 _ -> return (singleton $ Str cs)
285
286 -- singleton sequence or empty if contents are empty
287 single :: (Inlines -> Inline) -> Inlines -> Inlines
288 single constructor ils = if Seq.null ils
289 then mempty
290 else singleton (constructor ils)
291
292 -- parse inlines til you hit a c, and emit Emph.
293 -- if you never hit a c, emit '*' + inlines parsed.
294 pOne :: Char -> ReferenceMap -> Inlines -> Parser Inlines
295 pOne c refmap prefix = do
296 contents <- msum <$> many ( (nfbChar c >> pInline refmap)
297 <|> (string (T.pack [c,c]) >>
298 nfbChar c >> pTwo c refmap mempty) )
299 (char c >> return (single Emph $ prefix <> contents))
300 <|> return (singleton (Str (T.singleton c)) <> (prefix <> contents))
301
302 -- parse inlines til you hit two c's, and emit Strong.
303 -- if you never do hit two c's, emit '**' plus + inlines parsed.
304 pTwo :: Char -> ReferenceMap -> Inlines -> Parser Inlines
305 pTwo c refmap prefix = do
306 let ender = string $ T.pack [c,c]
307 contents <- msum <$> many (nfb ender >> pInline refmap)
308 (ender >> return (single Strong $ prefix <> contents))
309 <|> return (singleton (Str $ T.pack [c,c]) <> (prefix <> contents))
310
311 -- parse inlines til you hit one c or a sequence of two c's.
312 -- If one c, emit Emph and then parse pTwo.
313 -- if two c's, emit Strong and then parse pOne.
314 pThree :: Char -> ReferenceMap -> Parser Inlines
315 pThree c refmap = do
316 contents <- msum <$> (many (nfbChar c >> pInline refmap))
317 (string (T.pack [c,c]) >> (pOne c refmap (single Strong contents)))
318 <|> (char c >> (pTwo c refmap (single Emph contents)))
319 <|> return (singleton (Str $ T.pack [c,c,c]) <> contents)
320
321 -- Inline code span.
322 pCode :: Parser Inlines
323 pCode = fst <$> pCode'
324
325 -- this is factored out because it needed in pLinkLabel.
326 pCode' :: Parser (Inlines, Text)
327 pCode' = do
328 ticks <- takeWhile1 (== '`')
329 let end = string ticks >> nfb (char '`')
330 let nonBacktickSpan = takeWhile1 (/= '`')
331 let backtickSpan = takeWhile1 (== '`')
332 contents <- T.concat <$> manyTill (nonBacktickSpan <|> backtickSpan) end
333 return (singleton . Code . T.strip $ contents, ticks <> contents <> ticks)
334
335 pLink :: ReferenceMap -> Parser Inlines
336 pLink refmap = do
337 lab <- pLinkLabel
338 let lab' = parseInlines refmap lab
339 pInlineLink lab' <|> pReferenceLink refmap lab lab'
340 -- fallback without backtracking if it's not a link:
341 <|> return (singleton (Str "[") <> lab' <> singleton (Str "]"))
342
343 -- An inline link: [label](/url "optional title")
344 pInlineLink :: Inlines -> Parser Inlines
345 pInlineLink lab = do
346 _ <- char '('
347 scanSpaces
348 url <- pLinkUrl
349 tit <- option "" $ scanSpnl *> pLinkTitle <* scanSpaces
350 _ <- char ')'
351 return $ singleton $ Link lab (Url url) tit
352
353 -- A reference link: [label], [foo][label], or [label][].
354 pReferenceLink :: ReferenceMap -> Text -> Inlines -> Parser Inlines
355 pReferenceLink _ rawlab lab = do
356 ref <- option rawlab $ scanSpnl >> pLinkLabel
357 return $ singleton $ Link lab (Ref ref) ""
358
359
360 -- An image: ! followed by a link.
361 pImage :: ReferenceMap -> Parser Inlines
362 pImage refmap = do
363 _ <- char '!'
364 (linkToImage <$> pLink refmap) <|> return (singleton (Str "!"))
365
366 linkToImage :: Inlines -> Inlines
367 linkToImage ils =
368 case viewl ils of
369 (Link lab (Url url) tit :< x)
370 | Seq.null x -> singleton (Image lab url tit)
371 _ -> singleton (Str "!") <> ils
372
373 -- An entity. We store these in a special inline element.
374 -- This ensures that entities in the input come out as
375 -- entities in the output. Alternatively we could simply
376 -- convert them to characters and store them as Str inlines.
377 pEntity :: Parser Inlines
378 pEntity = do
379 _ <- char '&'
380 res <- pCharEntity <|> pDecEntity <|> pHexEntity
381 _ <- char ';'
382 return $ singleton $ Entity $ "&" <> res <> ";"
383
384 pCharEntity :: Parser Text
385 pCharEntity = takeWhile1 (\c -> isAscii c && isLetter c)
386
387 pDecEntity :: Parser Text
388 pDecEntity = do
389 _ <- char '#'
390 res <- takeWhile1 isDigit
391 return $ "#" <> res
392
393 pHexEntity :: Parser Text
394 pHexEntity = do
395 _ <- char '#'
396 x <- char 'X' <|> char 'x'
397 res <- takeWhile1 isHexDigit
398 return $ "#" <> T.singleton x <> res
399
400 -- Raw HTML tag or comment.
401 pRawHtml :: Parser Inlines
402 pRawHtml = singleton . RawHtml <$> (snd <$> pHtmlTag <|> pHtmlComment)
403
404 -- A link like this: <http://whatever.com> or <me@mydomain.edu>.
405 -- Markdown.pl does email obfuscation; we don't bother with that here.
406 pAutolink :: Parser Inlines
407 pAutolink = do
408 skip (=='<')
409 s <- takeWhile1 (\c -> c /= ':' && c /= '@')
410 rest <- takeWhile1 (\c -> c /='>' && c /= ' ')
411 skip (=='>')
412 case True of
413 _ | "@" `T.isPrefixOf` rest -> return $ emailLink (s <> rest)
414 | s `Set.member` schemeSet -> return $ autoLink (s <> rest)
415 | otherwise -> fail "Unknown contents of <>"
416
417 autoLink :: Text -> Inlines
418 autoLink t = singleton $ Link (toInlines t) (Url t) (T.empty)
419 where toInlines t' = case parse pToInlines t' of
420 Right r -> r
421 Left e -> error $ "autolink: " ++ show e
422 pToInlines = mconcat <$> many strOrEntity
423 strOrEntity = ((singleton . Str) <$> takeWhile1 (/='&'))
424 <|> pEntity
425 <|> ((singleton . Str) <$> string "&")
426
427 emailLink :: Text -> Inlines
428 emailLink t = singleton $ Link (singleton $ Str t)
429 (Url $ "mailto:" <> t) (T.empty)