never executed always true always false
    1 {-# LANGUAGE OverloadedStrings, PatternGuards #-}
    2 module Cheapskate.Parse (
    3          markdown
    4        ) where
    5 import Cheapskate.ParserCombinators
    6 import Cheapskate.Util
    7 import Cheapskate.Inlines
    8 import Cheapskate.Types
    9 import Data.Char hiding (Space)
   10 import qualified Data.Set as Set
   11 import Prelude hiding (takeWhile)
   12 import Data.Maybe (mapMaybe)
   13 import Data.Text (Text)
   14 import qualified Data.Text as T
   15 import Data.Foldable (toList)
   16 import Data.Sequence ((|>), viewr, ViewR(..), singleton, Seq)
   17 import qualified Data.Sequence as Seq
   18 import Control.Monad.RWS
   19 import Control.Applicative
   20 import qualified Data.Map as M
   21 import Data.List (intercalate)
   22 
   23 import Debug.Trace
   24 
   25 -- | Parses the input as a markdown document.  Note that 'Doc' is an instance
   26 -- of 'ToMarkup', so the document can be converted to 'Html' using 'toHtml'.
   27 -- A simple 'Text' to 'Html' filter would be
   28 --
   29 -- > markdownToHtml :: Text -> Html
   30 -- > markdownToHtml = toHtml . markdown def
   31 markdown :: Options -> Text -> Doc
   32 markdown opts
   33   | debug opts = (\x -> trace (show x) $ Doc opts mempty) . processLines
   34   | otherwise  = Doc opts . processDocument . processLines
   35 
   36 -- General parsing strategy:
   37 --
   38 -- Step 1:  processLines
   39 --
   40 -- We process the input line by line.  Each line modifies the
   41 -- container stack, by adding a leaf to the current open container,
   42 -- sometimes after closing old containers and/or opening new ones.
   43 --
   44 -- To open a container is to add it to the top of the container stack,
   45 -- so that new content will be added under this container.
   46 -- To close a container is to remove it from the container stack and
   47 -- make it a child of the container above it on the container stack.
   48 --
   49 -- When all the input has been processed, we close all open containers
   50 -- except the root (Document) container.  At this point we should also
   51 -- have a ReferenceMap containing any defined link references.
   52 --
   53 -- Step 2:  processDocument
   54 --
   55 -- We then convert this container structure into an AST.  This principally
   56 -- involves (a) gathering consecutive ListItem containers into lists, (b)
   57 -- gathering TextLine nodes that don't belong to verbatim containers into
   58 -- paragraphs, and (c) parsing the inline contents of non-verbatim TextLines.
   59 
   60 --------
   61 
   62 -- Container stack definitions:
   63 
   64 data ContainerStack =
   65   ContainerStack Container {- top -} [Container] {- rest -}
   66 
   67 type LineNumber   = Int
   68 
   69 -- Generic type for a container or a leaf.
   70 data Elt = C Container
   71          | L LineNumber Leaf
   72          deriving Show
   73 
   74 data Container = Container{
   75                      containerType :: ContainerType
   76                    , children      :: Seq Elt
   77                    }
   78 
   79 data ContainerType = Document
   80                    | BlockQuote
   81                    | ListItem { markerColumn :: Int
   82                               , padding      :: Int
   83                               , listType     :: ListType }
   84                    | FencedCode { startColumn :: Int
   85                                 , fence :: Text
   86                                 , info :: Text }
   87                    | IndentedCode
   88                    | RawHtmlBlock
   89                    | Reference
   90                    deriving (Eq, Show)
   91 
   92 instance Show Container where
   93   show c = show (containerType c) ++ "\n" ++
   94     nest 2 (intercalate "\n" (map showElt $ toList $ children c))
   95 
   96 nest :: Int -> String -> String
   97 nest num = intercalate "\n" . map ((replicate num ' ') ++) . lines
   98 
   99 showElt :: Elt -> String
  100 showElt (C c) = show c
  101 showElt (L _ (TextLine s)) = show s
  102 showElt (L _ lf) = show lf
  103 
  104 -- Scanners that must be satisfied if the current open container
  105 -- is to be continued on a new line (ignoring lazy continuations).
  106 containerContinue :: Container -> Scanner
  107 containerContinue c =
  108   case containerType c of
  109        BlockQuote     -> scanNonindentSpace *> scanBlockquoteStart
  110        IndentedCode   -> scanIndentSpace
  111        FencedCode{startColumn = col} ->
  112                          scanSpacesToColumn col
  113        RawHtmlBlock   -> nfb scanBlankline
  114        li@ListItem{}  -> scanBlankline
  115                          <|>
  116                          (do scanSpacesToColumn
  117                                 (markerColumn li + 1)
  118                              _ <- upToCountChars (padding li - 1)
  119                                 (==' ')
  120                              return ())
  121        Reference{}    -> nfb scanBlankline >>
  122                          nfb (scanNonindentSpace *> scanReference)
  123        _              -> return ()
  124 {-# INLINE containerContinue #-}
  125 
  126 -- Defines parsers that open new containers.
  127 containerStart :: Bool -> Parser ContainerType
  128 containerStart _lastLineIsText = scanNonindentSpace *>
  129    (  (BlockQuote <$ scanBlockquoteStart)
  130   <|> parseListMarker
  131    )
  132 
  133 -- Defines parsers that open new verbatim containers (containers
  134 -- that take only TextLine and BlankLine as children).
  135 verbatimContainerStart :: Bool -> Parser ContainerType
  136 verbatimContainerStart lastLineIsText = scanNonindentSpace *>
  137    (  parseCodeFence
  138   <|> (guard (not lastLineIsText) *> (IndentedCode <$ char ' ' <* nfb scanBlankline))
  139   <|> (guard (not lastLineIsText) *> (RawHtmlBlock <$ parseHtmlBlockStart))
  140   <|> (guard (not lastLineIsText) *> (Reference <$ scanReference))
  141    )
  142 
  143 -- Leaves of the container structure (they don't take children).
  144 data Leaf = TextLine Text
  145           | BlankLine Text
  146           | ATXHeader Int Text
  147           | SetextHeader Int Text
  148           | Rule
  149           deriving (Show)
  150 
  151 type ContainerM = RWS () ReferenceMap ContainerStack
  152 
  153 -- Close the whole container stack, leaving only the root Document container.
  154 closeStack :: ContainerM Container
  155 closeStack = do
  156   ContainerStack top rest  <- get
  157   if null rest
  158      then return top
  159      else closeContainer >> closeStack
  160 
  161 -- Close the top container on the stack.  If the container is a Reference
  162 -- container, attempt to parse the reference and update the reference map.
  163 -- If it is a list item container, move a final BlankLine outside the list
  164 -- item.
  165 closeContainer :: ContainerM ()
  166 closeContainer = do
  167   ContainerStack top rest <- get
  168   case top of
  169        (Container Reference{} cs'') ->
  170          case parse pReference
  171                (T.strip $ joinLines $ map extractText $ toList cs'') of
  172               Right (lab, lnk, tit) -> do
  173                 tell (M.singleton (normalizeReference lab) (lnk, tit))
  174                 case rest of
  175                     (Container ct' cs' : rs) ->
  176                       put $ ContainerStack (Container ct' (cs' |> C top)) rs
  177                     [] -> return ()
  178               Left _ -> -- pass over in silence if ref doesn't parse?
  179                         case rest of
  180                              (c:cs) -> put $ ContainerStack c cs
  181                              []     -> return ()
  182        (Container li@ListItem{} cs'') ->
  183          case rest of
  184               -- move final BlankLine outside of list item
  185               (Container ct' cs' : rs) ->
  186                        case viewr cs'' of
  187                             (zs :> b@(L _ BlankLine{})) ->
  188                               put $ ContainerStack
  189                                    (if Seq.null zs
  190                                        then Container ct' (cs' |> C (Container li zs))
  191                                        else Container ct' (cs' |>
  192                                                C (Container li zs) |> b)) rs
  193                             _ -> put $ ContainerStack (Container ct' (cs' |> C top)) rs
  194               [] -> return ()
  195        _ -> case rest of
  196              (Container ct' cs' : rs) ->
  197                  put $ ContainerStack (Container ct' (cs' |> C top)) rs
  198              [] -> return ()
  199 
  200 -- Add a leaf to the top container.
  201 addLeaf :: LineNumber -> Leaf -> ContainerM ()
  202 addLeaf lineNum lf = do
  203   ContainerStack top rest <- get
  204   case (top, lf) of
  205         (Container ct@(ListItem{}) cs, BlankLine{}) ->
  206           case viewr cs of
  207             (_ :> L _ BlankLine{}) -> -- two blanks break out of list item:
  208                  closeContainer >> addLeaf lineNum lf
  209             _ -> put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest
  210         (Container ct cs, _) ->
  211                  put $ ContainerStack (Container ct (cs |> L lineNum lf)) rest
  212 
  213 -- Add a container to the container stack.
  214 addContainer :: ContainerType -> ContainerM ()
  215 addContainer ct = modify $ \(ContainerStack top rest) ->
  216   ContainerStack (Container ct mempty) (top:rest)
  217 
  218 -- Step 2
  219 
  220 -- Convert Document container and reference map into an AST.
  221 processDocument :: (Container, ReferenceMap) -> Blocks
  222 processDocument (Container ct cs, refmap) =
  223   case ct of
  224     Document -> processElts refmap (toList cs)
  225     _        -> error "top level container is not Document"
  226 
  227 -- Turn the result of `processLines` into a proper AST.
  228 -- This requires grouping text lines into paragraphs
  229 -- and list items into lists, handling blank lines,
  230 -- parsing inline contents of texts and resolving referencess.
  231 processElts :: ReferenceMap -> [Elt] -> Blocks
  232 processElts _ [] = mempty
  233 
  234 processElts refmap (L _lineNumber lf : rest) =
  235   case lf of
  236     -- Special handling of @docs lines in Elm:
  237     TextLine t | Just terms1 <- T.stripPrefix "@docs" t ->
  238         let
  239             docs = terms1 : map (cleanDoc . extractText) docLines
  240         in
  241             singleton (ElmDocs $ filter ((/=) []) $ fmap (filter ((/=) ""). fmap T.strip . T.splitOn ",") docs) <>
  242             processElts refmap rest'
  243         where
  244             (docLines, rest') = span isDocLine rest
  245             isDocLine (L _ (TextLine _)) = True
  246             isDocLine _ = False
  247             cleanDoc lin =
  248                 case T.stripPrefix "@docs" lin of
  249                     Nothing -> lin
  250                     Just stripped -> stripped
  251 
  252     -- Gobble text lines and make them into a Para:
  253     TextLine t -> singleton (Para $ parseInlines refmap txt) <>
  254                   processElts refmap rest'
  255                where txt = T.stripEnd $ joinLines $ map T.stripStart
  256                            $ t : map extractText textlines
  257                      (textlines, rest') = span isTextLine rest
  258                      isTextLine (L _ (TextLine s)) | T.isPrefixOf "@docs" s = False
  259                      isTextLine (L _ (TextLine _)) = True
  260                      isTextLine _ = False
  261 
  262     -- Blanks at outer level are ignored:
  263     BlankLine{} -> processElts refmap rest
  264 
  265     -- Headers:
  266     ATXHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <>
  267                        processElts refmap rest
  268     SetextHeader lvl t -> singleton (Header lvl $ parseInlines refmap t) <>
  269                           processElts refmap rest
  270 
  271     -- Horizontal rule:
  272     Rule -> singleton HRule <> processElts refmap rest
  273 
  274 processElts refmap (C (Container ct cs) : rest) =
  275   case ct of
  276     Document -> error "Document container found inside Document"
  277 
  278     BlockQuote -> singleton (Blockquote $ processElts refmap (toList cs)) <>
  279                   processElts refmap rest
  280 
  281     -- List item?  Gobble up following list items of the same type
  282     -- (skipping blank lines), determine whether the list is tight or
  283     -- loose, and generate a List.
  284     ListItem { listType = listType' } ->
  285         singleton (List isTight listType' items') <> processElts refmap rest'
  286               where xs = takeListItems rest
  287 
  288                     rest' = drop (length xs) rest
  289 
  290                     -- take list items as long as list type matches and we
  291                     -- don't hit two blank lines:
  292                     takeListItems
  293                       (C c@(Container ListItem { listType = lt' } _) : zs)
  294                       | listTypesMatch lt' listType' = C c : takeListItems zs
  295                     takeListItems (lf@(L _ (BlankLine _)) :
  296                       c@(C (Container ListItem { listType = lt' } _)) : zs)
  297                       | listTypesMatch lt' listType' = lf : c : takeListItems zs
  298                     takeListItems _ = []
  299 
  300                     listTypesMatch (Bullet c1) (Bullet c2) = c1 == c2
  301                     listTypesMatch (Numbered w1 _) (Numbered w2 _) = w1 == w2
  302                     listTypesMatch _ _ = False
  303 
  304                     items = mapMaybe getItem (Container ct cs : [c | C c <- xs])
  305 
  306                     getItem (Container ListItem{} cs') = Just $ toList cs'
  307                     getItem _                          = Nothing
  308 
  309                     items' = map (processElts refmap) items
  310 
  311                     isTight = tightListItem xs && all tightListItem items
  312 
  313     FencedCode _ _ info' -> singleton (CodeBlock attr txt) <>
  314                                processElts refmap rest
  315                   where txt = joinLines $ map extractText $ toList cs
  316                         attr = CodeAttr x (T.strip y)
  317                         (x,y) = T.break (==' ') info'
  318 
  319     IndentedCode -> singleton (CodeBlock (CodeAttr "" "") txt)
  320                     <> processElts refmap rest'
  321                   where txt = joinLines $ stripTrailingEmpties
  322                               $ concatMap extractCode cbs
  323 
  324                         stripTrailingEmpties = reverse .
  325                           dropWhile (T.all (==' ')) . reverse
  326 
  327                         -- explanation for next line:  when we parsed
  328                         -- the blank line, we dropped 0-3 spaces.
  329                         -- but for this, code block context, we want
  330                         -- to have dropped 4 spaces. we simply drop
  331                         -- one more:
  332                         extractCode (L _ (BlankLine t)) = [T.drop 1 t]
  333                         extractCode (C (Container IndentedCode cs')) =
  334                           map extractText $ toList cs'
  335                         extractCode _ = []
  336 
  337                         (cbs, rest') = span isIndentedCodeOrBlank
  338                                        (C (Container ct cs) : rest)
  339 
  340                         isIndentedCodeOrBlank (L _ BlankLine{}) = True
  341                         isIndentedCodeOrBlank (C (Container IndentedCode _))
  342                                                               = True
  343                         isIndentedCodeOrBlank _               = False
  344 
  345     RawHtmlBlock -> singleton (HtmlBlock txt) <> processElts refmap rest
  346                   where txt = joinLines (map extractText (toList cs))
  347 
  348     -- References have already been taken into account in the reference map,
  349     -- so we just skip.
  350     Reference{} ->
  351         processElts' [] (C (Container ct cs) : rest)
  352         where
  353             refs cs' =
  354                 fmap (extractRef . extractText) (toList cs')
  355 
  356             extractRef t =
  357               case parse pReference (T.strip t) of
  358                  Right (lab, lnk, tit) ->
  359                     (lab, lnk, tit)
  360                  Left _ ->
  361                     ("??", "??", "??")
  362 
  363             processElts' :: [[(Text, Text, Text)]] -> [Elt] -> Blocks
  364             processElts' acc (C (Container Reference cs) : rest') =
  365                 processElts' (refs cs : acc) rest'
  366             processElts' acc pass =
  367                 (singleton $ ReferencesBlock $ concat $ reverse acc)
  368                     <> processElts refmap pass
  369 
  370    where isBlankLine (L _ BlankLine{}) = True
  371          isBlankLine _ = False
  372 
  373          tightListItem [] = True
  374          tightListItem xs = not $ any isBlankLine xs
  375 
  376 extractText :: Elt -> Text
  377 extractText (L _ (TextLine t)) = t
  378 extractText _ = mempty
  379 
  380 -- Step 1
  381 
  382 processLines :: Text -> (Container, ReferenceMap)
  383 processLines t = (doc, refmap)
  384   where
  385   (doc, refmap) = evalRWS (mapM_ processLine lns >> closeStack) () startState
  386   lns        = zip [1..] (map tabFilter $ T.lines t)
  387   startState = ContainerStack (Container Document mempty) []
  388 
  389 -- The main block-parsing function.
  390 -- We analyze a line of text and modify the container stack accordingly,
  391 -- adding a new leaf, or closing or opening containers.
  392 processLine :: (LineNumber, Text) -> ContainerM ()
  393 processLine (lineNumber, txt) = do
  394   ContainerStack top@(Container ct cs) rest <- get
  395 
  396   -- Apply the line-start scanners appropriate for each nested container.
  397   -- Return the remainder of the string, and the number of unmatched
  398   -- containers.
  399   let (t', numUnmatched) = tryOpenContainers (reverse $ top:rest) txt
  400 
  401   -- Some new containers can be started only after a blank.
  402   let lastLineIsText = numUnmatched == 0 &&
  403                        case viewr cs of
  404                             (_ :> L _ (TextLine _)) -> True
  405                             _                       -> False
  406 
  407   -- Process the rest of the line in a way that makes sense given
  408   -- the container type at the top of the stack (ct):
  409   case ct of
  410     -- If it's a verbatim line container, add the line.
  411     RawHtmlBlock{} | numUnmatched == 0 -> addLeaf lineNumber (TextLine t')
  412     IndentedCode   | numUnmatched == 0 -> addLeaf lineNumber (TextLine t')
  413     FencedCode{ fence = fence' } ->
  414     -- here we don't check numUnmatched because we allow laziness
  415       if fence' `T.isPrefixOf` t'
  416          -- closing code fence
  417          then closeContainer
  418          else addLeaf lineNumber (TextLine t')
  419 
  420     Reference ->
  421       case tryNewContainers lastLineIsText (T.length txt - T.length t') t' of
  422         (ns, lf) -> do
  423           closeContainer
  424           addNew (ns, lf)
  425 
  426     -- otherwise, parse the remainder to see if we have new container starts:
  427     _ -> case tryNewContainers lastLineIsText (T.length txt - T.length t') t' of
  428 
  429        -- lazy continuation: text line, last line was text, no new containers,
  430        -- some unmatched containers:
  431        ([], TextLine t)
  432            | numUnmatched > 0
  433            , case viewr cs of
  434                   (_ :> L _ (TextLine _)) -> True
  435                   _                       -> False
  436            , ct /= IndentedCode -> addLeaf lineNumber (TextLine t)
  437 
  438        -- if it's a setext header line and the top container has a textline
  439        -- as last child, add a setext header:
  440        ([], SetextHeader lev _) | numUnmatched == 0 ->
  441            case viewr cs of
  442              (cs' :> L _ (TextLine t)) -> -- replace last text line with setext header
  443                put $ ContainerStack (Container ct
  444                         (cs' |> L lineNumber (SetextHeader lev t))) rest
  445                -- Note: the following case should not occur, since
  446                -- we don't add a SetextHeader leaf unless lastLineIsText.
  447              _ -> error "setext header line without preceding text line"
  448 
  449        -- otherwise, close all the unmatched containers, add the new
  450        -- containers, and finally add the new leaf:
  451        (ns, lf) -> do -- close unmatched containers, add new ones
  452            _ <- replicateM numUnmatched closeContainer
  453            addNew (ns, lf)
  454 
  455   where
  456     addNew (ns, lf) = do
  457       mapM_ addContainer ns
  458       case (reverse ns, lf) of
  459         -- don't add extra blank at beginning of fenced code block
  460         (FencedCode{}:_,  BlankLine{}) -> return ()
  461         _ -> addLeaf lineNumber lf
  462 
  463 -- Try to match the scanners corresponding to any currently open containers.
  464 -- Return remaining text after matching scanners, plus the number of open
  465 -- containers whose scanners did not match.  (These will be closed unless
  466 -- we have a lazy text line.)
  467 tryOpenContainers :: [Container] -> Text -> (Text, Int)
  468 tryOpenContainers cs t = case parse (scanners $ map containerContinue cs) t of
  469                          Right (t', n)  -> (t', n)
  470                          Left e         -> error $ "error parsing scanners: " ++
  471                                             show e
  472   where scanners [] = (,) <$> takeText <*> pure 0
  473         scanners (p:ps) = (p *> scanners ps)
  474                       <|> ((,) <$> takeText <*> pure (length (p:ps)))
  475 
  476 -- Try to match parsers for new containers.  Return list of new
  477 -- container types, and the leaf to add inside the new containers.
  478 tryNewContainers :: Bool -> Int -> Text -> ([ContainerType], Leaf)
  479 tryNewContainers lastLineIsText offset t =
  480   case parse newContainers t of
  481        Right (cs,t') -> (cs, t')
  482        Left err      -> error (show err)
  483   where newContainers = do
  484           getPosition >>= \pos -> setPosition pos{ column = offset + 1 }
  485           regContainers <- many (containerStart lastLineIsText)
  486           verbatimContainers <- option []
  487                             $ count 1 (verbatimContainerStart lastLineIsText)
  488           if null verbatimContainers
  489              then (,) <$> pure regContainers <*> leaf lastLineIsText
  490              else (,) <$> pure (regContainers ++ verbatimContainers) <*>
  491                             textLineOrBlank
  492 
  493 textLineOrBlank :: Parser Leaf
  494 textLineOrBlank = consolidate <$> takeText
  495   where consolidate ts | T.all isWhitespace ts = BlankLine ts
  496                        | otherwise        = TextLine  ts
  497 
  498 -- Parse a leaf node.
  499 leaf :: Bool -> Parser Leaf
  500 leaf lastLineIsText = scanNonindentSpace *> (
  501      (ATXHeader <$> parseAtxHeaderStart <*>
  502          (T.strip . removeATXSuffix <$> takeText))
  503    <|> (guard lastLineIsText *> (SetextHeader <$> parseSetextHeaderLine <*> pure mempty))
  504    <|> (Rule <$ scanHRuleLine)
  505    <|> textLineOrBlank
  506   )
  507   where removeATXSuffix t = case T.dropWhileEnd (`elem` (" #" :: String)) t of
  508                                  t' | T.null t' -> t'
  509                                       -- an escaped \#
  510                                     | T.last t' == '\\' -> t' <> "#"
  511                                     | otherwise -> t'
  512 
  513 -- Scanners
  514 
  515 scanReference :: Scanner
  516 scanReference = () <$ lookAhead (pLinkLabel >> scanChar ':')
  517 
  518 -- Scan the beginning of a blockquote:  up to three
  519 -- spaces indent, the `>` character, and an optional space.
  520 scanBlockquoteStart :: Scanner
  521 scanBlockquoteStart = scanChar '>' >> option () (scanChar ' ')
  522 
  523 -- Parse the sequence of `#` characters that begins an ATX
  524 -- header, and return the number of characters.  We require
  525 -- a space after the initial string of `#`s, as not all markdown
  526 -- implementations do. This is because (a) the ATX reference
  527 -- implementation requires a space, and (b) since we're allowing
  528 -- headers without preceding blank lines, requiring the space
  529 -- avoids accidentally capturing a line like `#8 toggle bolt` as
  530 -- a header.
  531 parseAtxHeaderStart :: Parser Int
  532 parseAtxHeaderStart = do
  533   _ <- char '#'
  534   hashes <- upToCountChars 5 (== '#')
  535   -- hashes must be followed by space unless empty header:
  536   notFollowedBy (skip (/= ' '))
  537   return $ T.length hashes + 1
  538 
  539 parseSetextHeaderLine :: Parser Int
  540 parseSetextHeaderLine = do
  541   d <- satisfy (\c -> c == '-' || c == '=')
  542   let lev = if d == '=' then 1 else 2
  543   skipWhile (== d)
  544   scanBlankline
  545   return lev
  546 
  547 -- Scan a horizontal rule line: "...three or more hyphens, asterisks,
  548 -- or underscores on a line by themselves. If you wish, you may use
  549 -- spaces between the hyphens or asterisks."
  550 scanHRuleLine :: Scanner
  551 scanHRuleLine = do
  552   c <- satisfy (\c -> c == '*' || c == '_' || c == '-')
  553   _ <- count 2 $ scanSpaces >> skip (== c)
  554   skipWhile (\x -> x == ' ' || x == c)
  555   endOfInput
  556 
  557 -- Parse an initial code fence line, returning
  558 -- the fence part and the rest (after any spaces).
  559 parseCodeFence :: Parser ContainerType
  560 parseCodeFence = do
  561   col <- column <$> getPosition
  562   cs <- takeWhile1 (=='`') <|> takeWhile1 (=='~')
  563   guard $ T.length cs >= 3
  564   scanSpaces
  565   rawattr <- takeWhile (\c -> c /= '`' && c /= '~')
  566   endOfInput
  567   return $ FencedCode { startColumn = col
  568                       , fence = cs
  569                       , info = rawattr }
  570 
  571 -- Parse the start of an HTML block:  either an HTML tag or an
  572 -- HTML comment, with no indentation.
  573 parseHtmlBlockStart :: Parser ()
  574 parseHtmlBlockStart = () <$ lookAhead
  575      ((do t <- pHtmlTag
  576           guard $ f $ fst t
  577           return $ snd t)
  578     <|> string "<!--"
  579     <|> string "-->"
  580      )
  581  where f (Opening name) = name `Set.member` blockHtmlTags
  582        f (SelfClosing name) = name `Set.member` blockHtmlTags
  583        f (Closing name) = name `Set.member` blockHtmlTags
  584 
  585 -- List of block level tags for HTML 5.
  586 blockHtmlTags :: Set.Set Text
  587 blockHtmlTags = Set.fromList
  588  [ "article", "header", "aside", "hgroup", "blockquote", "hr",
  589    "body", "li", "br", "map", "button", "object", "canvas", "ol",
  590    "caption", "output", "col", "p", "colgroup", "pre", "dd",
  591    "progress", "div", "section", "dl", "table", "dt", "tbody",
  592    "embed", "textarea", "fieldset", "tfoot", "figcaption", "th",
  593    "figure", "thead", "footer", "footer", "tr", "form", "ul",
  594    "h1", "h2", "h3", "h4", "h5", "h6", "video"]
  595 
  596 -- Parse a list marker and return the list type.
  597 parseListMarker :: Parser ContainerType
  598 parseListMarker = do
  599   col <- column <$> getPosition
  600   ty <- parseBullet <|> parseListNumber
  601   -- padding is 1 if list marker followed by a blank line
  602   -- or indented code.  otherwise it's the length of the
  603   -- whitespace between the list marker and the following text:
  604   padding' <- (1 <$ scanBlankline)
  605           <|> (1 <$ (skip (==' ') *> lookAhead (count 4 (char ' '))))
  606           <|> (T.length <$> takeWhile (==' '))
  607   -- text can't immediately follow the list marker:
  608   guard $ padding' > 0
  609   return $ ListItem { listType = ty
  610                     , markerColumn = col
  611                     , padding = padding' + listMarkerWidth ty
  612                     }
  613 
  614 listMarkerWidth :: ListType -> Int
  615 listMarkerWidth (Bullet _) = 1
  616 listMarkerWidth (Numbered _ n) | n < 10    = 2
  617                                | n < 100   = 3
  618                                | n < 1000  = 4
  619                                | otherwise = 5
  620 
  621 -- Parse a bullet and return list type.
  622 parseBullet :: Parser ListType
  623 parseBullet = do
  624   c <- satisfy (\c -> c == '+' || c == '*' || c == '-')
  625   unless (c == '+')
  626     $ nfb $ (count 2 $ scanSpaces >> skip (== c)) >>
  627           skipWhile (\x -> x == ' ' || x == c) >> endOfInput -- hrule
  628   return $ Bullet c
  629 
  630 -- Parse a list number marker and return list type.
  631 parseListNumber :: Parser ListType
  632 parseListNumber = do
  633     num <- (read . T.unpack) <$> takeWhile1 isDigit
  634     wrap <-  PeriodFollowing <$ skip (== '.')
  635          <|> ParenFollowing <$ skip (== ')')
  636     return $ Numbered wrap num