Commit 7ebbfc28 authored by Ben Gamari's avatar Ben Gamari
Browse files

Initial commit

parents
Copyright (c) 2016, Ben Gamari
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Ben Gamari nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import Distribution.Simple
main = defaultMain
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Parser
( Token(..)
, Attr(..)
, token
, tagStream
) where
import Data.Char hiding (isSpace)
import Data.List (unfoldr)
import GHC.Generics
import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
import Data.Attoparsec.Text
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Lazy.Builder (Builder)
import qualified Data.Text.Lazy.Builder as B
import Prelude hiding (take, takeWhile)
type TagName = Text
type AttrName = Text
type AttrValue = Text
data Token
-- | An opening tag. Attribute ordering is arbitrary
= TagOpen !TagName ![Attr]
-- | A closing tag.
| TagClose !TagName
-- | A text between tags.
| ContentChar !Char
| ContentText !Text
-- | Contents of a comment.
| Comment !Builder
-- | Two tokens (sometimes useful for implementation reasons)
| Pair !Token !Token
-- | Doctype
| Doctype !Text
deriving (Show, Ord, Eq, Generic)
data Attr = Attr !AttrName !AttrValue
deriving (Show, Eq, Ord)
instance NFData Token where
rnf (Comment b) = rnf $ B.toLazyText b
rnf _ = ()
-- | Start in the data state.
token :: Parser Token
token = dataState
-- | 8.2.4.1: Data state
dataState :: Parser Token
dataState = do
content <- takeWhile (/= '<')
if not $ T.null content
then return $ ContentText content
else char '<' >> tagOpen
-- | 8.2.4.3: Tag open state
tagOpen :: Parser Token
tagOpen =
(char '!' >> markupDeclOpen)
<|> (char '/' >> endTagOpen)
<|> (char '?' >> bogusComment)
<|> tryStartTag
<|> other
where
tryStartTag = do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
tagName
other = do
return $ ContentChar '<'
-- | 8.2.4.9: End tag open state
-- TODO: This isn't right
endTagOpen :: Parser Token
endTagOpen = do
name <- takeWhile $ \c -> isAsciiUpper c || isAsciiLower c
char '>'
return $ TagClose name
-- | 8.2.4.10: Tag name state
--
-- deviation: no lower-casing
tagName :: Parser Token
tagName = do
tag <- takeWhile $ notInClass "\x09\x0a\x0c />"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag [])
<|> (char '/' >> selfClosingStartTag tag [])
<|> (char '>' >> return (TagOpen tag []))
-- | 8.2.4.43: Self-closing start tag state
selfClosingStartTag :: TagName -> [Attr] -> Parser Token
selfClosingStartTag tag attrs = do
(char '>' >> return (TagOpen tag attrs))
<|> beforeAttrName tag attrs
-- | 8.2.4.34: Before attribute name state
--
-- deviation: no lower-casing
beforeAttrName :: TagName -> [Attr] -> Parser Token
beforeAttrName tag attrs = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '>' >> return (TagOpen tag attrs))
-- <|> (char '\x00' >> attrName tag attrs) -- TODO: NULL
<|> attrName tag attrs
-- | 8.2.4.35: Attribute name state
attrName :: TagName -> [Attr] -> Parser Token
attrName tag attrs = do
name <- takeWhile $ notInClass "\x09\x0a\x0c /=>\x00"
id $ (satisfy (inClass "\x09\x0a\x0c ") >> afterAttrName tag attrs name)
<|> (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
-- <|> -- TODO: NULL
-- | 8.2.4.36: After attribute name state
afterAttrName :: TagName -> [Attr] -> AttrName -> Parser Token
afterAttrName tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '/' >> selfClosingStartTag tag attrs)
<|> (char '=' >> beforeAttrValue tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrName tag (Attr name T.empty : attrs) -- not exactly sure this is right
-- | 8.2.4.37: Before attribute value state
beforeAttrValue :: TagName -> [Attr] -> AttrName -> Parser Token
beforeAttrValue tag attrs name = do
skipWhile $ inClass "\x09\x0a\x0c "
id $ (char '"' >> attrValueDQuoted tag attrs name)
<|> (char '\'' >> attrValueSQuoted tag attrs name)
<|> (char '>' >> return (TagOpen tag (Attr name T.empty : attrs)))
<|> attrValueUnquoted tag attrs name
-- | 8.2.4.38: Attribute value (double-quoted) state
attrValueDQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueDQuoted tag attrs name = do
value <- takeTill (/= '"')
afterAttrValueQuoted tag attrs name value
-- | 8.2.4.39: Attribute value (single-quoted) state
attrValueSQuoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueSQuoted tag attrs name = do
value <- takeTill (/= '\'')
afterAttrValueQuoted tag attrs name value
-- | 8.2.4.40: Attribute value (unquoted) state
attrValueUnquoted :: TagName -> [Attr] -> AttrName -> Parser Token
attrValueUnquoted tag attrs name = do
value <- takeTill (inClass "\x09\x0a\x0c >")
id $ (satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs) -- unsure: don't emit?
<|> (char '>' >> return (TagOpen tag (Attr name value : attrs)))
-- | 8.2.4.42: After attribute value (quoted) state
afterAttrValueQuoted :: TagName -> [Attr] -> AttrName -> AttrValue -> Parser Token
afterAttrValueQuoted tag attrs name value =
(satisfy (inClass "\x09\x0a\x0c ") >> beforeAttrName tag attrs')
<|> (char '/' >> selfClosingStartTag tag attrs')
<|> (char '>' >> return (TagOpen name attrs'))
where attrs' = Attr name value : attrs
-- | 8.2.4.45: Markup declaration open state
markupDeclOpen :: Parser Token
markupDeclOpen =
try comment
<|> try docType
-- TODO: Fix the rest
where
comment = string "--" >> commentStart
docType = do
s <- take 7
guard $ T.toLower s == "doctype"
doctype
-- | 8.2.4.46: Comment start state
commentStart :: Parser Token
commentStart = do
(char '-' >> commentStartDash)
<|> (char '>' >> return (Comment mempty))
-- | 8.2.4.47: Comment start dash state
commentStartDash :: Parser Token
commentStartDash =
(char '-' >> commentEnd mempty)
<|> (char '>' >> return (Comment mempty))
<|> (do c <- anyChar
comment (B.singleton '-' <> B.singleton c) )
-- | 8.2.4.48: Comment state
comment :: Builder -> Parser Token
comment content0 = do
content <- B.fromText <$> takeWhile (notInClass "-")
id $ (char '-' >> commentEndDash (content0 <> content))
<|> (char '\x00' >> comment (content0 <> content <> B.singleton '\xfffd'))
-- | 8.2.4.49: Comment end dash state
commentEndDash :: Builder -> Parser Token
commentEndDash content = do
(char '-' >> commentEnd content)
<|> (char '\x00' >> comment (content <> "-\xfffd"))
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
-- | 8.2.4.50: Comment end state
commentEnd :: Builder -> Parser Token
commentEnd content = do
(char '>' >> return (Comment content))
<|> (char '\x00' >> comment (content <> "-\xfffd"))
-- <|> () TODO: other cases
<|> (anyChar >>= \c -> comment (content <> "-" <> B.singleton c))
-- | 8.2.4.52: DOCTYPE state
-- FIXME
doctype :: Parser Token
doctype = Doctype <$> takeTill (=='>')
-- | 8.2.4.44: Bogus comment state
bogusComment :: Parser Token
bogusComment = fail "Bogus comment"
-- | Produce a lazy list of tokens.
tagStream :: Text -> [Token]
tagStream = unfoldr f
where
f :: Text -> Maybe (Token, Text)
f t
| T.null t = Nothing
| otherwise =
case parse token t of
Done rest tok -> Just (tok, rest)
_ -> Nothing
name: html-parse
version: 0.1.0.0
-- synopsis:
-- description:
homepage: http://github.com/bgamari/html-parse
license: BSD3
license-file: LICENSE
author: Ben Gamari
maintainer: ben@smart-cactus.org
copyright: (c) 2016 Ben Gamari
category: Text
build-type: Simple
extra-source-files: ChangeLog.md
cabal-version: >=1.10
library
exposed-modules: Text.HTML.Parser
other-extensions: OverloadedStrings, DeriveGeneric
build-depends: base >=4.8 && <4.9
deepseq >=1.4 && <1.5
attoparsec >=0.13 && <0.14
text >=1.2 && <1.3
default-language: Haskell2010
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment