diff --git a/.gitignore b/.gitignore index b8ae847..e2dc727 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ cabal-dev .stack-work/ cabal.sandbox.config cabal.config +*.swp +*.swo +*.DS_Store diff --git a/.travis/stack-lts-2.yaml b/.travis/stack-lts-2.yaml index f031ed0..1bea951 100644 --- a/.travis/stack-lts-2.yaml +++ b/.travis/stack-lts-2.yaml @@ -8,4 +8,5 @@ extra-deps: - http-client-0.4.30 - http-client-tls-0.2.4 - pointedlist-0.6.1 +- tagsoup-0.14.8@sha256:56b2023d2e9fdbff093719ce9af5285d2436b234c6c684d6a69f14595a8348ae resolver: lts-2.0 diff --git a/.travis/stack-lts-6.yaml b/.travis/stack-lts-6.yaml index c6e581b..fd55c4f 100644 --- a/.travis/stack-lts-6.yaml +++ b/.travis/stack-lts-6.yaml @@ -6,4 +6,5 @@ packages: extra-deps: - fail-4.9.0.0 - http-client-0.4.30 +- tagsoup-0.14.8@sha256:56b2023d2e9fdbff093719ce9af5285d2436b234c6c684d6a69f14595a8348ae resolver: lts-6.0 diff --git a/.travis/stack-lts-7.yaml b/.travis/stack-lts-7.yaml index cbba31d..6738aa0 100644 --- a/.travis/stack-lts-7.yaml +++ b/.travis/stack-lts-7.yaml @@ -5,4 +5,5 @@ packages: - ../examples/ extra-deps: - fail-4.9.0.0 +- tagsoup-0.14.8@sha256:56b2023d2e9fdbff093719ce9af5285d2436b234c6c684d6a69f14595a8348ae resolver: lts-7.4 diff --git a/examples/complex-predicates/Main.hs b/examples/complex-predicates/Main.hs index 081b0c7..fd1a330 100644 --- a/examples/complex-predicates/Main.hs +++ b/examples/complex-predicates/Main.hs @@ -1,10 +1,13 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExtendedDefaultRules #-} import Text.HTML.Scalpel import Control.Applicative import Control.Monad import Data.List (isInfixOf) +default (String) + exampleHtml :: String exampleHtml = "\ @@ -37,7 +40,7 @@ catComment :: Scraper String String catComment = -- 1. First narrow the current context to the div containing the comment's -- textual content. - chroot ("div" @: [hasClass "comment", hasClass "text"]) $ do + chroot (("div" :: TagName String) @: [hasClass "comment", hasClass "text"]) $ do -- 2. Any can be used to access the root tag of the current context. contents <- text anySelector -- 3. Skip comment divs that do not contain "cat". diff --git a/examples/example-from-documentation/Main.hs b/examples/example-from-documentation/Main.hs index 57067d6..9589bda 100644 --- a/examples/example-from-documentation/Main.hs +++ b/examples/example-from-documentation/Main.hs @@ -1,8 +1,11 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExtendedDefaultRules #-} import Text.HTML.Scalpel import Control.Applicative +default (String) + exampleHtml :: String exampleHtml = "\ @@ -35,19 +38,19 @@ main :: IO () main = print $ scrapeStringLike exampleHtml comments where comments :: Scraper String [Comment] - comments = chroots ("div" @: [hasClass "container"]) comment + comments = chroots (("div" :: TagName String) @: [hasClass "container"]) comment comment :: Scraper String Comment comment = textComment <|> imageComment textComment :: Scraper String Comment textComment = do - author <- text $ "span" @: [hasClass "author"] - commentText <- text $ "div" @: [hasClass "text"] + author <- text $ ("span" :: TagName String) @: [hasClass "author"] + commentText <- text $ ("div" :: TagName String) @: [hasClass "text"] return $ TextComment author commentText imageComment :: Scraper String Comment imageComment = do - author <- text $ "span" @: [hasClass "author"] - imageURL <- attr "src" $ "img" @: [hasClass "image"] + author <- text $ ("span" :: TagName String) @: [hasClass "author"] + imageURL <- attr "src" $ ("img" :: TagName String) @: [hasClass "image"] return $ ImageComment author imageURL diff --git a/examples/html-to-markdown/Main.hs b/examples/html-to-markdown/Main.hs index 8d9145c..5aeace0 100644 --- a/examples/html-to-markdown/Main.hs +++ b/examples/html-to-markdown/Main.hs @@ -2,6 +2,7 @@ -- the rough markdown equivalent using serial scrapers. {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} import Control.Applicative import System.Environment import Text.HTML.Scalpel diff --git a/scalpel-core/benchmarks/Main.hs b/scalpel-core/benchmarks/Main.hs index 4c4faa0..b4fe01b 100644 --- a/scalpel-core/benchmarks/Main.hs +++ b/scalpel-core/benchmarks/Main.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExtendedDefaultRules #-} import Text.HTML.Scalpel.Core @@ -9,6 +10,8 @@ import Data.Foldable (foldr') import qualified Data.Text as T import qualified Text.HTML.TagSoup as TagSoup +default (String, Int) + main :: IO () main = do diff --git a/scalpel-core/scalpel-core.cabal b/scalpel-core/scalpel-core.cabal index d1670c5..d369b6f 100644 --- a/scalpel-core/scalpel-core.cabal +++ b/scalpel-core/scalpel-core.cabal @@ -50,7 +50,7 @@ library , pointedlist , regex-base , regex-tdfa - , tagsoup >= 0.12.2 + , tagsoup >= 0.14.8 , text , vector default-extensions: diff --git a/scalpel-core/src/Text/HTML/Scalpel/Core.hs b/scalpel-core/src/Text/HTML/Scalpel/Core.hs index eb7f2cd..8cbaef3 100644 --- a/scalpel-core/src/Text/HTML/Scalpel/Core.hs +++ b/scalpel-core/src/Text/HTML/Scalpel/Core.hs @@ -61,10 +61,14 @@ module Text.HTML.Scalpel.Core ( , seekBack , untilNext , untilBack + +-- * Select +, select ) where import Text.HTML.Scalpel.Internal.Scrape import Text.HTML.Scalpel.Internal.Scrape.StringLike import Text.HTML.Scalpel.Internal.Select.Combinators +import Text.HTML.Scalpel.Internal.Select import Text.HTML.Scalpel.Internal.Select.Types import Text.HTML.Scalpel.Internal.Serial diff --git a/scalpel-core/src/Text/HTML/Scalpel/Internal/Scrape.hs b/scalpel-core/src/Text/HTML/Scalpel/Internal/Scrape.hs index c70297a..5a55e46 100644 --- a/scalpel-core/src/Text/HTML/Scalpel/Internal/Scrape.hs +++ b/scalpel-core/src/Text/HTML/Scalpel/Internal/Scrape.hs @@ -22,6 +22,7 @@ import Text.HTML.Scalpel.Internal.Select.Types import Control.Applicative import Control.Monad import Data.Maybe +import Text.StringLike (StringLike, castString) import qualified Control.Monad.Fail as Fail import qualified Data.Vector as Vector @@ -67,7 +68,7 @@ instance Fail.MonadFail (Scraper str) where -- | The 'scrape' function executes a 'Scraper' on a list of -- 'TagSoup.Tag's and produces an optional value. -scrape :: (TagSoup.StringLike str) +scrape :: (StringLike str) => Scraper str a -> [TagSoup.Tag str] -> Maybe a scrape s = scrapeTagSpec s . tagsToSpec . TagSoup.canonicalizeTags @@ -77,7 +78,7 @@ scrape s = scrapeTagSpec s . tagsToSpec . TagSoup.canonicalizeTags -- -- This function will match only the first set of tags matching the selector, to -- match every set of tags, use 'chroots'. -chroot :: (TagSoup.StringLike str) +chroot :: (StringLike str) => Selector -> Scraper str a -> Scraper str a chroot selector inner = do maybeResult <- listToMaybe <$> chroots selector inner @@ -91,14 +92,14 @@ chroot selector inner = do -- -- > s = "
A
" -- > scrapeStringLike s (chroots "div" (pure 0)) == Just [0, 0] -chroots :: (TagSoup.StringLike str) +chroots :: (StringLike str) => Selector -> Scraper str a -> Scraper str [a] chroots selector (MkScraper inner) = MkScraper $ return . mapMaybe inner . select selector -- | The 'matches' function takes a selector and returns `()` if the selector -- matches any node in the DOM. -matches :: (TagSoup.StringLike str) => Selector -> Scraper str () +matches :: (StringLike str) => Selector -> Scraper str () matches s = MkScraper $ withHead (pure ()) . select s -- | The 'text' function takes a selector and returns the inner text from the @@ -106,7 +107,7 @@ matches s = MkScraper $ withHead (pure ()) . select s -- -- This function will match only the first set of tags matching the selector, to -- match every set of tags, use 'texts'. -text :: (TagSoup.StringLike str) => Selector -> Scraper str str +text :: (StringLike str) => Selector -> Scraper str str text s = MkScraper $ withHead tagsToText . select s -- | The 'texts' function takes a selector and returns the inner text from every @@ -114,7 +115,7 @@ text s = MkScraper $ withHead tagsToText . select s -- -- > s = "
Hello
World
" -- > scrapeStringLike s (texts "div") == Just ["Hello World", "World"] -texts :: (TagSoup.StringLike str) +texts :: (StringLike str) => Selector -> Scraper str [str] texts s = MkScraper $ withAll tagsToText . select s @@ -123,7 +124,7 @@ texts s = MkScraper $ withAll tagsToText . select s -- -- This function will match only the first set of tags matching the selector, to -- match every set of tags, use 'htmls'. -html :: (TagSoup.StringLike str) => Selector -> Scraper str str +html :: (StringLike str) => Selector -> Scraper str str html s = MkScraper $ withHead tagsToHTML . select s -- | The 'htmls' function takes a selector and returns the html string from @@ -131,7 +132,7 @@ html s = MkScraper $ withHead tagsToHTML . select s -- -- > s = "
A
" -- > scrapeStringLike s (htmls "div") == Just ["
A
", "
A
"] -htmls :: (TagSoup.StringLike str) +htmls :: (StringLike str) => Selector -> Scraper str [str] htmls s = MkScraper $ withAll tagsToHTML . select s @@ -141,7 +142,7 @@ htmls s = MkScraper $ withAll tagsToHTML . select s -- -- This function will match only the first set of tags matching the selector, to -- match every set of tags, use 'innerHTMLs'. -innerHTML :: (TagSoup.StringLike str) +innerHTML :: (StringLike str) => Selector -> Scraper str str innerHTML s = MkScraper $ withHead tagsToInnerHTML . select s @@ -150,7 +151,7 @@ innerHTML s = MkScraper $ withHead tagsToInnerHTML . select s -- -- > s = "
A
" -- > scrapeStringLike s (innerHTMLs "div") == Just ["
A
", "A"] -innerHTMLs :: (TagSoup.StringLike str) +innerHTMLs :: (StringLike str) => Selector -> Scraper str [str] innerHTMLs s = MkScraper $ withAll tagsToInnerHTML . select s @@ -160,10 +161,10 @@ innerHTMLs s = MkScraper $ withAll tagsToInnerHTML . select s -- -- This function will match only the opening tag matching the selector, to match -- every tag, use 'attrs'. -attr :: (Show str, TagSoup.StringLike str) - => String -> Selector -> Scraper str str +attr :: (Show str, StringLike str) + => str -> Selector -> Scraper str str attr name s = MkScraper - $ join . withHead (tagsToAttr $ TagSoup.castString name) . select s + $ join . withHead (tagsToAttr $ castString name) . select s -- | The 'attrs' function takes an attribute name and a selector and returns the -- value of the attribute of the given name for every opening tag @@ -171,11 +172,11 @@ attr name s = MkScraper -- -- > s = "
" -- > scrapeStringLike s (attrs "id" "div") == Just ["out", "in"] -attrs :: (Show str, TagSoup.StringLike str) - => String -> Selector -> Scraper str [str] +attrs :: (Show str, StringLike str) + => str -> Selector -> Scraper str [str] attrs name s = MkScraper $ fmap catMaybes . withAll (tagsToAttr nameStr) . select s - where nameStr = TagSoup.castString name + where nameStr = castString name -- | The 'position' function is intended to be used within the do-block of a -- `chroots` call. Within the do-block position will return the index of the @@ -211,7 +212,7 @@ attrs name s = MkScraper -- , (2, "Third paragraph.") -- ] -- @ -position :: (TagSoup.StringLike str) => Scraper str Int +position :: (StringLike str) => Scraper str Int position = MkScraper $ Just . tagsToPosition withHead :: (a -> b) -> [a] -> Maybe b @@ -221,27 +222,27 @@ withHead f (x:_) = Just $ f x withAll :: (a -> b) -> [a] -> Maybe [b] withAll f xs = Just $ map f xs -foldSpec :: TagSoup.StringLike str +foldSpec :: StringLike str => (TagSoup.Tag str -> str -> str) -> TagSpec str -> str foldSpec f = Vector.foldr' (f . infoTag) TagSoup.empty . (\(a, _, _) -> a) -tagsToText :: TagSoup.StringLike str => TagSpec str -> str +tagsToText :: StringLike str => TagSpec str -> str tagsToText = foldSpec f where f (TagSoup.TagText str) s = str `TagSoup.append` s f _ s = s -tagsToHTML :: TagSoup.StringLike str => TagSpec str -> str +tagsToHTML :: StringLike str => TagSpec str -> str tagsToHTML = foldSpec (\tag s -> TagSoup.renderTags [tag] `TagSoup.append` s) -tagsToInnerHTML :: TagSoup.StringLike str => TagSpec str -> str +tagsToInnerHTML :: StringLike str => TagSpec str -> str tagsToInnerHTML (tags, tree, ctx) | len < 2 = TagSoup.empty | otherwise = tagsToHTML (Vector.slice 1 (len - 2) tags, tree, ctx) where len = Vector.length tags -tagsToAttr :: (Show str, TagSoup.StringLike str) +tagsToAttr :: (Show str, StringLike str) => str -> TagSpec str -> Maybe str tagsToAttr attr (tags, _, _) = do guard $ 0 < Vector.length tags diff --git a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select.hs b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select.hs index 665a62d..442fbe7 100644 --- a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select.hs +++ b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select.hs @@ -15,13 +15,13 @@ import Text.HTML.Scalpel.Internal.Select.Types import Control.Applicative ((<$>), (<|>)) import Data.Maybe (catMaybes, isJust, fromJust, fromMaybe) +import Text.StringLike (StringLike, castString) import qualified Data.Map.Strict as Map import qualified Data.Text as T import qualified Data.Tree as Tree import qualified Data.Vector as Vector import qualified Text.HTML.TagSoup as TagSoup -import qualified Text.StringLike as TagSoup type Index = Int @@ -73,7 +73,7 @@ type TagSpec str = (TagVector str, TagForest, SelectContext) -- | The 'select' function takes a 'Selectable' value and a list of -- 'TagSoup.Tag's and returns a list of every subsequence of the given list of -- Tags that matches the given selector. -select :: (TagSoup.StringLike str) +select :: (StringLike str) => Selector -> TagSpec str -> [TagSpec str] select s tagSpec = newSpecs where @@ -83,7 +83,7 @@ select s tagSpec = newSpecs applyPosition p (tags, f, _) = (tags, f, SelectContext p True) -- | Creates a TagSpec from a list of tags parsed by TagSoup. -tagsToSpec :: forall str. (TagSoup.StringLike str) +tagsToSpec :: forall str. (StringLike str) => [TagSoup.Tag str] -> TagSpec str tagsToSpec tags = (vector, tree, ctx) where @@ -113,7 +113,7 @@ tagsToSpec tags = (vector, tree, ctx) -- closing offset. -- -- (5) The result set is then sorted by their indices. -tagsToVector :: forall str. (TagSoup.StringLike str) +tagsToVector :: forall str. (StringLike str) => [TagSoup.Tag str] -> TagVector str tagsToVector tags = let indexed = zip tags [0..] total = length indexed @@ -164,9 +164,9 @@ tagsToVector tags = let indexed = zip tags [0..] popTag (Just (_ : y : xs)) = let s = y : xs in s `seq` Just s popTag _ = Nothing -getTagName :: TagSoup.StringLike str => TagSoup.Tag str -> T.Text -getTagName (TagSoup.TagOpen name _) = TagSoup.castString name -getTagName (TagSoup.TagClose name) = TagSoup.castString name +getTagName :: StringLike str => TagSoup.Tag str -> T.Text +getTagName (TagSoup.TagOpen name _) = castString name +getTagName (TagSoup.TagClose name) = castString name getTagName _ = undefined -- | Builds a forest describing the structure of the tags within a given vector. @@ -174,7 +174,7 @@ getTagName _ = undefined -- vector of an open and close pair. The tree is organized such for any node n -- the parent of node n is the smallest span that completely encapsulates the -- span of node n. -vectorToTree :: TagSoup.StringLike str => TagVector str -> TagForest +vectorToTree :: StringLike str => TagVector str -> TagForest vectorToTree tags = fixup $ forestWithin 0 (Vector.length tags) where forestWithin :: Int -> Int -> TagForest @@ -217,7 +217,7 @@ vectorToTree tags = fixup $ forestWithin 0 (Vector.length tags) -- tree the SelectNode is popped and the current node's sub-forest is traversed -- with the remaining SelectNodes. If there is only a single SelectNode then any -- node encountered that satisfies the SelectNode is returned as an answer. -selectNodes :: TagSoup.StringLike str +selectNodes :: StringLike str => [(SelectNode, SelectSettings)] -> TagSpec str -> TagSpec str @@ -305,7 +305,7 @@ boolMatch True = MatchOk boolMatch False = MatchFail -- | Returns True if a tag satisfies a given SelectNode's condition. -nodeMatches :: TagSoup.StringLike str +nodeMatches :: StringLike str => (SelectNode, SelectSettings) -> TagInfo str -> TagSpec str @@ -322,7 +322,7 @@ nodeMatches (SelectText , settings) info cur root = -- | Given a SelectSettings, the current node under consideration, and the last -- matched node, returns true IFF the current node satisfies all of the -- selection settings. -checkSettings :: TagSoup.StringLike str +checkSettings :: StringLike str => SelectSettings -> TagSpec str -> TagSpec str -> MatchResult checkSettings (SelectSettings (Just depth)) (_, curRoot : _, _) @@ -341,7 +341,7 @@ checkSettings (SelectSettings _) _ _ = MatchOk -- | Given a tag name and a list of attribute predicates return a function that -- returns true if a given tag matches the supplied name and predicates. -checkTag :: TagSoup.StringLike str +checkTag :: StringLike str => T.Text -> [AttributePredicate] -> TagInfo str -> MatchResult checkTag name preds (TagInfo tag tagName _) = boolMatch ( @@ -351,7 +351,7 @@ checkTag name preds (TagInfo tag tagName _) ) `andMatch` checkPreds preds tag -- | Returns True if a tag satisfies a list of attribute predicates. -checkPreds :: TagSoup.StringLike str +checkPreds :: StringLike str => [AttributePredicate] -> TagSoup.Tag str -> MatchResult checkPreds [] tag = boolMatch $ TagSoup.isTagOpen tag || TagSoup.isTagText tag diff --git a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Combinators.hs b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Combinators.hs index 67fa014..697ee75 100644 --- a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Combinators.hs +++ b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Combinators.hs @@ -14,15 +14,15 @@ module Text.HTML.Scalpel.Internal.Select.Combinators ( ) where import Text.HTML.Scalpel.Internal.Select.Types +import Text.StringLike (StringLike, castString, toString) import qualified Data.Text as T import qualified Text.Regex.Base.RegexLike as RE -import qualified Text.StringLike as TagSoup -- | The '@:' operator creates a 'Selector' by combining a 'TagName' with a list -- of 'AttributePredicate's. -(@:) :: TagName -> [AttributePredicate] -> Selector +(@:) :: StringLike str => TagName str -> [AttributePredicate] -> Selector (@:) tag attrs = MkSelector [(toSelectNode tag attrs, defaultSelectSettings)] infixl 9 @: @@ -31,20 +31,20 @@ infixl 9 @: -- -- If you are attempting to match a specific class of a tag with potentially -- multiple classes, you should use the 'hasClass' utility function. -(@=) :: AttributeName -> String -> AttributePredicate +(@=) :: StringLike str => AttributeName str -> str -> AttributePredicate (@=) key value = anyAttrPredicate $ \(attrKey, attrValue) -> - matchKey key attrKey - && TagSoup.fromString value == attrValue + matchKey key (castString attrKey) + && value == (castString attrValue) infixl 6 @= -- | The '@=~' operator creates an 'AttributePredicate' that will match -- attributes with the given name and whose value matches the given regular -- expression. -(@=~) :: RE.RegexLike re String - => AttributeName -> re -> AttributePredicate +(@=~) :: (RE.RegexLike re String, StringLike str) + => AttributeName str -> re -> AttributePredicate (@=~) key re = anyAttrPredicate $ \(attrKey, attrValue) -> - matchKey key attrKey - && RE.matchTest re (TagSoup.toString attrValue) + matchKey key (castString attrKey) + && RE.matchTest re (toString attrValue) infixl 6 @=~ -- | The 'atDepth' operator constrains a 'Selector' to only match when it is at @@ -73,14 +73,14 @@ infixl 5 // -- | The classes of a tag are defined in HTML as a space separated list given by -- the @class@ attribute. The 'hasClass' function will match a @class@ attribute -- if the given class appears anywhere in the space separated list of classes. -hasClass :: String -> AttributePredicate +hasClass :: StringLike str => str -> AttributePredicate hasClass clazz = anyAttrPredicate hasClass' where hasClass' (attrName, classes) - | "class" == TagSoup.toString attrName = textClass `elem` classList + | "class" == toString attrName = textClass `elem` classList | otherwise = False - where textClass = TagSoup.castString clazz - textClasses = TagSoup.castString classes + where textClass = castString clazz + textClasses = castString classes classList = T.split (== ' ') textClasses -- | Negates an 'AttributePredicate'. @@ -91,6 +91,6 @@ notP (MkAttributePredicate p) = MkAttributePredicate $ not . p -- 'AttributePredicate's. The argument is a function that takes the attribute -- key followed by the attribute value and returns a boolean indicating if the -- attribute satisfies the predicate. -match :: (String -> String -> Bool) -> AttributePredicate +match :: StringLike str => (str -> str -> Bool) -> AttributePredicate match f = anyAttrPredicate $ \(attrKey, attrValue) -> - f (TagSoup.toString attrKey) (TagSoup.toString attrValue) + f (castString attrKey) (castString attrValue) diff --git a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Types.hs b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Types.hs index f1211ac..716a270 100644 --- a/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Types.hs +++ b/scalpel-core/src/Text/HTML/Scalpel/Internal/Select/Types.hs @@ -22,37 +22,37 @@ module Text.HTML.Scalpel.Internal.Select.Types ( import Data.Char (toLower) import Data.String (IsString, fromString) +import Text.StringLike (StringLike, strMap, castString) import qualified Text.HTML.TagSoup as TagSoup -import qualified Text.StringLike as TagSoup import qualified Data.Text as T -- | The 'AttributeName' type can be used when creating 'Selector's to specify -- the name of an attribute of a tag. -data AttributeName = AnyAttribute | AttributeString String +data StringLike str => AttributeName str = AnyAttribute | AttributeString str -matchKey :: TagSoup.StringLike str => AttributeName -> str -> Bool -matchKey (AttributeString s) = ((TagSoup.fromString $ map toLower s) ==) +matchKey :: StringLike str => AttributeName str -> str -> Bool +matchKey (AttributeString s) = ((strMap toLower s) ==) matchKey AnyAttribute = const True -instance IsString AttributeName where - fromString = AttributeString +instance StringLike str => IsString (AttributeName str) where + fromString = AttributeString . castString -- | An 'AttributePredicate' is a method that takes a 'TagSoup.Attribute' and -- returns a 'Bool' indicating if the given attribute matches a predicate. data AttributePredicate = MkAttributePredicate - (forall str. TagSoup.StringLike str => [TagSoup.Attribute str] + (forall str. StringLike str => [TagSoup.Attribute str] -> Bool) -checkPred :: TagSoup.StringLike str +checkPred :: StringLike str => AttributePredicate -> [TagSoup.Attribute str] -> Bool checkPred (MkAttributePredicate p) = p -- | Creates an 'AttributePredicate' from a predicate function of a single -- attribute that matches if any one of the attributes matches the predicate. -anyAttrPredicate :: (forall str. TagSoup.StringLike str => (str, str) -> Bool) +anyAttrPredicate :: (forall str. StringLike str => (str, str) -> Bool) -> AttributePredicate anyAttrPredicate p = MkAttributePredicate $ any p @@ -76,7 +76,7 @@ defaultSelectSettings = SelectSettings { selectSettingsDepth = Nothing } -tagSelector :: String -> Selector +tagSelector :: StringLike str => str -> Selector tagSelector tag = MkSelector [ (toSelectNode (TagString tag) [], defaultSelectSettings) ] @@ -98,11 +98,11 @@ data SelectNode = SelectNode !T.Text [AttributePredicate] -- | The 'TagName' type is used when creating a 'Selector' to specify the name -- of a tag. -data TagName = AnyTag | TagString String +data StringLike str => TagName str = AnyTag | TagString str -instance IsString TagName where - fromString = TagString +instance StringLike str => IsString (TagName str) where + fromString = TagString . castString -toSelectNode :: TagName -> [AttributePredicate] -> SelectNode +toSelectNode :: StringLike str => TagName str -> [AttributePredicate] -> SelectNode toSelectNode AnyTag = SelectAny -toSelectNode (TagString str) = SelectNode . TagSoup.fromString $ map toLower str +toSelectNode (TagString str) = SelectNode $ T.map toLower $ castString str diff --git a/scalpel-core/tests/TestMain.hs b/scalpel-core/tests/TestMain.hs index 539e7eb..914c2bc 100644 --- a/scalpel-core/tests/TestMain.hs +++ b/scalpel-core/tests/TestMain.hs @@ -1,9 +1,11 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExtendedDefaultRules #-} module Main (main) where -import Text.HTML.Scalpel.Core +import Text.HTML.Scalpel.Core hiding ((@:), (@=~)) +import qualified Text.HTML.Scalpel.Core import Control.Applicative import Control.Monad (guard) @@ -14,6 +16,8 @@ import Test.HUnit (Test(..), (@=?), (~:), runTestTT, failures) import qualified Text.HTML.TagSoup as TagSoup import qualified Text.Regex.TDFA +default (String, Int) + main :: IO () main = do n <- runTestTT (TestList [scrapeTests]) @@ -26,6 +30,12 @@ exit n = exitWith $ ExitFailure n re :: String -> Text.Regex.TDFA.Regex re = Text.Regex.TDFA.makeRegex +(@:) :: TagName String -> [AttributePredicate] -> Selector +(@:) = (Text.HTML.Scalpel.Core.@:) + +(@=~) :: AttributeName String -> Text.Regex.TDFA.Regex -> AttributePredicate +(@=~) = (Text.HTML.Scalpel.Core.@=~) + scrapeTests = "scrapeTests" ~: TestList [ scrapeTest "htmls should extract matching tag" diff --git a/scalpel/scalpel.cabal b/scalpel/scalpel.cabal index 6e8779d..e57e53e 100644 --- a/scalpel/scalpel.cabal +++ b/scalpel/scalpel.cabal @@ -44,7 +44,7 @@ library , data-default , http-client >= 0.4.30 , http-client-tls >= 0.2.4 - , tagsoup >= 0.12.2 + , tagsoup >= 0.14.8 , text default-extensions: ParallelListComp diff --git a/scalpel/src/Text/HTML/Scalpel/Internal/Scrape/URL.hs b/scalpel/src/Text/HTML/Scalpel/Internal/Scrape/URL.hs index 16f2723..73a3418 100644 --- a/scalpel/src/Text/HTML/Scalpel/Internal/Scrape/URL.hs +++ b/scalpel/src/Text/HTML/Scalpel/Internal/Scrape/URL.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} {-# OPTIONS_HADDOCK hide #-} module Text.HTML.Scalpel.Internal.Scrape.URL ( URL @@ -19,6 +20,7 @@ import Control.Applicative ((<$>)) import Data.CaseInsensitive () import Data.Default (def) import Data.Maybe (fromMaybe, listToMaybe) +import Text.StringLike (StringLike, castString) import qualified Data.ByteString.Lazy as LBS import qualified Data.Default as Default @@ -43,7 +45,7 @@ data Config str = Config { , manager :: Maybe HTTP.Manager } -instance TagSoup.StringLike str => Default.Default (Config str) where +instance StringLike str => Default.Default (Config str) where def = Config { decoder = defaultDecoder , manager = Nothing @@ -55,28 +57,28 @@ instance TagSoup.StringLike str => Default.Default (Config str) where -- The default behavior is to use the global manager provided by -- http-client-tls (via 'HTTP.getGlobalManager'). Any exceptions thrown by -- http-client are not caught and are bubbled up to the caller. -scrapeURL :: (TagSoup.StringLike str) - => URL -> Scraper str a -> IO (Maybe a) +scrapeURL :: (StringLike str, StringLike url) + => url -> Scraper str a -> IO (Maybe a) scrapeURL = scrapeURLWithConfig def -- | The 'scrapeURLWithConfig' function takes a 'Config' record type and -- downloads the contents of the given URL and executes a 'Scraper' on it. -scrapeURLWithConfig :: (TagSoup.StringLike str) - => Config str -> URL -> Scraper str a -> IO (Maybe a) +scrapeURLWithConfig :: (StringLike str, StringLike url) + => Config str -> url -> Scraper str a -> IO (Maybe a) scrapeURLWithConfig config url scraper = do manager <- fromMaybe HTTP.getGlobalManager (return <$> manager config) tags <- downloadAsTags (decoder config) manager url return (scrape scraper tags) where downloadAsTags decoder manager url = do - request <- HTTP.parseRequest url + request <- HTTP.parseRequest $ castString url response <- HTTP.httpLbs request manager return $ TagSoup.parseTags $ decoder response -- | The default response decoder. This decoder attempts to infer the character -- set of the HTTP response body from the `Content-Type` header. If this header -- is not present, then the character set is assumed to be `ISO-8859-1`. -defaultDecoder :: TagSoup.StringLike str => Decoder str +defaultDecoder :: StringLike str => Decoder str defaultDecoder response = TagSoup.castString $ choosenDecoder body where @@ -95,9 +97,9 @@ defaultDecoder response = TagSoup.castString | otherwise = Text.decodeLatin1 . LBS.toStrict -- | A decoder that will always decode using `UTF-8`. -utf8Decoder :: TagSoup.StringLike str => Decoder str -utf8Decoder = TagSoup.castString . Text.decodeUtf8 . LBS.toStrict . HTTP.responseBody +utf8Decoder :: StringLike str => Decoder str +utf8Decoder = castString . Text.decodeUtf8 . LBS.toStrict . HTTP.responseBody -- | A decoder that will always decode using `ISO-8859-1`. -iso88591Decoder :: TagSoup.StringLike str => Decoder str -iso88591Decoder = TagSoup.castString . Text.decodeLatin1 . LBS.toStrict . HTTP.responseBody +iso88591Decoder :: StringLike str => Decoder str +iso88591Decoder = castString . Text.decodeLatin1 . LBS.toStrict . HTTP.responseBody diff --git a/stack.yaml b/stack.yaml index 4415597..a098f9c 100644 --- a/stack.yaml +++ b/stack.yaml @@ -3,4 +3,5 @@ packages: - scalpel/ - scalpel-core/ - examples/ -resolver: lts-13.7 +resolver: lts-13.21 +extra-deps: []