diff --git a/.xrefcheck.yaml b/.xrefcheck.yaml
index 748c1dbb..dc63105f 100644
--- a/.xrefcheck.yaml
+++ b/.xrefcheck.yaml
@@ -6,7 +6,7 @@ exclusions:
ignore:
- tests/markdowns/**/*
- tests/golden/**/*
- - docs/output-sample/**/*
+ - docs/output-sample/**/*.md
scanners:
markdown:
diff --git a/CHANGES.md b/CHANGES.md
index a4d037eb..9f408dac 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -41,6 +41,9 @@ Unreleased
* [#231](https://github.com/serokell/xrefcheck/pull/231)
+ Anchor analysis takes now into account the appropriate case-sensitivity depending on
the configured Markdown flavour.
+* [#259](https://github.com/serokell/xrefcheck/pull/259)
+ + Add support for image tags ``, anchor hyperlinks `Text`
+ and anchor target locations `` or ``.
0.2.2
==========
diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs
index bcd6d40f..55e624b1 100644
--- a/src/Xrefcheck/Scanners/Markdown.hs
+++ b/src/Xrefcheck/Scanners/Markdown.hs
@@ -22,6 +22,7 @@ import Universum
import CMarkGFM
(Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes)
import Control.Lens (_Just, makeLenses, makeLensesFor, (.=))
+import Control.Monad.Trans.RWS.CPS qualified as RWS
import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell)
import Data.Aeson (FromJSON (..), genericParseJSON)
import Data.ByteString.Lazy qualified as BSL
@@ -195,6 +196,12 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
(IMSLink _, IMAGE {}) -> do
ssIgnore .= Nothing
return defNode
+ (IMSLink _, HTML_INLINE text) | isLink text -> do
+ ssIgnore .= Nothing
+ pure defNode
+ (IMSLink _, HTML_BLOCK text) | isLink text -> do
+ ssIgnore .= Nothing
+ pure defNode
(IMSLink ignoreLinkState, _) -> do
when (ignoreLinkState == ExpectingLinkInSubnodes) $
ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink
@@ -264,6 +271,17 @@ removeIgnored = withIgnoreMode . cataNodeWithParentNodeInfo remove
pure node
(node, _) -> pure node
+findAttributes :: [Text] -> [Attribute Text] -> Maybe Text
+findAttributes (map T.toLower -> attrs) =
+ fmap snd . find ((`elem` attrs) . T.toLower . fst)
+
+isLink :: Text -> Bool
+isLink (parseTags -> tags) = case safeHead tags of
+ Just (TagOpen tag attrs) ->
+ T.toLower tag == "a" && isJust (findAttributes ["href"] attrs)
+ || T.toLower tag == "img" && isJust (findAttributes ["src"] attrs)
+ _ -> False
+
-- | Custom `foldMap` for source tree.
foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a
foldNode action node@(Node _ _ subs) = do
@@ -271,50 +289,41 @@ foldNode action node@(Node _ _ subs) = do
b <- concatForM subs (foldNode action)
return (a <> b)
-type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError 'Parse]) a
+type ExtractorM a = RWS.RWS MarkdownConfig [ScanError 'Parse] (Maybe Reference) a
-- | Extract information from source tree.
nodeExtractInfo :: Node -> ExtractorM FileInfo
nodeExtractInfo input@(Node _ _ nSubs) = do
if checkIgnoreAllFile nSubs
then return (diffToFileInfo mempty)
- else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored input))
+ else diffToFileInfo <$> (foldNode extractor =<< (RWS.writer . runWriter $ removeIgnored input))
where
extractor :: Node -> ExtractorM FileInfoDiff
- extractor node@(Node pos ty _) =
- case ty of
- HTML_BLOCK _ -> do
- return mempty
+ extractor node@(Node pos ty _) = do
+ reference' <- RWS.get
+ -- If current state is not `Nothing`, try extracting associated text
+ let fileInfoDiff = case (reference', ty) of
+ (Just ref, TEXT text) ->
+ mempty & fidReferences .~ DList.singleton ref {rName = text}
+ (Just ref, _) -> mempty & fidReferences .~ DList.singleton ref
+ _ -> mempty
+ RWS.put Nothing
+ fmap (fileInfoDiff <>) case ty of
+ HTML_BLOCK text | isLink text -> extractHtmlLink text
+
+ HTML_BLOCK text -> extractAnchor text
HEADING lvl -> do
- flavor <- asks mcFlavor
+ flavor <- RWS.asks mcFlavor
let aType = HeaderAnchor lvl
let aName = headerToAnchor flavor $ nodeExtractText node
let aPos = toPosition pos
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
- HTML_INLINE text -> do
- let
- mName = do
- tag <- safeHead $ parseTags text
- attributes <- case tag of
- TagOpen a attrs
- | T.toLower a == "a" -> Just attrs
- _ -> Nothing
- (_, name) <- find (\(field, _) -> T.toLower field `elem` ["name", "id"]) attributes
- pure name
-
- case mName of
- Just aName -> do
- let aType = HandAnchor
- aPos = toPosition pos
- return $ FileInfoDiff
- mempty
- (pure $ Anchor {aType, aName, aPos})
+ HTML_INLINE text | isLink text -> extractHtmlLink text
- Nothing -> do
- return mempty
+ HTML_INLINE text -> extractAnchor text
LINK url _ -> extractLink url
@@ -328,10 +337,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
rPos = toPosition pos
link = if null url then rName else url
- let (rLink, rAnchor) = case T.splitOn "#" link of
- [t] -> (t, Nothing)
- t : ts -> (t, Just $ T.intercalate "#" ts)
- [] -> error "impossible"
+ let (rLink, rAnchor) = splitLink link
let rInfo = referenceInfo rLink
@@ -339,6 +345,61 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor, rInfo})
DList.empty
+ extractAnchor :: Text -> ExtractorM FileInfoDiff
+ extractAnchor text = do
+ let mName = do
+ tag <- safeHead $ parseTags text
+ attributes <- case tag of
+ TagOpen a attrs | T.toLower a == "a" -> Just attrs
+ _ -> Nothing
+ findAttributes ["name", "id"] attributes
+
+ case mName of
+ Just aName -> do
+ let aType = HandAnchor
+ aPos = toPosition pos
+ return $ FileInfoDiff
+ mempty
+ (pure $ Anchor {aType, aName, aPos})
+
+ Nothing -> do
+ return mempty
+
+ extractHtmlReference :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
+ extractHtmlReference attrs tagPos = fromMaybe mempty do
+ link <- findAttributes ["href"] attrs
+ let (rLink, rAnchor) = splitLink link
+ pure . DList.singleton $ Reference "" rLink rAnchor (toPosition tagPos) (referenceInfo rLink)
+
+ splitLink :: Text -> (Text, Maybe Text)
+ splitLink link = case T.splitOn "#" link of
+ [t] -> (t, Nothing)
+ t : ts -> (t, Just $ T.intercalate "#" ts)
+ [] -> error "impossible"
+
+ extractHtmlImage :: [Attribute Text] -> Maybe PosInfo -> DList.DList Reference
+ extractHtmlImage attrs tagPos = fromMaybe mempty do
+ link <- findAttributes ["src"] attrs
+ pure . DList.singleton $ Reference "" link Nothing (toPosition tagPos) (referenceInfo link)
+
+ extractHtmlLink :: Text -> ExtractorM FileInfoDiff
+ extractHtmlLink text =
+ case safeHead $ parseTags text of
+ Just (TagOpen tag attrs) | T.toLower tag == "img" ->
+ pure $ mempty & fidReferences .~ extractHtmlImage attrs pos
+ Just (TagOpen tag attrs) | T.toLower tag == "a" -> do
+ let reference = extractHtmlReference attrs pos
+ case DList.toList reference of
+ [ref] -> do
+ -- The `cmark-gfm` package parses the link tag as three separate nodes:
+ -- `HTML_INLINE` with an opening tag, a `TEXT` with text in between,
+ -- and `HTML_INLINE` with a closing tag. So we keep the extracted link in a state and
+ -- try to get associated text in the next node.
+ RWS.put $ Just ref
+ pure mempty
+ _ -> pure mempty
+ _ -> pure mempty
+
-- | Check if there is `ignore all` at the beginning of the file,
-- ignoring preceding comments if there are any.
checkIgnoreAllFile :: [Node] -> Bool
@@ -406,11 +467,10 @@ textToMode _ = NotAnAnnotation
parseFileInfo :: MarkdownConfig -> LT.Text -> (FileInfo, [ScanError 'Parse])
parseFileInfo config input
- = runWriter
- $ flip runReaderT config
- $ nodeExtractInfo
+ = RWS.evalRWS
+ (nodeExtractInfo
$ commonmarkToNode [optFootnotes] [extAutolink]
- $ toStrict input
+ $ toStrict input) config Nothing
markdownScanner :: MarkdownConfig -> ScanAction
markdownScanner config canonicalFile =
diff --git a/tests/golden/check-html/check-html.md b/tests/golden/check-html/check-html.md
index 957aa5fc..8ab27fc9 100644
--- a/tests/golden/check-html/check-html.md
+++ b/tests/golden/check-html/check-html.md
@@ -6,8 +6,38 @@
## Title1
+
+
+text text
+
[One](#one)
[Two](#two)
[Three](#three)
[Four](#four)
[Five](#five)
+[Six](#six)
+[Seven](#seven)
+
+
+
+text text
+
+serokell
+
+text serokell text
+
+Six
+
+text Seven text
+
+
+serokell404
+
+
+text serokell404 text
+
+
+
+
+
+text text
diff --git a/tests/golden/check-html/check.html.bats b/tests/golden/check-html/check.html.bats
index 06ea97ca..5a0e91a0 100644
--- a/tests/golden/check-html/check.html.bats
+++ b/tests/golden/check-html/check.html.bats
@@ -11,7 +11,7 @@ load '../helpers'
@test "All HTML anchors should be valid" {
- run xrefcheck
+ to_temp xrefcheck -v
- assert_output --partial "All repository links are valid."
+ assert_diff expected.gold
}
diff --git a/tests/golden/check-html/expected.gold b/tests/golden/check-html/expected.gold
new file mode 100644
index 00000000..0d394cf6
--- /dev/null
+++ b/tests/golden/check-html/expected.gold
@@ -0,0 +1,67 @@
+=== Repository data ===
+
+ check-html.md:
+ - references:
+ - reference (file-local) at src:13:1-11:
+ - text: "One"
+ - link: -
+ - anchor: one
+ - reference (file-local) at src:14:1-11:
+ - text: "Two"
+ - link: -
+ - anchor: two
+ - reference (file-local) at src:15:1-15:
+ - text: "Three"
+ - link: -
+ - anchor: three
+ - reference (file-local) at src:16:1-13:
+ - text: "Four"
+ - link: -
+ - anchor: four
+ - reference (file-local) at src:17:1-13:
+ - text: "Five"
+ - link: -
+ - anchor: five
+ - reference (file-local) at src:18:1-11:
+ - text: "Six"
+ - link: -
+ - anchor: six
+ - reference (file-local) at src:19:1-15:
+ - text: "Seven"
+ - link: -
+ - anchor: seven
+ - reference (external) at src:21:1-144:
+ - text: ""
+ - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
+ - anchor: -
+ - reference (external) at src:23:6-149:
+ - text: ""
+ - link: https://user-images.githubusercontent.com/5394217/70820564-06b06e00-1dea-11ea-9680-27f661ca2a58.png
+ - anchor: -
+ - reference (external) at src:25:1-29:
+ - text: "serokell"
+ - link: https://serokell.io/
+ - anchor: -
+ - reference (external) at src:27:6-34:
+ - text: "serokell"
+ - link: https://serokell.io/
+ - anchor: -
+ - reference (file-local) at src:29:1-13:
+ - text: "Six"
+ - link: -
+ - anchor: six
+ - reference (file-local) at src:31:6-20:
+ - text: "Seven"
+ - link: -
+ - anchor: seven
+ - anchors:
+ - title1 (header II) at src:7:1-96
+ - one (hand made) at src:7:4-17
+ - two (hand made) at src:7:19-30
+ - three (hand made) at src:7:32-47
+ - four (hand made) at src:7:49-63
+ - five (hand made) at src:7:69-88
+ - six (hand made) at src:9:1-12
+ - seven (hand made) at src:11:6-17
+
+All repository links are valid.