From 8e6369377c9ae2453318b91b0b491b3b0e0e800e Mon Sep 17 00:00:00 2001 From: Hugo McNally <45573837+HU90m@users.noreply.github.com> Date: Mon, 31 Jul 2023 15:04:00 +0100 Subject: [PATCH] Introduce fragment checking for links to markdown files. (#1126) - Implemented enhancements to include fragments in file links - Checked links to markdown files with fragments, generating unique kebab case and heading attributes. - Made code more idiomatic and added an integration test. - Updated documentation. - Fixed issues with heading attributes fragments and ensured proper handling of file errors. --- README.md | 3 + fixtures/TEST.md | 2 +- fixtures/fragments/empty_file | 0 fixtures/fragments/file1.md | 42 +++++++++ fixtures/fragments/file2.md | 7 ++ lychee-bin/src/client.rs | 1 + lychee-bin/src/options.rs | 6 ++ lychee-bin/tests/cli.rs | 38 +++++++-- lychee-lib/src/client.rs | 45 ++++++++-- lychee-lib/src/extract/markdown.rs | 104 ++++++++++++++++++++++- lychee-lib/src/extract/mod.rs | 2 +- lychee-lib/src/types/error.rs | 18 +++- lychee-lib/src/types/input.rs | 7 +- lychee-lib/src/utils/fragment_checker.rs | 75 ++++++++++++++++ lychee-lib/src/utils/mod.rs | 1 + lychee-lib/src/utils/path.rs | 4 +- lychee-lib/src/utils/request.rs | 23 +++-- lychee-lib/src/utils/url.rs | 56 ++++++------ 18 files changed, 374 insertions(+), 60 deletions(-) create mode 100644 fixtures/fragments/empty_file create mode 100644 fixtures/fragments/file1.md create mode 100644 fixtures/fragments/file2.md create mode 100644 lychee-lib/src/utils/fragment_checker.rs diff --git a/README.md b/README.md index 32bc32c795..ce7e3a7e6e 100644 --- a/README.md +++ b/README.md @@ -386,6 +386,9 @@ Options: -a, --accept Comma-separated list of accepted status codes for valid links + --include-fragments + Enable the checking of fragments in links + -t, --timeout Website timeout in seconds from connect to response finished diff --git a/fixtures/TEST.md b/fixtures/TEST.md index e6e44bf713..15efe7cd01 100644 --- 
a/fixtures/TEST.md +++ b/fixtures/TEST.md @@ -1,7 +1,7 @@ Check file link ![Logo](../assets/banner.svg) -![Anchors should be ignored](#awesome) +![Fragment only link](#awesome) Normal link, which should work as expected. [Wikipedia](https://en.wikipedia.org/wiki/Static_program_analysis) diff --git a/fixtures/fragments/empty_file b/fixtures/fragments/empty_file new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/fragments/file1.md b/fixtures/fragments/file1.md new file mode 100644 index 0000000000..623e61fcee --- /dev/null +++ b/fixtures/fragments/file1.md @@ -0,0 +1,42 @@ +# Fragment Test File 1 + +This is a test file for the fragment loader. + +## Fragment 1 + +[Link to fragment 2](#fragment-2) + +## Fragment 2 + +[Link to fragment 1 in file2](file2.md#fragment-1) + +## Fragment 3 + +[Link to missing fragment](#missing-fragment) + +[Link to missing fragment in file2](file2.md#missing-fragment) + +## HTML Fragments + +Explicit fragment links are currently not supported. +Therefore we put the test into a code block for now to prevent false positives. + +``` + + +[Link to explicit fragment](#explicit-fragment) +``` + +## Custom Fragments + +[Custom fragment id in file2](file2.md#custom-id) + +# Kebab Case Fragment + +[Link to kebab-case fragment](#kebab-case-fragment) + +[Link to second kebab-case fragment](#kebab-case-fragment-1) + +# Kebab Case Fragment + +[Link to another file type](empty_file#fragment) diff --git a/fixtures/fragments/file2.md b/fixtures/fragments/file2.md new file mode 100644 index 0000000000..76ac82ecd6 --- /dev/null +++ b/fixtures/fragments/file2.md @@ -0,0 +1,7 @@ +# Fragment Test File 2 + +This is a test file for the fragment loader. 
+ +### Some other heading with custom id {#custom-id} + +#### Fragment 1 diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 1bb41a3550..e2297c8ea5 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -77,6 +77,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .accepted(accepted) .require_https(cfg.require_https) .cookie_jar(cookie_jar.cloned()) + .include_fragments(cfg.include_fragments) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index c33b3a27a9..52c2b2e9b1 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -307,6 +307,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) accept: Option>, + /// Enable the checking of fragments in links. + #[arg(long)] + #[serde(default)] + pub(crate) include_fragments: bool, + /// Website timeout in seconds from connect to response finished #[arg(short, long, default_value = &TIMEOUT_STR)] #[serde(default = "timeout")] @@ -426,6 +431,7 @@ impl Config { output: None; require_https: false; cookie_jar: None; + include_fragments: false; } if self diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 0d82872507..5612fe1a8f 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -222,8 +222,8 @@ mod cli { .env_clear() .assert() .success() - .stdout(contains("3 Total")) - .stdout(contains("3 OK")); + .stdout(contains("4 Total")) + .stdout(contains("4 OK")); } #[test] @@ -489,8 +489,8 @@ mod cli { test_json_output!( "TEST.md", MockResponseStats { - total: 11, - successful: 9, + total: 12, + successful: 10, excludes: 2, ..MockResponseStats::default() } @@ -518,7 +518,7 @@ mod cli { // Running the command from the command line will print 9 links, // because the actual `--dump` command filters out the two // http(s)://example.com links - assert_eq!(output.lines().count(), 11); + assert_eq!(output.lines().count(), 12); 
fs::remove_file(outfile)?; Ok(()) } @@ -534,7 +534,7 @@ mod cli { .arg(".*") .assert() .success() - .stdout(contains("11 Excluded")); + .stdout(contains("12 Excluded")); Ok(()) } @@ -1399,4 +1399,30 @@ mod cli { Ok(()) } + + #[test] + fn test_fragments() { + let mut cmd = main_command(); + let input = fixtures_path().join("fragments"); + + cmd.arg("--verbose") + .arg("--include-fragments") + .arg(input) + .assert() + .failure() + .stderr(contains("fixtures/fragments/file1.md#fragment-2")) + .stderr(contains("fixtures/fragments/file2.md#custom-id")) + .stderr(contains("fixtures/fragments/file1.md#missing-fragment")) + .stderr(contains("fixtures/fragments/file2.md#fragment-1")) + .stderr(contains("fixtures/fragments/file1.md#kebab-case-fragment")) + .stderr(contains("fixtures/fragments/file2.md#missing-fragment")) + .stderr(contains("fixtures/fragments/empty_file#fragment")) + .stderr(contains( + "fixtures/fragments/file1.md#kebab-case-fragment-1", + )) + .stdout(contains("8 Total")) + .stdout(contains("6 OK")) + // 2 failures because of missing fragments + .stdout(contains("2 Errors")); + } } diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 80b5e25240..3bc1b04e38 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -13,7 +13,7 @@ clippy::default_trait_access, clippy::used_underscore_binding )] -use std::{collections::HashSet, sync::Arc, time::Duration}; +use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; #[cfg(all(feature = "email-check", feature = "native-tls"))] use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; @@ -22,7 +22,7 @@ use http::{ header::{HeaderMap, HeaderValue, AUTHORIZATION}, StatusCode, }; -use log::debug; +use log::{debug, warn}; use octocrab::Octocrab; use regex::RegexSet; use reqwest::{header, redirect, Url}; @@ -36,6 +36,7 @@ use crate::{ remap::Remaps, retry::RetryExt, types::uri::github::GithubUri, + utils::fragment_checker::FragmentChecker, BasicAuthCredentials, 
ErrorKind, Request, Response, Result, Status, Uri, }; @@ -270,6 +271,9 @@ pub struct ClientBuilder { /// /// See https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#method.cookie_store cookie_jar: Option>, + + /// Enable the checking of fragments in links. + include_fragments: bool, } impl Default for ClientBuilder { @@ -383,6 +387,8 @@ impl ClientBuilder { accepted: self.accepted, require_https: self.require_https, quirks, + include_fragments: self.include_fragments, + fragment_checker: FragmentChecker::new(), }) } } @@ -429,6 +435,12 @@ pub struct Client { /// Override behaviors for certain known issues with special URIs. quirks: Quirks, + + /// Enable the checking of fragments in links. + include_fragments: bool, + + /// Caches Fragments + fragment_checker: FragmentChecker, } impl Client { @@ -472,7 +484,7 @@ impl Client { } let status = match uri.scheme() { - _ if uri.is_file() => self.check_file(uri), + _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, _ => self.check_website(uri, credentials).await?, }; @@ -659,13 +671,30 @@ impl Client { } /// Check a `file` URI. - pub fn check_file(&self, uri: &Uri) -> Status { - if let Ok(path) = uri.url.to_file_path() { - if path.exists() { - return Status::Ok(StatusCode::OK); + pub async fn check_file(&self, uri: &Uri) -> Status { + let Ok(path) = uri.url.to_file_path() else { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + }; + if !path.exists() { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + } + if self.include_fragments { + self.check_fragment(&path, uri).await + } else { + Status::Ok(StatusCode::OK) + } + } + + /// Checks a `file` URI's fragment. 
+ pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { + match self.fragment_checker.check(path, &uri.url).await { + Ok(true) => Status::Ok(StatusCode::OK), + Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), + Err(err) => { + warn!("Skipping fragment check due to the following error: {err}"); + Status::Ok(StatusCode::OK) } } - ErrorKind::InvalidFilePath(uri.clone()).into() } /// Check a mail address, or equivalently a `mailto` URI. diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index ee938cdd8b..06e53d4654 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,4 +1,7 @@ -use pulldown_cmark::{Event, Parser, Tag}; +//! Extract things from markdown documents +use std::collections::{HashMap, HashSet}; + +use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; @@ -77,6 +80,79 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec HashSet { + let mut in_heading = false; + let mut heading = String::new(); + let mut id_generator = HeadingIdGenerator::default(); + + let mut out = HashSet::new(); + + for event in Parser::new_ext(input, Options::ENABLE_HEADING_ATTRIBUTES) { + match event { + Event::Start(Tag::Heading(..)) => { + in_heading = true; + } + Event::End(Tag::Heading(_level, id, _classes)) => { + if let Some(frag) = id { + out.insert(frag.to_string()); + } + + if !heading.is_empty() { + let id = id_generator.generate(&heading); + out.insert(id); + heading.clear(); + } + + in_heading = false; + } + Event::Text(text) => { + if in_heading { + heading.push_str(&text); + }; + } + + // Silently skip over other events + _ => (), + } + } + out +} + +#[derive(Default)] +struct HeadingIdGenerator { + counter: HashMap, +} + +impl HeadingIdGenerator { + fn generate(&mut self, heading: &str) -> String { + let mut id = Self::into_kebab_case(heading); + let count = 
self.counter.entry(id.clone()).or_insert(0); + if *count != 0 { + id = format!("{}-{}", id, *count); + } + *count += 1; + + id + } + + /// Converts text into kebab case + #[must_use] + fn into_kebab_case(text: &str) -> String { + text.chars() + .filter_map(|ch| { + if ch.is_alphanumeric() || ch == '_' || ch == '-' { + Some(ch.to_ascii_lowercase()) + } else if ch.is_whitespace() { + Some('-') + } else { + None + } + }) + .collect::() + } +} + #[cfg(test)] mod tests { use super::*; @@ -148,12 +224,12 @@ or inline like `https://bar.org` for instance. #[test] #[ignore] fn test_skip_verbatim_html() { - let input = " + let input = " http://link.com
-Some pre-formatted http://pre.com 
+Some pre-formatted http://pre.com
 
"; let expected = vec![]; @@ -161,4 +237,26 @@ Some pre-formatted http://pre.com let uris = extract_markdown(input, false); assert_eq!(uris, expected); } + + #[test] + fn test_kebab_case() { + let check = |input, expected| { + let actual = HeadingIdGenerator::into_kebab_case(input); + assert_eq!(actual, expected); + }; + check("A Heading", "a-heading"); + check( + "This header has a :thumbsup: in it", + "this-header-has-a-thumbsup-in-it", + ); + check( + "Header with 한글 characters (using unicode)", + "header-with-한글-characters-using-unicode", + ); + check( + "Underscores foo_bar_, dots . and numbers 1.7e-3", + "underscores-foo_bar_-dots--and-numbers-17e-3", + ); + check("Many spaces", "many----------spaces"); + } } diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 4b30ef4112..3bf92c8588 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -1,7 +1,7 @@ use crate::types::{uri::raw::RawUri, FileType, InputContent}; mod html; -mod markdown; +pub mod markdown; mod plaintext; use markdown::extract_markdown; diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 161fac24fd..48575574a7 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -63,6 +63,10 @@ pub enum ErrorKind { #[error("Cannot find file")] InvalidFilePath(Uri), + /// The fragment of the given URI cannot be found in the linked file + #[error("Cannot find fragment")] + InvalidFragment(Uri), + /// The given path cannot be converted to a URI #[error("Invalid path to URL conversion: {0}")] InvalidUrlFromPath(PathBuf), @@ -87,7 +91,7 @@ pub enum ErrorKind { /// The given path does not resolve to a valid file #[error("Cannot find local file {0}")] - FileNotFound(PathBuf), + InvalidFile(PathBuf), /// Error while traversing an input directory #[error("Cannot traverse input directory: {0}")] @@ -132,6 +136,7 @@ pub enum ErrorKind { /// Basic auth extractor error #[error("Basic auth extractor error")] 
BasicAuthExtractorError(#[from] BasicAuthExtractorError), + /// Cannot load cookies #[error("Cannot load cookies")] Cookies(String), @@ -225,6 +230,14 @@ impl PartialEq for ErrorKind { (Self::Regex(e1), Self::Regex(e2)) => e1.to_string() == e2.to_string(), (Self::DirTraversal(e1), Self::DirTraversal(e2)) => e1.to_string() == e2.to_string(), (Self::Channel(_), Self::Channel(_)) => true, + (Self::TooManyRedirects(e1), Self::TooManyRedirects(e2)) => { + e1.to_string() == e2.to_string() + } + (Self::BasicAuthExtractorError(e1), Self::BasicAuthExtractorError(e2)) => { + e1.to_string() == e2.to_string() + } + (Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2, + (Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2, _ => false, } } @@ -249,13 +262,14 @@ impl Hash for ErrorKind { Self::GithubRequest(e) => e.to_string().hash(state), Self::InvalidGithubUrl(s) => s.hash(state), Self::DirTraversal(e) => e.to_string().hash(state), - Self::FileNotFound(e) => e.to_string_lossy().hash(state), + Self::InvalidFile(e) => e.to_string_lossy().hash(state), Self::EmptyUrl => "Empty URL".hash(state), Self::ParseUrl(e, s) => (e.to_string(), s).hash(state), Self::InvalidURI(u) => u.hash(state), Self::InvalidUrlFromPath(p) => p.hash(state), Self::Utf8(e) => e.to_string().hash(state), Self::InvalidFilePath(u) => u.hash(state), + Self::InvalidFragment(u) => u.hash(state), Self::UnreachableEmailAddress(u, ..) => u.hash(state), Self::InsecureURL(u, ..) 
=> u.hash(state), Self::InvalidBase(base, e) => (base, e).hash(state), diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 3549f2c1c4..2db32e48b4 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -153,7 +153,7 @@ impl Input { // and exit early if it does // This check might not be sufficient to cover all cases // but it catches the most common ones - return Err(ErrorKind::FileNotFound(path)); + return Err(ErrorKind::InvalidFile(path)); } else { // Invalid path; check if a valid URL can be constructed from the input // by prefixing it with a `http://` scheme. @@ -442,10 +442,7 @@ mod tests { let input = Input::new(test_file, None, false, None); assert!(input.is_err()); - assert!(matches!( - input, - Err(ErrorKind::FileNotFound(PathBuf { .. })) - )); + assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. })))); } #[test] diff --git a/lychee-lib/src/utils/fragment_checker.rs b/lychee-lib/src/utils/fragment_checker.rs new file mode 100644 index 0000000000..b7c0a250ac --- /dev/null +++ b/lychee-lib/src/utils/fragment_checker.rs @@ -0,0 +1,75 @@ +use std::{ + collections::{hash_map::Entry, HashMap, HashSet}, + path::Path, + sync::Arc, +}; + +use crate::{extract::markdown::extract_markdown_fragments, types::FileType, Result}; +use tokio::{fs, sync::Mutex}; +use url::Url; + +/// Holds a cache of fragments for a given URL. +/// +/// Fragments, also known as anchors, are used to link to a specific +/// part of a page. For example, the URL `https://example.com#foo` +/// will link to the element with the `id` of `foo`. +/// +/// This cache is used to avoid having to re-parse the same file +/// multiple times when checking if a given URL contains a fragment. +/// +/// The cache is stored in a `HashMap` with the URL as the key and +/// a `HashSet` of fragments as the value. 
+#[derive(Default, Clone, Debug)] +pub(crate) struct FragmentChecker { + cache: Arc>>>, +} + +impl FragmentChecker { + /// Creates a new `FragmentChecker`. + pub(crate) fn new() -> Self { + Self { + cache: Arc::default(), + } + } + + /// Checks the given path contains the given fragment. + /// + /// Returns false, if there is a fragment in the link and the path is to a markdown file which + /// doesn't contain the given fragment. + /// + /// In all other cases, returns true. + pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result { + match (FileType::from(path), url.fragment()) { + (FileType::Markdown, Some(fragment)) => { + let url_without_frag = Self::remove_fragment(url.clone()); + self.populate_cache_if_vacant(url_without_frag, path, fragment) + .await + } + _ => Ok(true), + } + } + + fn remove_fragment(mut url: Url) -> String { + url.set_fragment(None); + url.into() + } + + /// Populates the fragment cache with the given URL if it + /// is not already in the cache. + async fn populate_cache_if_vacant( + &self, + url_without_frag: String, + path: &Path, + fragment: &str, + ) -> Result { + let mut fragment_cache = self.cache.lock().await; + match fragment_cache.entry(url_without_frag.clone()) { + Entry::Vacant(entry) => { + let content = fs::read_to_string(path).await?; + let file_frags = extract_markdown_fragments(&content); + Ok(entry.insert(file_frags).contains(fragment)) + } + Entry::Occupied(entry) => Ok(entry.get().contains(fragment)), + } + } +} diff --git a/lychee-lib/src/utils/mod.rs b/lychee-lib/src/utils/mod.rs index fe6aec3356..d75d20c064 100644 --- a/lychee-lib/src/utils/mod.rs +++ b/lychee-lib/src/utils/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod fragment_checker; pub(crate) mod path; pub(crate) mod request; pub(crate) mod reqwest; diff --git a/lychee-lib/src/utils/path.rs b/lychee-lib/src/utils/path.rs index 7966ee820f..b03b0164e2 100644 --- a/lychee-lib/src/utils/path.rs +++ b/lychee-lib/src/utils/path.rs @@ -45,7 +45,7 @@ pub(crate) fn 
resolve(src: &Path, dst: &Path, base: &Option) -> Result { // Find `dst` in the parent directory of `src` let Some(parent) = src.parent() else { - return Err(ErrorKind::FileNotFound(relative.to_path_buf())) + return Err(ErrorKind::InvalidFile(relative.to_path_buf())) }; parent.join(relative) } @@ -62,7 +62,7 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option) -> Result return Err(ErrorKind::FileNotFound(dst.to_path_buf())), + _ => return Err(ErrorKind::InvalidFile(dst.to_path_buf())), }; Ok(Some(absolute_path(resolved))) } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 8f8f5b6084..f092b72b1c 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -70,10 +70,19 @@ pub(crate) fn create( credentials, ))) } else if let InputSource::FsPath(root) = &input_content.source { - if is_anchor { - // Silently ignore anchor links for now - Ok(None) - } else if let Some(url) = create_uri_from_path(root, &text, base)? { + let path = if is_anchor { + match root.file_name() { + Some(file_name) => match file_name.to_str() { + Some(valid_str) => valid_str.to_string() + &text, + None => return Err(ErrorKind::InvalidFile(root.clone())), + }, + None => return Err(ErrorKind::InvalidFile(root.clone())), + } + } else { + text + }; + + if let Some(url) = create_uri_from_path(root, &path, base)? { let uri = Uri { url }; let credentials = credentials(extractor, &uri); @@ -122,7 +131,7 @@ fn construct_url(base: &Option, text: &str) -> Option> { } fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { - let dst = url::remove_get_params_and_fragment(dst); + let (dst, frag) = url::remove_get_params_and_seperate_fragment(dst); // Avoid double-encoding already encoded destination paths by removing any // potential encoding (e.g. `web%20site` becomes `web site`). // That's because Url::from_file_path will encode the full URL in the end. 
@@ -136,6 +145,10 @@ fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result Url::from_file_path(&path) + .map(|mut url| { + url.set_fragment(frag); + url + }) .map(Some) .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)), None => Ok(None), diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 4e76cc3801..4eb40f76bb 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -4,18 +4,18 @@ use once_cell::sync::Lazy; static LINK_FINDER: Lazy = Lazy::new(LinkFinder::new); -/// Remove all GET parameters from a URL. +/// Remove all GET parameters from a URL and separate out the fragment. /// The link is not a URL but a String as it may not have a base domain. -pub(crate) fn remove_get_params_and_fragment(url: &str) -> &str { - let path = match url.split_once('#') { - Some((path_without_fragment, _fragment)) => path_without_fragment, - None => url, +pub(crate) fn remove_get_params_and_seperate_fragment(url: &str) -> (&str, Option<&str>) { + let (path, frag) = match url.split_once('#') { + Some((path, fragment)) => (path, Some(fragment)), + None => (url, None), }; let path = match path.split_once('?') { Some((path_without_params, _params)) => path_without_params, None => path, }; - path + (path, frag) } // Use `LinkFinder` to offload the raw link searching in plaintext @@ -29,47 +29,49 @@ mod test_fs_tree { #[test] fn test_remove_get_params_and_fragment() { - assert_eq!(remove_get_params_and_fragment("/"), "/"); + assert_eq!(remove_get_params_and_seperate_fragment("/"), ("/", None)); assert_eq!( - remove_get_params_and_fragment("index.html?foo=bar"), - "index.html" + remove_get_params_and_seperate_fragment("index.html?foo=bar"), + ("index.html", None) ); assert_eq!( - remove_get_params_and_fragment("/index.html?foo=bar"), - "/index.html" + remove_get_params_and_seperate_fragment("/index.html?foo=bar"), + ("/index.html", None) ); assert_eq!( - remove_get_params_and_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), 
- "/index.html" + remove_get_params_and_seperate_fragment("/index.html?foo=bar&baz=zorx?bla=blub"), + ("/index.html", None) ); assert_eq!( - remove_get_params_and_fragment("https://example.com/index.html?foo=bar"), - "https://example.com/index.html" + remove_get_params_and_seperate_fragment("https://example.com/index.html?foo=bar"), + ("https://example.com/index.html", None) ); assert_eq!( - remove_get_params_and_fragment("test.png?foo=bar"), - "test.png" + remove_get_params_and_seperate_fragment("test.png?foo=bar"), + ("test.png", None) ); assert_eq!( - remove_get_params_and_fragment("https://example.com/index.html#anchor"), - "https://example.com/index.html" + remove_get_params_and_seperate_fragment("https://example.com/index.html#anchor"), + ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_fragment("https://example.com/index.html?foo=bar#anchor"), - "https://example.com/index.html" + remove_get_params_and_seperate_fragment( + "https://example.com/index.html?foo=bar#anchor" + ), + ("https://example.com/index.html", Some("anchor")) ); assert_eq!( - remove_get_params_and_fragment("test.png?foo=bar#anchor"), - "test.png" + remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor"), + ("test.png", Some("anchor")) ); assert_eq!( - remove_get_params_and_fragment("test.png#anchor?anchor!?"), - "test.png" + remove_get_params_and_seperate_fragment("test.png#anchor?anchor!?"), + ("test.png", Some("anchor?anchor!?")) ); assert_eq!( - remove_get_params_and_fragment("test.png?foo=bar#anchor?anchor!"), - "test.png" + remove_get_params_and_seperate_fragment("test.png?foo=bar#anchor?anchor!"), + ("test.png", Some("anchor?anchor!")) ); } }