From bdccb9008e7aed54bfb8ef59b3eb1ffb1410628b Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Nov 2024 18:49:20 +0100 Subject: [PATCH 1/2] Add support for custom file extensions in link checking. This adds support for overwriting extensions: ``` lychee . --extensions md,html,txt,json,yaml ``` The above would only check these extensions. This was enabled by moving to `ignore` (#1500 by @thomas-zahner). Fixes #410 --- README.md | 7 ++++ lychee-bin/src/main.rs | 2 +- lychee-bin/src/options.rs | 25 +++++++++++- lychee-lib/src/collector.rs | 34 +++++++++++++---- lychee-lib/src/types/file.rs | 71 ++++++++++++++++++++++++----------- lychee-lib/src/types/input.rs | 41 +++++++++----------- 6 files changed, 124 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 779e9f7d53..3a46a06f9a 100644 --- a/README.md +++ b/README.md @@ -327,6 +327,13 @@ Options: Do not show progress bar. This is recommended for non-interactive shells (e.g. for continuous integration) + --extensions + Test the specified file extensions for URIs when checking files locally. + + Multiple extensions can be separated by commas. Note that if you want to check filetypes, + which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to + specify both extensions explicitly. + --cache Use request cache stored on disk at `.lycheecache` diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index cd96c27f75..c548da4b06 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -314,7 +314,7 @@ async fn run(opts: &LycheeOptions) -> Result { collector }; - let requests = collector.collect_links(inputs); + let requests = collector.collect_links_with_ext(inputs, opts.config.extensions.clone()); let cache = load_cache(&opts.config).unwrap_or_default(); let cache = Arc::new(cache); diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 60c259d2ea..ff424005fd 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -6,8 +6,9 @@ use clap::builder::PossibleValuesParser; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, - DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, + Base, BasicAuthSelector, FileType, Input, StatusCodeExcluder, StatusCodeSelector, + DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, + DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -218,6 +219,25 @@ pub(crate) struct Config { #[serde(default)] pub(crate) no_progress: bool, + /// A list of custom extensions for link checking + /// + /// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi` + /// to check for links in files with these extensions. + /// + /// This is useful when the default extensions are not enough and you don't + /// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.) + #[arg( + long, + value_delimiter = ',', + long_help = "Test the specified file extensions for URIs when checking files locally. + +Multiple extensions can be separated by commas. Note that if you want to check filetypes, +which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to +specify both extensions explicitly." + )] + #[serde(default = "FileType::default_extensions")] + pub(crate) extensions: Vec, + #[arg(help = HELP_MSG_CACHE)] #[arg(long)] #[serde(default)] @@ -563,6 +583,7 @@ impl Config { cookie_jar: None; include_fragments: false; accept: StatusCodeSelector::default(); + extensions: FileType::default_extensions(); } if self diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 955bdd24e7..9d6ba951e3 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -92,6 +92,12 @@ impl Collector { .flatten() } + /// Convenience method to fetch all unique links from inputs + /// with the default extensions. + pub fn collect_links(self, inputs: Vec) -> impl Stream> { + self.collect_links_with_ext(inputs, crate::types::FileType::default_extensions()) + } + /// Fetch all unique links from inputs /// All relative URLs get prefixed with `base` (if given). /// (This can be a directory or a base URL) @@ -99,7 +105,11 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub fn collect_links(self, inputs: Vec) -> impl Stream> { + pub fn collect_links_with_ext( + self, + inputs: Vec, + extensions: Vec, + ) -> impl Stream> { let skip_missing_inputs = self.skip_missing_inputs; let skip_hidden = self.skip_hidden; let skip_ignored = self.skip_ignored; @@ -107,13 +117,14 @@ impl Collector { stream::iter(inputs) .par_then_unordered(None, move |input| { let default_base = global_base.clone(); + let extensions = extensions.clone(); async move { let base = match &input.source { InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), _ => default_base, }; input - .get_contents(skip_missing_inputs, skip_hidden, skip_ignored) + .get_contents(skip_missing_inputs, skip_hidden, skip_ignored, extensions) .map(move |content| (content, base.clone())) } }) @@ -153,11 +164,18 @@ mod tests { responses.map(|r| r.unwrap().uri).collect().await } - // Helper function for collecting verbatim links - async fn collect_verbatim(inputs: Vec, base: Option) -> HashSet { + /// Helper function for collecting verbatim links + /// + /// A verbatim link is a link that is not parsed by the HTML parser. + /// For example, a link in a code block or a script tag. + async fn collect_verbatim( + inputs: Vec, + base: Option, + extensions: Vec, + ) -> HashSet { let responses = Collector::new(base) .include_verbatim(true) - .collect_links(inputs); + .collect_links_with_ext(inputs, extensions); responses.map(|r| r.unwrap().uri).collect().await } @@ -175,7 +193,7 @@ mod tests { let _file = File::create(&file_path).unwrap(); let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?; let contents: Vec<_> = input - .get_contents(true, true, true) + .get_contents(true, true, true, FileType::default_extensions()) .collect::>() .await; @@ -188,7 +206,7 @@ mod tests { async fn test_url_without_extension_is_html() -> Result<()> { let input = Input::new("https://example.com/", None, true, None)?; let contents: Vec<_> = input - .get_contents(true, true, true) + .get_contents(true, true, true, FileType::default_extensions()) .collect::>() .await; @@ -246,7 +264,7 @@ mod tests { }, ]; - let links = collect_verbatim(inputs, None).await; + let links = collect_verbatim(inputs, None, FileType::default_extensions()).await; let expected_links = HashSet::from_iter([ website(TEST_STRING), diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 7cdeff6c80..68d773fce1 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -12,28 +12,46 @@ pub enum FileType { Plaintext, } +impl FileType { + /// All known Markdown extensions + const MARKDOWN_EXTENSIONS: &'static [&'static str] = &[ + "markdown", "mkdown", "mkdn", "mdwn", "mdown", "mdx", "mkd", "md", + ]; + + /// All known HTML extensions + const HTML_EXTENSIONS: &'static [&'static str] = &["htm", "html"]; + + /// Default extensions which are supported by lychee + #[must_use] + pub fn default_extensions() -> Vec { + let mut extensions = Vec::new(); + extensions.extend(Self::MARKDOWN_EXTENSIONS.iter().map(|&s| s.to_string())); + extensions.extend(Self::HTML_EXTENSIONS.iter().map(|&s| s.to_string())); + extensions + } + + /// Get the [`FileType`] from an extension string + fn from_extension(ext: &str) -> Option { + let ext = ext.to_lowercase(); + if Self::MARKDOWN_EXTENSIONS.contains(&ext.as_str()) { + Some(Self::Markdown) + } else if Self::HTML_EXTENSIONS.contains(&ext.as_str()) { + Some(Self::Html) + } else { + None + } + } +} + impl Default for FileType { fn default() -> Self { + // This is the default file type when no other type can be determined. + // It represents a generic text file with no specific syntax. Self::Plaintext } } impl> From

for FileType { - /// Detect if the given path points to a Markdown, HTML, or plaintext file. - // - // Assume HTML in case of no extension. - // - // This is only reasonable for URLs, not paths on disk. For example, - // a file named `README` without an extension is more likely to be a - // plaintext file. - // - // A better solution would be to also implement `From for - // FileType`. Unfortunately that's not possible without refactoring, as - // `AsRef` could be implemented for `Url` in the future, which is - // why `From for FileType` is not allowed (orphan rule). - // - // As a workaround, we check if the scheme is `http` or `https` and - // assume HTML in that case. fn from(p: P) -> FileType { let path = p.as_ref(); match path @@ -41,12 +59,9 @@ impl> From

for FileType { .and_then(std::ffi::OsStr::to_str) .map(str::to_lowercase) .as_deref() + .and_then(FileType::from_extension) { - // https://superuser.com/a/285878 - Some("markdown" | "mkdown" | "mkdn" | "mdwn" | "mdown" | "mdx" | "mkd" | "md") => { - FileType::Markdown - } - Some("htm" | "html") => FileType::Html, + Some(file_type) => file_type, None if is_url(path) => FileType::Html, _ => FileType::default(), } @@ -54,7 +69,6 @@ impl> From

for FileType { } /// Helper function to check if a path is likely a URL. - fn is_url(path: &Path) -> bool { path.to_str() .and_then(|s| Url::parse(s).ok()) @@ -89,6 +103,21 @@ mod tests { ); } + #[test] + fn test_default_extensions() { + let extensions = FileType::default_extensions(); + // Test some known extensions + assert!(extensions.contains(&"md".to_string())); + assert!(extensions.contains(&"html".to_string())); + assert!(extensions.contains(&"markdown".to_string())); + assert!(extensions.contains(&"htm".to_string())); + // Test the count matches our static arrays + assert_eq!( + extensions.len(), + FileType::MARKDOWN_EXTENSIONS.len() + FileType::HTML_EXTENSIONS.len() + ); + } + #[test] fn test_is_url() { // Valid URLs diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index c32be7feb8..47aa3a1758 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,6 +3,7 @@ use crate::{utils, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; +use ignore::types::TypesBuilder; use ignore::WalkBuilder; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -14,12 +15,6 @@ use tokio::io::{stdin, AsyncReadExt}; const STDIN: &str = "-"; -// Check the extension of the given path against the list of known/accepted -// file extensions -fn valid_extension(p: &Path) -> bool { - matches!(FileType::from(p), FileType::Markdown | FileType::Html) -} - #[derive(Debug)] /// Encapsulates the content for a given input pub struct InputContent { @@ -209,6 +204,7 @@ impl Input { skip_missing: bool, skip_hidden: bool, skip_gitignored: bool, + extensions: Vec, ) -> impl Stream> { try_stream! { match self.source { @@ -231,24 +227,32 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in WalkBuilder::new(path).standard_filters(skip_gitignored).hidden(skip_hidden).build() { - let entry = entry?; + let mut types_builder = TypesBuilder::new(); + for ext in extensions { + types_builder.add(&ext, &format!("*.{ext}"))?; + } + + for entry in WalkBuilder::new(path) + .standard_filters(skip_gitignored) + .types(types_builder.select("all").build()?) + .hidden(skip_hidden) + .build() + { + let entry = entry?; if self.is_excluded_path(&entry.path().to_path_buf()) { continue; } - match entry.file_type() { None => continue, Some(file_type) => { - if !file_type.is_file() || !valid_extension(entry.path()) { + if !file_type.is_file() { continue; } } - }; - + } let content = Self::path_content(entry.path()).await?; - yield content + yield content; } } else { if self.is_excluded_path(path) { @@ -459,17 +463,6 @@ mod tests { assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. })))); } - #[test] - fn test_valid_extension() { - assert!(valid_extension(Path::new("file.md"))); - assert!(valid_extension(Path::new("file.markdown"))); - assert!(valid_extension(Path::new("file.html"))); - assert!(valid_extension(Path::new("file.htm"))); - assert!(valid_extension(Path::new("file.HTM"))); - assert!(!valid_extension(Path::new("file.txt"))); - assert!(!valid_extension(Path::new("file"))); - } - #[test] fn test_no_exclusions() { let dir = tempfile::tempdir().unwrap(); From fc0ffde6a156058a6e0f4bea4526fcde5164d319 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sat, 9 Nov 2024 00:47:24 +0100 Subject: [PATCH 2/2] Update lychee-bin/src/options.rs Co-authored-by: Thomas Zahner --- lychee-bin/src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ff424005fd..d29877ae25 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -219,7 +219,7 @@ pub(crate) struct Config { #[serde(default)] pub(crate) no_progress: bool, - /// A list of custom extensions for link checking + /// A list of file extensions. Files not matching the specified extensions are skipped. /// /// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi` /// to check for links in files with these extensions.