From f284af40a75517ec82518dc6bd9d643ffe448b42 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 13:45:36 +0200 Subject: [PATCH 1/8] Add support for --dump-inputs --- README.md | 3 +++ lychee-bin/src/commands/dump.rs | 28 ++++++++++++++++++++++++++-- lychee-bin/src/commands/mod.rs | 1 + lychee-bin/src/main.rs | 7 +++++++ lychee-bin/src/options.rs | 6 ++++++ lychee-lib/src/collector.rs | 7 +++++++ lychee-lib/src/types/input.rs | 24 ++++++++++++++++++++++++ 7 files changed, 74 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3e274162ad..5f0437847a 100644 --- a/README.md +++ b/README.md @@ -285,6 +285,9 @@ Options: --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked + --dump-inputs + Don't perform any link collection and checking. Instead, dump all input sources from which links would be collected + --archive Specify the use of a specific web archive. Can be used in combination with `--suggest` diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index d16f7250ca..ba6ba5012f 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -37,8 +37,8 @@ where let requests = params.requests; tokio::pin!(requests); - if let Some(outfile) = &params.cfg.output { - fs::File::create(outfile)?; + if let Some(out_file) = &params.cfg.output { + fs::File::create(out_file)?; } let mut writer = create_writer(params.cfg.output)?; @@ -70,6 +70,30 @@ where Ok(ExitCode::Success) } +/// Dump all input sources to stdout without detecting any links and checking +/// them. 
+pub(crate) async fn dump_inputs<S>(sources: S, output: Option<&PathBuf>) -> Result<ExitCode> +where + S: futures::Stream<Item = Result<String>>, +{ + let sources = sources; + tokio::pin!(sources); + + if let Some(out_file) = output { + fs::File::create(out_file)?; + } + + let mut writer = create_writer(output.cloned())?; + + while let Some(source) = sources.next().await { + let source = source?; + + writeln!(writer, "{source}")?; + } + + Ok(ExitCode::Success) +} + /// Dump request to stdout fn write( writer: &mut Box<dyn Write>, diff --git a/lychee-bin/src/commands/mod.rs b/lychee-bin/src/commands/mod.rs index f5648bfc09..248a9a97ec 100644 --- a/lychee-bin/src/commands/mod.rs +++ b/lychee-bin/src/commands/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod dump; pub(crate) use check::check; pub(crate) use dump::dump; +pub(crate) use dump::dump_inputs; use std::sync::Arc; diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 04961cf4b7..353a9591f4 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -293,6 +293,13 @@ async fn run(opts: &LycheeOptions) -> Result<i32> { // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1")); + if opts.config.dump_inputs { + let sources = collector.collect_sources(inputs).await; + let exit_code = commands::dump_inputs(sources, opts.config.output.as_ref()).await?; + + return Ok(exit_code as i32); + } + collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) } else { diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 7b212d55c9..121cd254fa 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -181,6 +181,12 @@ pub(crate) struct Config { #[serde(default)] pub(crate) dump: bool, + /// Don't perform any link collection and checking. 
+ /// Instead, dump all input sources from which links would be collected + #[arg(long)] + #[serde(default)] + pub(crate) dump_inputs: bool, + /// Specify the use of a specific web archive. /// Can be used in combination with `--suggest` #[arg(long, value_parser = clap::builder::PossibleValuesParser::new(Archive::VARIANTS).map(|s| s.parse::<Archive>().unwrap()))] diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index de4e42188b..532e9fb239 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -63,6 +63,13 @@ impl Collector { self } + /// TODO + pub async fn collect_sources(self, inputs: Vec<Input>) -> impl Stream<Item = Result<String>> { + stream::iter(inputs) + .par_then_unordered(None, move |input| async move { input.get_sources().await }) + .flatten() + } + /// Fetch all unique links from inputs /// All relative URLs get prefixed with `base` (if given). /// (This can be a directory or a base URL) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 25052ee413..f0819bb5d6 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -260,6 +260,30 @@ impl Input { } } + /// Retrieve all sources from this input. + pub async fn get_sources(self) -> impl Stream<Item = Result<String>> { + try_stream! { + match self.source { + InputSource::RemoteUrl(url) => yield url.to_string(), + InputSource::FsGlob { pattern, ignore_case } => { + let glob_expanded = tilde(&pattern).to_string(); + let mut match_opts = glob::MatchOptions::new(); + + match_opts.case_sensitive = !ignore_case; + + for entry in glob_with(&glob_expanded, match_opts)? 
{ + match entry { + Ok(path) => yield path.to_string_lossy().to_string(), + Err(e) => eprintln!("{e:?}") + } + } + }, + InputSource::FsPath(path) => yield path.to_string_lossy().to_string(), + _ => return, + } + } + } + async fn url_contents(url: &Url) -> Result { // Assume HTML for default paths let file_type = if url.path().is_empty() || url.path() == "/" { From 37e21e3070de18b47cca4d22a18299f1e73f28db Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 13:50:56 +0200 Subject: [PATCH 2/8] Fix clippy errors --- lychee-lib/src/types/input.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index f0819bb5d6..33a5db0f98 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -279,7 +279,7 @@ impl Input { } }, InputSource::FsPath(path) => yield path.to_string_lossy().to_string(), - _ => return, + _ => (), } } } From 32824636a9916613fd4fe7a8bf326f37ec2e6134 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 15:23:05 +0200 Subject: [PATCH 3/8] Add missing documentation --- lychee-lib/src/collector.rs | 3 ++- lychee-lib/src/types/input.rs | 12 +++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 532e9fb239..9d94eb1561 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -63,7 +63,8 @@ impl Collector { self } - /// TODO + /// Collect all sources from a list of [`Input`]s. For further details, + /// see also [`Input::get_sources`](crate::Input#method.get_sources). 
 pub async fn collect_sources(self, inputs: Vec<Input>) -> impl Stream<Item = Result<String>> { stream::iter(inputs) .par_then_unordered(None, move |input| async move { input.get_sources().await }) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 33a5db0f98..3053705cd5 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -260,7 +260,17 @@ impl Input { } } - /// Retrieve all sources from this input. + /// Retrieve all sources from this input. The output depends on the type of + /// input: + /// + /// - Remote URLs are returned as is, in their full form + /// - Filepath Glob Patterns are expanded and each matched entry is returned + /// - Absolute or relative filepaths are returned as is + /// - All other input types are not returned + /// + /// # Errors + /// + /// Returns an error if the globbing fails with the expanded pattern. pub async fn get_sources(self) -> impl Stream<Item = Result<String>> { try_stream! { match self.source { From 60f3cb02702c77f6f6cecfa4db2b17ee62181380 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 21:12:59 +0200 Subject: [PATCH 4/8] Apply suggestions Co-authored-by: Matthias Endler --- README.md | 2 +- lychee-bin/src/commands/dump.rs | 2 +- lychee-bin/src/options.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5f0437847a..c28bd9b772 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ Options: Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked --dump-inputs - Don't perform any link collection and checking. Instead, dump all input sources from which links would be collected + Don't perform any link collection and checking. Instead, dump all input sources from which links would be extracted --archive Specify the use of a specific web archive. 
Can be used in combination with `--suggest` diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index ba6ba5012f..580cdb9988 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -70,7 +70,7 @@ where Ok(ExitCode::Success) } -/// Dump all input sources to stdout without detecting any links and checking +/// Dump all input sources to stdout without extracting any links and checking /// them. pub(crate) async fn dump_inputs<S>(sources: S, output: Option<&PathBuf>) -> Result<ExitCode> where diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 121cd254fa..e2abb9e3e1 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -181,7 +181,7 @@ pub(crate) struct Config { #[serde(default)] pub(crate) dump: bool, - /// Don't perform any link collection and checking. + /// Don't perform any link extraction and checking. /// Instead, dump all input sources from which links would be collected #[arg(long)] #[serde(default)] From fce4a2eb40f173b54ca2c6a53cf13820fb8853f2 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 21:21:32 +0200 Subject: [PATCH 5/8] Fix usage guide in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c28bd9b772..5f5a1575fb 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ Options: Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked --dump-inputs - Don't perform any link collection and checking. Instead, dump all input sources from which links would be extracted + Don't perform any link extraction and checking. Instead, dump all input sources from which links would be collected --archive Specify the use of a specific web archive. 
Can be used in combination with `--suggest` From 07fe746bf168a02bd4706cd288d0a3ec493dc245 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 21:28:35 +0200 Subject: [PATCH 6/8] Add Stdin and Raw String output --- lychee-lib/src/types/input.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 3053705cd5..3549f2c1c4 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -289,7 +289,8 @@ impl Input { } }, InputSource::FsPath(path) => yield path.to_string_lossy().to_string(), - _ => (), + InputSource::Stdin => yield "Stdin".into(), + InputSource::String(_) => yield "Raw String".into(), } } } @@ -316,10 +317,10 @@ impl Input { async fn glob_contents( &self, - path_glob: &str, + pattern: &str, ignore_case: bool, ) -> impl Stream> + '_ { - let glob_expanded = tilde(&path_glob).to_string(); + let glob_expanded = tilde(&pattern).to_string(); let mut match_opts = glob::MatchOptions::new(); match_opts.case_sensitive = !ignore_case; From ac3fa4127c69b0ec573858aab977dca73c9fcc3b Mon Sep 17 00:00:00 2001 From: Techassi Date: Sat, 15 Jul 2023 22:38:36 +0200 Subject: [PATCH 7/8] Add integration tests --- lychee-bin/tests/cli.rs | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a039eb4af2..0a1be26d34 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1347,4 +1347,67 @@ mod cli { Ok(()) } + + #[test] + fn test_dump_inputs_glob() -> Result<()> { + let pattern = fixtures_path().join("**/*.md"); + + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg(pattern) + .assert() + .success() + .stdout(contains("fixtures/INTERNET_ARCHIVE.md")) + .stdout(contains("fixtures/TEST.md")) + .stdout(contains("fixtures/TEST_ALL_PRIVATE.md")) + .stdout(contains("fixtures/TEST_CODE_BLOCKS.md")) + .stdout(contains("fixtures/TEST_EMAIL.md")) + 
.stdout(contains("fixtures/TEST_EMAIL_QUERY_PARAMS.md")) + .stdout(contains("fixtures/TEST_EXAMPLE_DOMAINS.md")) + .stdout(contains("fixtures/TEST_GITHUB.md")) + .stdout(contains("fixtures/TEST_GITHUB_404.md")) + .stdout(contains("fixtures/TEST_SCHEMES.md")) + .stdout(contains("fixtures/exclude-path/dir1/TEST.md")) + .stdout(contains("fixtures/exclude-path/dir2/TEST.md")) + .stdout(contains("fixtures/exclude-path/dir2/subdir/TEST.md")) + .stdout(contains("fixtures/ignore/TEST.md")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_url() -> Result<()> { + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg("https://example.com") + .assert() + .success() + .stdout(contains("https://example.com")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_path() -> Result<()> { + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg("fixtures") + .assert() + .success() + .stdout(contains("fixtures")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_stdin() -> Result<()> { + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg("-") + .assert() + .success() + .stdout(contains("Stdin")); + + Ok(()) + } } From 9f68bf9de692db8f26533e26c3abc172f4e2eea5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 16 Jul 2023 17:56:54 +0200 Subject: [PATCH 8/8] Use subfolder for dump-inputs test --- fixtures/dump_inputs/markdown.md | 0 fixtures/dump_inputs/some_file.txt | 0 fixtures/dump_inputs/subfolder/example.bin | 0 fixtures/dump_inputs/subfolder/file2.md | 0 fixtures/dump_inputs/subfolder/test.html | 0 lychee-bin/tests/cli.rs | 37 +++++++++++++--------- 6 files changed, 22 insertions(+), 15 deletions(-) create mode 100644 fixtures/dump_inputs/markdown.md create mode 100644 fixtures/dump_inputs/some_file.txt create mode 100644 fixtures/dump_inputs/subfolder/example.bin create mode 100644 fixtures/dump_inputs/subfolder/file2.md create mode 100644 fixtures/dump_inputs/subfolder/test.html diff --git a/fixtures/dump_inputs/markdown.md 
b/fixtures/dump_inputs/markdown.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/dump_inputs/some_file.txt b/fixtures/dump_inputs/some_file.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/dump_inputs/subfolder/example.bin b/fixtures/dump_inputs/subfolder/example.bin new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/dump_inputs/subfolder/file2.md b/fixtures/dump_inputs/subfolder/file2.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/dump_inputs/subfolder/test.html b/fixtures/dump_inputs/subfolder/test.html new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 0a1be26d34..1702401c8d 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1349,7 +1349,7 @@ mod cli { } #[test] - fn test_dump_inputs_glob() -> Result<()> { + fn test_dump_inputs_glob_md() -> Result<()> { let pattern = fixtures_path().join("**/*.md"); let mut cmd = main_command(); @@ -1357,20 +1357,27 @@ mod cli { .arg(pattern) .assert() .success() - .stdout(contains("fixtures/INTERNET_ARCHIVE.md")) - .stdout(contains("fixtures/TEST.md")) - .stdout(contains("fixtures/TEST_ALL_PRIVATE.md")) - .stdout(contains("fixtures/TEST_CODE_BLOCKS.md")) - .stdout(contains("fixtures/TEST_EMAIL.md")) - .stdout(contains("fixtures/TEST_EMAIL_QUERY_PARAMS.md")) - .stdout(contains("fixtures/TEST_EXAMPLE_DOMAINS.md")) - .stdout(contains("fixtures/TEST_GITHUB.md")) - .stdout(contains("fixtures/TEST_GITHUB_404.md")) - .stdout(contains("fixtures/TEST_SCHEMES.md")) - .stdout(contains("fixtures/exclude-path/dir1/TEST.md")) - .stdout(contains("fixtures/exclude-path/dir2/TEST.md")) - .stdout(contains("fixtures/exclude-path/dir2/subdir/TEST.md")) - .stdout(contains("fixtures/ignore/TEST.md")); + .stdout(contains("fixtures/dump_inputs/subfolder/file2.md")) + .stdout(contains("fixtures/dump_inputs/markdown.md")); + + Ok(()) + } + + #[test] + fn 
test_dump_inputs_glob_all() -> Result<()> { + let pattern = fixtures_path().join("**/*"); + + let mut cmd = main_command(); + cmd.arg("--dump-inputs") + .arg(pattern) + .assert() + .success() + .stdout(contains("fixtures/dump_inputs/subfolder/test.html")) + .stdout(contains("fixtures/dump_inputs/subfolder/file2.md")) + .stdout(contains("fixtures/dump_inputs/subfolder")) + .stdout(contains("fixtures/dump_inputs/markdown.md")) + .stdout(contains("fixtures/dump_inputs/subfolder/example.bin")) + .stdout(contains("fixtures/dump_inputs/some_file.txt")); Ok(()) }