From b281f8bef9b9c5005706eeb24a90304d3eea7cfc Mon Sep 17 00:00:00 2001 From: Mark McDonald Date: Thu, 5 Apr 2018 12:27:45 -0400 Subject: [PATCH] Make crawl-within optional --- README.md | 23 ++++++++++++++++------- bin/doc_doc | 15 ++++++++++++--- lib/doc_doc/configuration/crawling.rb | 7 +++++++ 3 files changed, 35 insertions(+), 10 deletions(-) create mode 100644 lib/doc_doc/configuration/crawling.rb diff --git a/README.md b/README.md index a2f8e57..a131e59 100644 --- a/README.md +++ b/README.md @@ -48,26 +48,35 @@ $ doc_doc 'https://github.com/SeleniumHQ/selenium/wiki/Logging' > invalid_links. $ doc_doc 'https://github.com/SeleniumHQ/selenium/wiki/Logging' --crawl-within 'https://github.com/SeleniumHQ/selenium/wiki' > invalid_links.json ``` -When the `crawl-within` option is set, Doc doc will also check links on pages linked out to from the starting page, so long as the url of those pages are prefixed with the `crawl-within` value. +By default, Doc doc only checks links on the starting page. -By default, Doc doc only branches out once. +You can pass the `max-spiderings` option to crawl outwards from the starting site. Given a web site that has links like so: `https://www.example.com` -> `https://www.example.com/page1` -> `https://www.example.com/page2` -> `https://www.example.com/page3` -Doc doc will check links on both `https://www.example.com` and `https://www.example.com/page1`, but not the other pages. +` +$ doc_doc 'https://www.example.com' --max-spiderings 1 +` +would also include `https://www.example.com/page2`, but not `https://www.example.com/page3`. + +`$ doc_doc 'https://www.example.com' --max-spiderings 2` -You can override the max amount of "spidering" with the `max-spidering` option. 
+would also include `https://www.example.com/page3` -For example, -`$ doc_doc 'https://www.example.com' --crawl-within 'https://www.example.com' --max-spidering 2` +#### Ignoring links on external sites + +You can pass the `crawl-within` option to stop crawling or checking links after you land on a page outside the boundary. + +`$ doc_doc 'https://www.example.com' --max-spiderings 1 --crawl-within 'https://www.example.com'` would also include `https://www.example.com/page2`, but not `https://www.example.com/page3` -There is currently no concept of a "unique page". If `https://www.example.com/page1` links back to `https://www.example.com` and `max-spidering` is set to 2 or higher, then sick links on `https://www.example.com` will be included twice. +There is currently no concept of a "unique page". If `https://www.example.com/page1` links back to `https://www.example.com` and `max-spiderings` is set to 2 or higher, then sick links on `https://www.example.com` will be included twice. +Setting `crawl-within` without `max-spiderings` is undefined behavior. ## Development diff --git a/bin/doc_doc b/bin/doc_doc index 0721101..5bc442d 100755 --- a/bin/doc_doc +++ b/bin/doc_doc @@ -5,14 +5,23 @@ require 'doc_doc' options = DocDoc::Configuration::Options.new( ARGV[0], - DocDoc::HorseAndBuggy::DEFAULT_THROTTLE + DocDoc::HorseAndBuggy::DEFAULT_THROTTLE, + DocDoc::Configuration::Crawling::DEFAULT_OPTIONS ) OptionParser.new do |parser| parser.banner = 'Usage: doc_doc http://www.some-documentation.example [options]' - parser.on("-tTHROTTLE", "--throttle=THROTTLE", Integer, "Seconds to wait between requests") do |n| - options.throttle = n + parser.on("-tTHROTTLE", "--throttle=THROTTLE", Integer, "Seconds to wait between requests") do |throttle| + options.throttle = throttle + end + + parser.on("-bBOUNDARY", "--boundary=BOUNDARY", URI, "Only websites within these bounds will be checked. 
+ (Straight prefix check, beware protocol mismatches)") do |boundary| + options.crawling_options.boundary = boundary + end + + parser.on("-mMAX_SPIDERINGS", "--max-spiderings=MAX_SPIDERINGS", Integer, "Max links away from starting site") do |max_spiderings| + options.crawling_options.max_spiderings = max_spiderings end parser.on("-h", "--help", "Prints this help") do diff --git a/lib/doc_doc/configuration/crawling.rb b/lib/doc_doc/configuration/crawling.rb new file mode 100644 index 0000000..28e51c0 --- /dev/null +++ b/lib/doc_doc/configuration/crawling.rb @@ -0,0 +1,7 @@ +module DocDoc + module Configuration + module Crawling + Options = Struct.new(:boundary, :max_spiderings) + end + end +end \ No newline at end of file