diff --git a/Gemfile b/Gemfile index b4e2a20..2c56be5 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,5 @@ source "https://rubygems.org" gemspec + +gem "retryable", "~> 3.0" diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index fd213b2..833d4b2 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -46,6 +46,10 @@ option_parser = OptionParser.new do |opts| options[:wait_randomize] = true end + opts.on("--tries NUMBER", Integer, "Number of times to retry for non-fatal connection errors (Default is 20)") do |t| + options[:tries] = t + end + opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t| options[:exclude_filter] = t end diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index a52fb58..dc4363c 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -6,6 +6,7 @@ require 'fileutils' require 'cgi' require 'json' +require 'retryable' require_relative 'wayback_machine_downloader/tidy_bytes' require_relative 'wayback_machine_downloader/to_regex' require_relative 'wayback_machine_downloader/archive_api' @@ -34,6 +35,7 @@ def initialize params @threads_count = params[:threads_count].to_i @wait_seconds = params[:wait_seconds].to_i @wait_randomized = params[:wait_randomized] + @tries = params[:tries] ? params[:tries].to_i : 20 end def backup_name @@ -271,10 +273,12 @@ def download_file file_remote_info begin structure_dir_path dir_path open(file_path, "wb") do |file| + file_url_escaped = CGI.escape file_url begin - file_url_escaped = CGI.escape file_url - URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url_escaped}", "Accept-Encoding" => "plain") do |uri| - file.write(uri.read) + Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do + URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url_escaped}", "Accept-Encoding" => "plain") do |uri| + file.write(uri.read) + end end rescue OpenURI::HTTPError => e puts "#{file_url} # #{e}" diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index 75c5715..fde1b32 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -5,7 +5,9 @@ def get_raw_list_from_api url, page_index request_url += CGI.escape url request_url += parameters_for_api page_index - URI.open(request_url).read + Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do + URI.open(request_url).read + end end def parameters_for_api page_index