-
Notifications
You must be signed in to change notification settings - Fork 24
/
tumblr-photo-downloader.rb
82 lines (65 loc) · 2 KB
/
tumblr-photo-downloader.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
require 'rubygems'
require 'bundler'
require 'digest/md5'
Bundler.require
# Command-line arguments: the Tumblr site to read from and, optionally, the
# directory to save into (falls back to a directory named after the site).
site = ARGV[0]
directory = ARGV[1] || site

# Without a site there is nothing to do: show the usage text and exit with a
# failure status.
if site.to_s.empty?
  script = File.basename(__FILE__)
  puts '',
       "Usage: #{script} URL [directory to save in]",
       "eg. #{script} jamiew.tumblr.com",
       "eg. #{script} jamiew.tumblr.com ~/pictures/jamiew-tumblr-images/",
       ''
  exit 1
end
# FileUtils is used below; require it explicitly rather than relying on
# another library having loaded it already.
require 'fileutils'

# Number of photos fetched in parallel (size of each download batch).
concurrency = 8

# Create a log directory inside the download directory. mkdir_p also creates
# any missing parent directories, and is a no-op when they already exist.
logs = File.join(directory, 'logs')
FileUtils.mkdir_p(logs)

# Paging state for the API requests: posts per request and the offset of the
# first post to ask for.
num = 50
start = 0

puts "Downloading photos from #{site.inspect}, concurrency=#{concurrency} ..."
# Downloads one photo into +directory+ unless a same-sized copy is already
# there.
#
# A local file counts as already downloaded when its size equals the
# Content-Length reported by a HEAD request for the photo.
#
# photo_url - String URL of the photo.
# directory - String path of the directory the photo is saved into.
#
# Returns true if the photo was already present; false if it was downloaded
# or if the attempt failed (the error is printed, not raised).
def download_photo(photo_url, directory)
  # Drop any query string before taking the file name from the URL.
  path = File.join(directory, File.basename(photo_url.split('?')[0]))

  # File.exist? (not the removed File.exists?) so this works on Ruby >= 3.2.
  if File.exist?(path) && Mechanize.new.head(photo_url)['content-length'].to_i == File.size(path)
    puts "Already have #{photo_url}"
    true
  else
    puts "Saving photo #{photo_url}"
    Mechanize.new.get(photo_url).save_as(path)
    false
  end
rescue StandardError => e
  # Best effort: report the failure and carry on with the remaining photos.
  puts "Error getting file, #{e}"
  false
end

# Walk the site's photo posts one page of +num+ posts at a time, downloading
# every matching photo, until a page signals that there is nothing left to do.
loop do
  url = "http://#{site}/api/read?type=photo&num=#{num}&start=#{start}"
  page = Mechanize.new.get(url)
  doc = Nokogiri::XML.parse(page.body)
  xml = doc.to_s

  # Log the content that we are getting; the file is named after the MD5 of
  # the response, so identical responses overwrite the same log file.
  File.write(File.join(logs, Digest::MD5.hexdigest(xml)), xml)

  # Keep only the photo-url entries whose max-width attribute is 1280.
  images = (doc / 'post photo-url').select { |node| node['max-width'].to_i == 1280 }
  image_urls = images.map(&:content)

  already_had = 0
  image_urls.each_slice(concurrency) do |group|
    threads = group.map do |photo_url|
      Thread.new { download_photo(photo_url, directory) }
    end
    # Thread#value joins each thread and returns its result, so the tally is
    # done here on the main thread instead of mutating a shared counter from
    # several threads at once.
    already_had += threads.count(&:value)
  end

  puts "#{images.count} images found (num=#{num})"
  if images.count < num
    # Fewer matching images than requested is taken to mean the last page.
    puts "Our work here is done"
    break
  elsif already_had == images.count
    # Everything on this page was already on disk; older pages are assumed to
    # have been fetched by a previous run. Compared against images.count
    # because a page can contain more matching photo-url nodes than num.
    puts "Had already downloaded the last #{already_had} of #{images.count} most recent images - done."
    break
  end

  start += num
end