crawl delay support #4

Open · wants to merge 2 commits into master

28 changes: 13 additions & 15 deletions lib/robotstxt/parser.rb
@@ -20,7 +20,7 @@
module Robotstxt
class Parser
attr_accessor :robot_id
- attr_reader :found, :body, :sitemaps, :rules
+ attr_reader :found, :body, :sitemaps, :rules, :crawl_delay

# Initializes a new Robotstxt::Parser instance with <tt>robot_id</tt> option.
#
@@ -31,6 +31,7 @@ def initialize(robot_id = nil)
@robot_id = '*'
@rules = []
@sitemaps = []
+ @crawl_delay = 0
@robot_id = robot_id.downcase if !robot_id.nil?

end
@@ -139,28 +140,25 @@ def parse()
@body = @body.downcase

@body.each_line {|r|

case r
when /^#.+$/

- when /^\s*user-agent\s*:.+$/
- @rules << [ r.split(':')[1].strip, [], []]

- when /^\s*useragent\s*:.+$/
- @rules << [ r.split(':')[1].strip, [], []]

+ when /^\s*user-?agent\s*:.+$/
+ @rules << [ r.split(':')[1].strip, [], []]

+ when /^\s*crawl-?delay\s*:.+$/
+ @crawl_delay = r.split(':')[1].strip.to_i

when /^\s*disallow\s*:.+$/
r = r.split(':')[1].strip
@rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0

when /^\s*allow\s*:.+$/
r = r.split(':')[1].strip
@rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0

when /^\s*sitemap\s*:.+$/
@sitemaps<< r.split(':')[1].strip + ((r.split(':')[2].nil?) ? '' : r.split(':')[2].strip) if r.length > 0

end

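With this change, callers can read the parsed Crawl-delay value through the new accessor, alongside rules and sitemaps. A minimal usage sketch; the host and robot_id are simply the ones used in the tests below, not anything the library requires:

require 'robotstxt'

client = Robotstxt::Parser.new('rubytest')
client.get('http://www.simonerinzivillo.it')  # fetches and parses /robots.txt

delay = client.crawl_delay                    # 0 when robots.txt sets no Crawl-delay
sleep(delay) if delay > 0                     # throttle before the next request
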
39 changes: 39 additions & 0 deletions robotstxt.gemspec
@@ -0,0 +1,39 @@
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
s.name = %q{robotstxt}
s.version = "0.6.0"

s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
s.authors = ["Simone Rinzivillo"]
s.date = %q{2013-03-03}
s.description = %q{ Robotstxt Parser allows you to check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
}
s.email = %q{[email protected]}
s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
s.homepage = %q{http://www.simonerinzivillo.it}
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
s.require_paths = ["lib"]
s.rubyforge_project = %q{robotstxt}
s.rubygems_version = %q{1.3.5}
s.summary = %q{Robotstxt is a Ruby robots.txt file parser}
s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]

if s.respond_to? :specification_version then
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
s.specification_version = 3

if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
s.add_development_dependency(%q<rake>, ["~> 0.8"])
s.add_development_dependency(%q<echoe>, ["~> 3.1"])
s.add_development_dependency(%q<webmock>, ["~> 1.11.0"])
else
s.add_dependency(%q<rake>, ["~> 0.8"])
s.add_dependency(%q<echoe>, ["~> 3.1"])
end
else
s.add_dependency(%q<rake>, ["~> 0.8"])
s.add_dependency(%q<echoe>, ["~> 3.1"])
end
end
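
The new gemspec can be sanity-checked without cutting a release; a short sketch using the stock RubyGems API, assuming it is run from the repository root:

require 'rubygems'

spec = Gem::Specification.load('robotstxt.gemspec')
puts spec.name        # => robotstxt
puts spec.version     # => 0.6.0
puts spec.files.size  # number of files that would be packaged
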
26 changes: 26 additions & 0 deletions test/crawl_delay_test.rb
@@ -0,0 +1,26 @@
$:.unshift(File.dirname(__FILE__) + '/../lib')

require 'test/unit'
require 'webmock/test_unit'
require 'robotstxt'

class CrawlDelayTest < Test::Unit::TestCase


def setup
raw_response_file = File.new("test/robots_fixture.txt")
stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
@client = Robotstxt::Parser.new('rubytest')
@client.get('http://www.simonerinzivillo.it')
end

def test_crawl_delay
raw_response_file = File.new("test/robots_fixture.txt")
stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
client = Robotstxt::Parser.new('rubytest')
client.get('http://www.simonerinzivillo.it')

assert client.crawl_delay == 100
end

end
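
Note that the gemspec above lists only test/parser_test.rb and test/robotstxt_test.rb in s.files and s.test_files, so this new file is neither packaged nor picked up there. It can still be run directly (with the test-unit and webmock gems installed) with something like ruby test/crawl_delay_test.rb from the repository root, since the file already puts lib on the load path.
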
4 changes: 4 additions & 0 deletions test/parser_test.rb
@@ -6,11 +6,15 @@
class TestParser < Test::Unit::TestCase

def setup
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
@client = Robotstxt::Parser.new('rubytest')
@client.get('http://www.simonerinzivillo.it')
end

def test_initialize
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
client = Robotstxt::Parser.new('*')
assert_instance_of Robotstxt::Parser, client
end
24 changes: 24 additions & 0 deletions test/robots_fixture.txt
@@ -0,0 +1,24 @@
HTTP/1.1 200 OK
Server: nginx/0.7.65
Date: Mon, 04 Mar 2013 19:25:33 GMT
Content-Type: text/plain
Content-Length: 248
Last-Modified: Sat, 23 Jul 2011 20:04:52 GMT
Connection: keep-alive
Accept-Ranges: bytes

User-agent: rubytest
Disallow: /no-dir/
Disallow: /no-page.php
Disallow: /*-no-dir/
Disallow: /dir/*.php
Disallow: *?var
Disallow: /dir/*?var

# this is a test
useragent: *
disalow: /test/
disallow: /mt4/
sitemap: /sitemapxml.xml

Crawl-Delay: 100
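
Read against the parser changes above, this fixture should produce roughly the following state. The values are inferred from the diff rather than from running the suite, and the misspelled disalow line is expected to be skipped:

client = Robotstxt::Parser.new('rubytest')
client.get('http://www.simonerinzivillo.it')  # assumes a webmock stub like the ones in the tests

client.crawl_delay  # => 100, from the trailing Crawl-Delay line
client.sitemaps     # => ["/sitemapxml.xml"]
client.rules.last   # => ["*", ["/mt4/"], []]; "disalow: /test/" matches no rule and is ignored
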
5 changes: 4 additions & 1 deletion test/robotstxt_test.rb
@@ -5,13 +5,16 @@

class TestRobotstxt < Test::Unit::TestCase


def test_allowed
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
end

def test_sitemaps
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
end
