diff --git a/lib/robotstxt/parser.rb b/lib/robotstxt/parser.rb
index 644f0cf..43cfdef 100644
--- a/lib/robotstxt/parser.rb
+++ b/lib/robotstxt/parser.rb
@@ -20,7 +20,7 @@
module Robotstxt
class Parser
attr_accessor :robot_id
- attr_reader :found, :body, :sitemaps, :rules
+ attr_reader :found, :body, :sitemaps, :rules, :crawl_delay
# Initializes a new Robotstxt::Parser instance with the robot_id option.
#
@@ -31,6 +31,7 @@ def initialize(robot_id = nil)
@robot_id = '*'
@rules = []
@sitemaps = []
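+ # crawl-delay defaults to 0 (no delay) until a crawl-delay rule is parsed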
+ @crawl_delay = 0
@robot_id = robot_id.downcase if !robot_id.nil?
end
@@ -139,28 +140,25 @@ def parse()
@body = @body.downcase
@body.each_line {|r|
-
case r
when /^#.+$/
- when /^\s*user-agent\s*:.+$/
-
- @rules << [ r.split(':')[1].strip, [], []]
-
- when /^\s*useragent\s*:.+$/
-
- @rules << [ r.split(':')[1].strip, [], []]
-
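+ # a single pattern matches both the "user-agent" and "useragent" spellings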
+ when /^\s*user-?agent\s*:.+$/
+ @rules << [ r.split(':')[1].strip, [], []]
+
+ when /^\s*crawl-?delay\s*:.+$/
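+ # store crawl-delay as whole seconds; a single value is kept for the whole file (the last one parsed wins)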
+ @crawl_delay = r.split(':')[1].strip.to_i
+
when /^\s*disallow\s*:.+$/
- r = r.split(':')[1].strip
- @rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0
+ r = r.split(':')[1].strip
+ @rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0
when /^\s*allow\s*:.+$/
- r = r.split(':')[1].strip
- @rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0
+ r = r.split(':')[1].strip
+ @rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0
when /^\s*sitemap\s*:.+$/
- @sitemaps<< r.split(':')[1].strip + ((r.split(':')[2].nil?) ? '' : r.split(':')[2].strip) if r.length > 0
+ @sitemaps<< r.split(':')[1].strip + ((r.split(':')[2].nil?) ? '' : r.split(':')[2].strip) if r.length > 0
end
diff --git a/robotstxt.gemspec b/robotstxt.gemspec
new file mode 100644
index 0000000..fcbc82e
--- /dev/null
+++ b/robotstxt.gemspec
@@ -0,0 +1,40 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+ s.name = %q{robotstxt}
+ s.version = "0.6.0"
+
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+ s.authors = ["Simone Rinzivillo"]
+ s.date = %q{2013-03-03}
+ s.description = %q{Robotstxt Parser allows you to check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+}
+ s.email = %q{srinzivillo@gmail.com}
+ s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+ s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+ s.homepage = %q{http://www.simonerinzivillo.it}
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+ s.require_paths = ["lib"]
+ s.rubyforge_project = %q{robotstxt}
+ s.rubygems_version = %q{1.3.5}
+ s.summary = %q{Robotstxt is a Ruby robots.txt file parser}
+ s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+ if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+ s.specification_version = 3
+
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+ # NOTE: the dependency names here were lost; rake, rdoc and webmock are assumed from the version constraints
+ s.add_development_dependency(%q<rake>, ["~> 0.8"])
+ s.add_development_dependency(%q<rdoc>, ["~> 3.1"])
+ s.add_development_dependency(%q<webmock>, ["~> 1.11.0"])
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<rdoc>, ["~> 3.1"])
+ end
+ else
+ s.add_dependency(%q<rake>, ["~> 0.8"])
+ s.add_dependency(%q<rdoc>, ["~> 3.1"])
+ end
+end
diff --git a/test/crawl_delay_test.rb b/test/crawl_delay_test.rb
new file mode 100644
index 0000000..3449435
--- /dev/null
+++ b/test/crawl_delay_test.rb
@@ -0,0 +1,22 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+
+require 'test/unit'
+require 'webmock/test_unit'
+require 'robotstxt'
+
+class TestCrawlDelay < Test::Unit::TestCase
+
+  def setup
+    # stub the robots.txt request so the test never hits the network
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
+    @client = Robotstxt::Parser.new('rubytest')
+    @client.get('http://www.simonerinzivillo.it')
+  end
+
+  def test_crawl_delay
+    # the fixture declares "Crawl-Delay: 100"
+    assert_equal 100, @client.crawl_delay
+  end
+
+end
\ No newline at end of file
diff --git a/test/parser_test.rb b/test/parser_test.rb
index 0683203..ea010fe 100644
--- a/test/parser_test.rb
+++ b/test/parser_test.rb
@@ -6,11 +6,15 @@
class TestParser < Test::Unit::TestCase
def setup
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
@client = Robotstxt::Parser.new('rubytest')
@client.get('http://www.simonerinzivillo.it')
end
def test_initialize
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
client = Robotstxt::Parser.new('*')
assert_instance_of Robotstxt::Parser, client
end
diff --git a/test/robots_fixture.txt b/test/robots_fixture.txt
new file mode 100644
index 0000000..428c88e
--- /dev/null
+++ b/test/robots_fixture.txt
@@ -0,0 +1,24 @@
+HTTP/1.1 200 OK
+Server: nginx/0.7.65
+Date: Mon, 04 Mar 2013 19:25:33 GMT
+Content-Type: text/plain
+Content-Length: 248
+Last-Modified: Sat, 23 Jul 2011 20:04:52 GMT
+Connection: keep-alive
+Accept-Ranges: bytes
+
+User-agent: rubytest
+Disallow: /no-dir/
+Disallow: /no-page.php
+Disallow: /*-no-dir/
+Disallow: /dir/*.php
+Disallow: *?var
+Disallow: /dir/*?var
+
+# this is a test
+useragent: *
+disallow: /test/
+disallow: /mt4/
+sitemap: /sitemapxml.xml
+
+Crawl-Delay: 100
\ No newline at end of file
diff --git a/test/robotstxt_test.rb b/test/robotstxt_test.rb
index f824b98..d3bd033 100644
--- a/test/robotstxt_test.rb
+++ b/test/robotstxt_test.rb
@@ -5,13 +5,16 @@
class TestRobotstxt < Test::Unit::TestCase
-
def test_allowed
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
end
def test_sitemaps
+ raw_response_file = File.new("test/robots_fixture.txt")
+ stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
end