diff --git a/lib/robotstxt/parser.rb b/lib/robotstxt/parser.rb
index 644f0cf..43cfdef 100644
--- a/lib/robotstxt/parser.rb
+++ b/lib/robotstxt/parser.rb
@@ -20,7 +20,7 @@ module Robotstxt
   class Parser
 
     attr_accessor :robot_id
-    attr_reader :found, :body, :sitemaps, :rules
+    attr_reader :found, :body, :sitemaps, :rules, :crawl_delay
 
     # Initializes a new Robots::Robotstxtistance with robot_id option.
     #
@@ -31,6 +31,7 @@ def initialize(robot_id = nil)
       @robot_id = '*'
       @rules = []
       @sitemaps = []
+      @crawl_delay = 0
       @robot_id = robot_id.downcase if !robot_id.nil?
 
     end
@@ -139,28 +140,25 @@ def parse()
 
       @body = @body.downcase
 
       @body.each_line {|r|
-
         case r
 
           when /^#.+$/
-
-          when /^\s*user-agent\s*:.+$/
-
-            @rules << [ r.split(':')[1].strip, [], []]
-
-          when /^\s*useragent\s*:.+$/
-
-            @rules << [ r.split(':')[1].strip, [], []]
+          when /^\s*user-?agent\s*:.+$/
+            @rules << [ r.split(':')[1].strip, [], []]
+
+          when /^\s*crawl-?delay\s*:.+$/
+            @crawl_delay = r.split(':')[1].strip.to_i
+
           when /^\s*disallow\s*:.+$/
-            r = r.split(':')[1].strip
-            @rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0
+            r = r.split(':')[1].strip
+            @rules.last[1]<< r.gsub(/\*/,'.+') if r.length > 0
 
           when /^\s*allow\s*:.+$/
-            r = r.split(':')[1].strip
-            @rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0
+            r = r.split(':')[1].strip
+            @rules.last[2]<< r.gsub(/\*/,'.+') if r.length > 0
 
           when /^\s*sitemap\s*:.+$/
-            @sitemaps<< r.split(':')[1].strip + ((r.split(':')[2].nil?) ? '' : r.split(':')[2].strip) if r.length > 0
+            @sitemaps<< r.split(':')[1].strip + ((r.split(':')[2].nil?) ? '' : r.split(':')[2].strip) if r.length > 0
 
         end
diff --git a/robotstxt.gemspec b/robotstxt.gemspec
new file mode 100644
index 0000000..fcbc82e
--- /dev/null
+++ b/robotstxt.gemspec
@@ -0,0 +1,39 @@
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{robotstxt}
+  s.version = "0.6.0"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Simone Rinzivillo"]
+  s.date = %q{2013-03-03}
+  s.description = %q{ Robotstxt Parser allows you to the check the accessibility of URLs and get other data. Full support for the robots.txt RFC, wildcards and Sitemap: rules.
+}
+  s.email = %q{srinzivillo@gmail.com}
+  s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/robotstxt.rb", "lib/robotstxt/parser.rb"]
+  s.files = ["LICENSE.rdoc", "Manifest", "README.rdoc", "Rakefile", "lib/robotstxt.rb", "lib/robotstxt/parser.rb", "test/parser_test.rb", "test/robotstxt_test.rb", "robotstxt.gemspec"]
+  s.homepage = %q{http://www.simonerinzivillo.it}
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Robotstxt", "--main", "README.rdoc"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = %q{robotstxt}
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Robotstxt is an Ruby robots.txt file parser}
+  s.test_files = ["test/parser_test.rb", "test/robotstxt_test.rb"]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_development_dependency(%q, ["~> 0.8"])
+      s.add_development_dependency(%q, ["~> 3.1"])
+      s.add_development_dependency(%q, ["~> 1.11.0"])
+    else
+      s.add_dependency(%q, ["~> 0.8"])
+      s.add_dependency(%q, ["~> 3.1"])
+    end
+  else
+    s.add_dependency(%q, ["~> 0.8"])
+    s.add_dependency(%q, ["~> 3.1"])
+  end
+end
diff --git a/test/crawl_delay_test.rb b/test/crawl_delay_test.rb
new file mode 100644
index 0000000..3449435
--- /dev/null
+++ b/test/crawl_delay_test.rb
@@ -0,0 +1,26 @@
+$:.unshift(File.dirname(__FILE__) + '/../lib')
+
+require 'test/unit'
+require 'webmock/test_unit'
+require 'robotstxt'
+
+class TestParser < Test::Unit::TestCase
+
+
+  def setup
+    @client = Robotstxt::Parser.new('rubytest')
+    @client.get('http://www.simonerinzivillo.it')
+  end
+
+  def test_crawl_delay
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
+    client = Robotstxt::Parser.new('rubytest')
+    client.get('http://www.simonerinzivillo.it')
+
+    p client
+
+    assert client.crawl_delay == 100
+  end
+
+end
\ No newline at end of file
diff --git a/test/parser_test.rb b/test/parser_test.rb
index 0683203..ea010fe 100644
--- a/test/parser_test.rb
+++ b/test/parser_test.rb
@@ -6,11 +6,15 @@ class TestParser < Test::Unit::TestCase
 
   def setup
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
     @client = Robotstxt::Parser.new('rubytest')
     @client.get('http://www.simonerinzivillo.it')
   end
 
   def test_initialize
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
    client = Robotstxt::Parser.new('*')
     assert_instance_of Robotstxt::Parser, client
   end
diff --git a/test/robots_fixture.txt b/test/robots_fixture.txt
new file mode 100644
index 0000000..428c88e
--- /dev/null
+++ b/test/robots_fixture.txt
@@ -0,0 +1,24 @@
+HTTP/1.1 200 OK
+Server: nginx/0.7.65
+Date: Mon, 04 Mar 2013 19:25:33 GMT
+Content-Type: text/plain
+Content-Length: 248
+Last-Modified: Sat, 23 Jul 2011 20:04:52 GMT
+Connection: keep-alive
+Accept-Ranges: bytes
+
+User-agent: rubytest
+Disallow: /no-dir/
+Disallow: /no-page.php
+Disallow: /*-no-dir/
+Disallow: /dir/*.php
+Disallow: *?var
+Disallow: /dir/*?var
+
+# this is a test
+useragent: *
+disalow: /test/
+disallow: /mt4/
+sitemap: /sitemapxml.xml
+
+Crawl-Delay: 100
\ No newline at end of file
diff --git a/test/robotstxt_test.rb b/test/robotstxt_test.rb
index f824b98..d3bd033 100644
--- a/test/robotstxt_test.rb
+++ b/test/robotstxt_test.rb
@@ -5,13 +5,16 @@ class TestRobotstxt < Test::Unit::TestCase
 
-
   def test_allowed
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
     assert true == Robotstxt.allowed?('http://www.simonerinzivillo.it/', 'rubytest')
     assert false == Robotstxt.allowed?('http://www.simonerinzivillo.it/no-dir/', 'rubytest')
   end
 
   def test_sitemaps
+    raw_response_file = File.new("test/robots_fixture.txt")
+    stub_request(:get, "www.simonerinzivillo.it/robots.txt").to_return(raw_response_file)
     assert Robotstxt.sitemaps('http://www.simonerinzivillo.it/', 'rubytest').length > 0
   end
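
For reference, a minimal usage sketch of the crawl_delay reader this patch introduces. It is not part of the diff above; the host and robot_id are taken from the new tests, and the sleep call is just one plausible way a crawler might honour the value.

require 'robotstxt'

client = Robotstxt::Parser.new('rubytest')     # robot_id, as used in the new tests
client.get('http://www.simonerinzivillo.it')   # fetches and parses /robots.txt for this host

delay = client.crawl_delay                     # 100 with test/robots_fixture.txt; 0 when no Crawl-delay line is present
sleep(delay) if delay > 0                      # e.g. wait before the next request to the same host
puts client.sitemaps.inspect                   # existing readers keep working alongside the new one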