diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..156e487 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# See https://help.github.com/articles/ignoring-files for more about ignoring files. +# +# If you find yourself ignoring temporary files generated by your text editor +# or operating system, you probably want to add a global ignore instead: +# git config --global core.excludesfile '~/.gitignore_global' + +# Ignore bundler config. +/.bundle + +# Ignore the default SQLite database. +/db/*.sqlite3 +/db/*.sqlite3-journal + +# Ignore all logfiles and tempfiles. +/log/*.log +/tmp +vendor/all_urls.txt +vendor/domain_lookups.csv +.env diff --git a/.rspec b/.rspec new file mode 100644 index 0000000..4e1e0d2 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--color diff --git a/.ruby-gemset b/.ruby-gemset new file mode 100644 index 0000000..2bd3253 --- /dev/null +++ b/.ruby-gemset @@ -0,0 +1 @@ +link_scraper \ No newline at end of file diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..867e7ce --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +ruby-2.0.0-p195 \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..20c3831 --- /dev/null +++ b/Gemfile @@ -0,0 +1,35 @@ +source 'https://rubygems.org' + +gem "decent_exposure" +gem "decent_generators" +gem 'dotenv-rails' +gem "haml" +gem "haml-rails" +gem "librato-logreporter" +gem "pg" +gem "pry" +gem "pry-rails" +gem "twitter-bootstrap-rails" +gem 'addressable', require: 'addressable/uri' +gem 'coffee-rails', '~> 4.0.0' +gem 'jbuilder', '~> 1.2' +gem 'jquery-rails' +gem 'nokogiri' +gem 'rails' +gem 'sass-rails', '~> 4.0.0' +gem 'stringex' +gem 'turbolinks' +gem 'typhoeus' +gem 'uglifier', '>= 1.3.0' +gem 'whois' + +group :test, :development do + gem "factory_girl" + gem "fivemat" + gem "rspec-rails" + gem "rspec" +end + +group :test do + gem "shoulda-matchers" +end diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 
0000000..ad1ec3b --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,193 @@ +GEM + remote: https://rubygems.org/ + specs: + actionmailer (4.0.2) + actionpack (= 4.0.2) + mail (~> 2.5.4) + actionpack (4.0.2) + activesupport (= 4.0.2) + builder (~> 3.1.0) + erubis (~> 2.7.0) + rack (~> 1.5.2) + rack-test (~> 0.6.2) + activemodel (4.0.2) + activesupport (= 4.0.2) + builder (~> 3.1.0) + activerecord (4.0.2) + activemodel (= 4.0.2) + activerecord-deprecated_finders (~> 1.0.2) + activesupport (= 4.0.2) + arel (~> 4.0.0) + activerecord-deprecated_finders (1.0.3) + activesupport (4.0.2) + i18n (~> 0.6, >= 0.6.4) + minitest (~> 4.2) + multi_json (~> 1.3) + thread_safe (~> 0.1) + tzinfo (~> 0.3.37) + addressable (2.3.5) + arel (4.0.1) + atomic (1.1.14) + builder (3.1.4) + coderay (1.1.0) + coffee-rails (4.0.1) + coffee-script (>= 2.2.0) + railties (>= 4.0.0, < 5.0) + coffee-script (2.2.0) + coffee-script-source + execjs + coffee-script-source (1.6.3) + decent_exposure (2.3.0) + decent_generators (0.0.1) + rails (~> 4.0.0) + diff-lcs (1.2.5) + dotenv (0.9.0) + dotenv-rails (0.9.0) + dotenv (= 0.9.0) + erubis (2.7.0) + ethon (0.6.2) + ffi (>= 1.3.0) + mime-types (~> 1.18) + execjs (2.0.2) + factory_girl (4.3.0) + activesupport (>= 3.0.0) + ffi (1.9.3) + fivemat (1.2.1) + haml (4.0.4) + tilt + haml-rails (0.5.3) + actionpack (>= 4.0.1) + activesupport (>= 4.0.1) + haml (>= 3.1, < 5.0) + railties (>= 4.0.1) + hike (1.2.3) + i18n (0.6.9) + jbuilder (1.5.3) + activesupport (>= 3.0.0) + multi_json (>= 1.2.0) + jquery-rails (3.0.4) + railties (>= 3.0, < 5.0) + thor (>= 0.14, < 2.0) + json (1.8.1) + librato-logreporter (0.2.1) + mail (2.5.4) + mime-types (~> 1.16) + treetop (~> 1.4.8) + method_source (0.8.2) + mime-types (1.25.1) + mini_portile (0.5.2) + minitest (4.7.5) + multi_json (1.8.2) + nokogiri (1.6.1) + mini_portile (~> 0.5.0) + pg (0.17.1) + polyglot (0.3.3) + pry (0.9.12.4) + coderay (~> 1.0) + method_source (~> 0.8) + slop (~> 3.4) + pry-rails (0.3.2) + pry (>= 0.9.10) + rack 
(1.5.2) + rack-test (0.6.2) + rack (>= 1.0) + rails (4.0.2) + actionmailer (= 4.0.2) + actionpack (= 4.0.2) + activerecord (= 4.0.2) + activesupport (= 4.0.2) + bundler (>= 1.3.0, < 2.0) + railties (= 4.0.2) + sprockets-rails (~> 2.0.0) + railties (4.0.2) + actionpack (= 4.0.2) + activesupport (= 4.0.2) + rake (>= 0.8.7) + thor (>= 0.18.1, < 2.0) + rake (10.1.1) + rspec (2.14.1) + rspec-core (~> 2.14.0) + rspec-expectations (~> 2.14.0) + rspec-mocks (~> 2.14.0) + rspec-core (2.14.7) + rspec-expectations (2.14.4) + diff-lcs (>= 1.1.3, < 2.0) + rspec-mocks (2.14.4) + rspec-rails (2.14.0) + actionpack (>= 3.0) + activesupport (>= 3.0) + railties (>= 3.0) + rspec-core (~> 2.14.0) + rspec-expectations (~> 2.14.0) + rspec-mocks (~> 2.14.0) + sass (3.2.13) + sass-rails (4.0.1) + railties (>= 4.0.0, < 5.0) + sass (>= 3.1.10) + sprockets-rails (~> 2.0.0) + shoulda-matchers (2.4.0) + activesupport (>= 3.0.0) + slop (3.4.7) + sprockets (2.10.1) + hike (~> 1.2) + multi_json (~> 1.0) + rack (~> 1.0) + tilt (~> 1.1, != 1.3.0) + sprockets-rails (2.0.1) + actionpack (>= 3.0) + activesupport (>= 3.0) + sprockets (~> 2.8) + stringex (2.1.2) + thor (0.18.1) + thread_safe (0.1.3) + atomic + tilt (1.4.1) + treetop (1.4.15) + polyglot + polyglot (>= 0.3.1) + turbolinks (2.1.0) + coffee-rails + twitter-bootstrap-rails (2.2.8) + actionpack (>= 3.1) + execjs + rails (>= 3.1) + railties (>= 3.1) + typhoeus (0.6.7) + ethon (~> 0.6.2) + tzinfo (0.3.38) + uglifier (2.4.0) + execjs (>= 0.3.0) + json (>= 1.8.0) + whois (3.4.2) + +PLATFORMS + ruby + +DEPENDENCIES + addressable + coffee-rails (~> 4.0.0) + decent_exposure + decent_generators + dotenv-rails + factory_girl + fivemat + haml + haml-rails + jbuilder (~> 1.2) + jquery-rails + librato-logreporter + nokogiri + pg + pry + pry-rails + rails + rspec + rspec-rails + sass-rails (~> 4.0.0) + shoulda-matchers + stringex + turbolinks + twitter-bootstrap-rails + typhoeus + uglifier (>= 1.3.0) + whois diff --git a/Procfile b/Procfile new file mode 
100644 index 0000000..d22cc20 --- /dev/null +++ b/Procfile @@ -0,0 +1 @@ +worker: bundle exec rake pages:work diff --git a/README.md b/README.md new file mode 100644 index 0000000..3632520 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +### Rap Genius Trackback Scraper + +This is the tool we used to scrape 178k URLs in 15 minutes in order to find which pages were hosting potentially spammy Rap Genius links. Given a list of URLs to scrape, it creates aggregate information that identifies the spammiest sites for manual review. + +For more details on the motivation and background for this repository, check out [the blog post on Rap Genius](http://news.rapgenius.com/Rap-genius-founders-rap-genius-is-back-on-google-lyrics) + +### Setup + +You can run the scrape process using a set of sample data in vendor/urls.txt. To get started: + +```sh +$ bundle install && rake db:create db:migrate urls:import +$ gem install foreman +$ mkdir tmp +$ foreman start worker +``` + +Then, once the pages have all been scraped (i.e., `Page.unscraped.count == 0`): + +```ruby +# from the console +Page.write_report! +``` + +### License +MIT diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..fe32c9d --- /dev/null +++ b/Rakefile @@ -0,0 +1,6 @@ +# Add your own tasks in files placed in lib/tasks ending in .rake, +# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake. + +require File.expand_path('../config/application', __FILE__) + +TrackbackScraper::Application.load_tasks diff --git a/app/assets/images/.keep b/app/assets/images/.keep new file mode 100644 index 0000000..e69de29 diff --git a/app/assets/javascripts/application.js b/app/assets/javascripts/application.js new file mode 100644 index 0000000..3885622 --- /dev/null +++ b/app/assets/javascripts/application.js @@ -0,0 +1,17 @@ +// This is a manifest file that'll be compiled into application.js, which will include all the files +// listed below. 
+// +// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts, +// or vendor/assets/javascripts of plugins, if any, can be referenced here using a relative path. +// +// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the +// compiled file. +// +// Read Sprockets README (https://github.com/sstephenson/sprockets#sprockets-directives) for details +// about supported directives. +// +//= require jquery +//= require jquery_ujs +//= require twitter/bootstrap +//= require turbolinks +//= require_tree . diff --git a/app/assets/javascripts/bootstrap.js.coffee b/app/assets/javascripts/bootstrap.js.coffee new file mode 100644 index 0000000..9440679 --- /dev/null +++ b/app/assets/javascripts/bootstrap.js.coffee @@ -0,0 +1,3 @@ +jQuery -> + $("a[rel~=popover], .has-popover").popover() + $("a[rel~=tooltip], .has-tooltip").tooltip() diff --git a/app/assets/javascripts/static.js.coffee b/app/assets/javascripts/static.js.coffee new file mode 100644 index 0000000..24f83d1 --- /dev/null +++ b/app/assets/javascripts/static.js.coffee @@ -0,0 +1,3 @@ +# Place all the behaviors and hooks related to the matching controller here. +# All this logic will automatically be available in application.js. +# You can use CoffeeScript in this file: http://coffeescript.org/ diff --git a/app/assets/stylesheets/application.css b/app/assets/stylesheets/application.css new file mode 100644 index 0000000..3192ec8 --- /dev/null +++ b/app/assets/stylesheets/application.css @@ -0,0 +1,13 @@ +/* + * This is a manifest file that'll be compiled into application.css, which will include all the files + * listed below. + * + * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets, + * or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path. 
+ * + * You're free to add application-wide styles to this file and they'll appear at the top of the + * compiled file, but it's generally better to create a new file per style scope. + * + *= require_self + *= require_tree . + */ diff --git a/app/assets/stylesheets/bootstrap_and_overrides.css b/app/assets/stylesheets/bootstrap_and_overrides.css new file mode 100644 index 0000000..131fcfd --- /dev/null +++ b/app/assets/stylesheets/bootstrap_and_overrides.css @@ -0,0 +1,7 @@ +/* + =require twitter-bootstrap-static/bootstrap + + Use Font Awesome icons (default) + To use Glyphicons sprites instead of Font Awesome, replace with "require twitter-bootstrap-static/sprites" + =require twitter-bootstrap-static/fontawesome + */ \ No newline at end of file diff --git a/app/assets/stylesheets/global.sass b/app/assets/stylesheets/global.sass new file mode 100644 index 0000000..6cf3f92 --- /dev/null +++ b/app/assets/stylesheets/global.sass @@ -0,0 +1,2 @@ +.container-fluid + margin-top: 50px diff --git a/app/assets/stylesheets/static.css.scss b/app/assets/stylesheets/static.css.scss new file mode 100644 index 0000000..5a803c8 --- /dev/null +++ b/app/assets/stylesheets/static.css.scss @@ -0,0 +1,3 @@ +// Place all the styles related to the static controller here. +// They will automatically be included in application.css. +// You can use Sass (SCSS) here: http://sass-lang.com/ diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb new file mode 100644 index 0000000..d83690e --- /dev/null +++ b/app/controllers/application_controller.rb @@ -0,0 +1,5 @@ +class ApplicationController < ActionController::Base + # Prevent CSRF attacks by raising an exception. + # For APIs, you may want to use :null_session instead. 
# Encoding utilities. The module extends itself, so its methods can be called
# directly (StringHelper.coerce_to_utf8) as well as mixed into other objects.
module StringHelper
  extend self

  # Returns a copy of +input+ that is guaranteed to be valid UTF-8.
  #
  # The bytes are first reinterpreted as UTF-8. If that yields a valid string
  # it is returned untouched; otherwise the bytes are treated as binary and
  # transcoded to UTF-8, with every byte that has no UTF-8 mapping replaced by
  # the replacement character. +input+ itself is never modified.
  def coerce_to_utf8(input)
    candidate = input.dup
    candidate.force_encoding("UTF-8")

    if candidate.valid_encoding?
      candidate
    else
      candidate
        .force_encoding("BINARY")
        .encode("UTF-8", invalid: :replace, undef: :replace)
    end
  end
end
+ end + + def notify_success(scraped) + mail to: DEFAULT_TO_ADDRESS, + subject: "Finished #{scraped} pages" + end + + def notify_error(message) + mail to: DEFAULT_TO_ADDRESS, + subject: "An error happened: #{message.first(50)}..." + end + + def report(report, subject: "Completed Report", to: DEFAULT_TO_ADDRESS) + attachments['report.csv'] = report + + mail to: to, + subject: subject + end +end diff --git a/app/models/.keep b/app/models/.keep new file mode 100644 index 0000000..e69de29 diff --git a/app/models/concerns/.keep b/app/models/concerns/.keep new file mode 100644 index 0000000..e69de29 diff --git a/app/models/page.rb b/app/models/page.rb new file mode 100644 index 0000000..2b63119 --- /dev/null +++ b/app/models/page.rb @@ -0,0 +1,195 @@ +class Page < ActiveRecord::Base + scope :locked, -> { where("locked_at IS NOT NULL") } + scope :not_locked, -> { where(locked_at: nil) } + def locked? + locked_at? + end + + scope :unscraped, -> { where(fetched: false).not_locked } + scope :scraped, -> { where(fetched: true) } + scope :errored, -> { scraped.where('error_code IS NOT NULL OR error_message IS NOT NULL') } + def errored? + error_code.present? || error_message.present? + end + + scope :timeout, -> { scraped.where("error_message = 'timeout'") } + + scope :not_errored, -> { scraped.where(error_code: nil, error_message: nil) } + + serialize :links, Hash + + def self.forget_everything! 
+ update_all(reset_attributes.merge(updated_at: Time.now)) + end + + def self.reset_attributes + { + fetched: false, + locked_at: nil, + links: nil, + total_links_to_rg: nil, + count_of_links_to_rg_song_pages: nil, + count_of_links_with_rg_format: nil, + count_of_annotation_links: nil, + count_of_links_with_text_ending_in_lyrics: nil, + error_code: nil, + error_message: nil, + count_of_link_clumps_fuzzy_match: nil, + largest_link_clump_size_fuzzy_match: nil, + count_of_link_clumps: nil, + largest_link_clump_size: nil + } + end + delegate :reset_attributes, to: 'self.class' + + def self.error_code_summary + scraped.where('error_code IS NOT NULL').pluck(:error_code).each_with_object(Hash.new(0)) do |code, h| + h[code] += 1 + end + end + + def self.reserve_batch_for_scraping(limit) + pages_subquery = unscraped.limit(limit).order(:id).select(:id).lock(true).to_sql + db_time_now = Time.now.utc + + find_by_sql [<<-SQL, db_time_now, db_time_now] + UPDATE pages SET locked_at = ?, updated_at = ? + WHERE id IN (#{pages_subquery}) + RETURNING * + SQL + end + + def self.hydra + @hydra ||= Typhoeus::Hydra.new(max_concurrency: ENV.fetch('HTTP_CONCURRENCY', 200).to_i) + end + delegate :hydra, to: 'self.class' + + def self.scrape_batch(batch_size) + pages = reserve_batch_for_scraping(batch_size) + + pages.each do |page| + hydra.queue(request = page.new_request) + + request.on_complete do |response| + page.scraped!(response) + + yield(response) if block_given? + end + end + + hydra.run + rescue GracefulShutdown + hydra.abort + pages.each { |p| p.unlock! if p.locked? } + raise + rescue => e + Rails.logger.error([e.message] + e.backtrace) + NotificationMailer.notify_error(e.message).deliver! if NotificationMailer.configured? 
+ + raise e + end + + def self.scrape_batch_with_open_uri(batch_size) + OpenUriScrape.new(batch_size).scrape_batch + end + + CSV_COLUMNS = %w(url count_of_link_clumps count_of_link_clumps_fuzzy_match count_of_links_with_rg_format count_of_links_with_text_ending_in_lyrics count_of_links_to_rg_song_pages count_of_annotation_links total_links_to_rg largest_link_clump_size largest_link_clump_size_fuzzy_match) + def self.write_report!(file_path = Rails.root.join('tmp/report.csv')) + File.open(file_path, 'wb') do |file| + file.write(generate_report) + end + end + + def self.generate_report(limit = nil) + CSV.generate do |csv| + csv << CSV_COLUMNS + + not_errored.limit(limit).order((CSV_COLUMNS - ['url']).map { |c| "#{c} desc" }.join(', ')).each do |page| + csv << CSV_COLUMNS.map { |c| page.__send__(c) } + end + end + end + + def self.send_email_report + raise unless NotificationMailer.configured? + + NotificationMailer.report(generate_report).deliver! + end + + def self.send_abbreviated_report(limit = 20_000) + raise unless NotificationMailer.configured? + + NotificationMailer.report(generate_report(limit), subject: "Abbreviated report").deliver! + end + + def scraped!(response, raise_errors = false) + Librato.measure('scrape.request.time', response.total_time * 1000) unless response.total_time.zero? + + return handle_scrape_error(response) unless response.success? + + begin + scraped_attributes = parse_and_find_rg_links(response.body) + rescue => e + Librato.increment('scrape.error') + tap(&:mark_fetched).update_attributes!(error_message: { :exception => e }.to_yaml) + raise e if raise_errors + else + Librato.increment('scrape.success') + + tap(&:mark_fetched).update_attributes!(scraped_attributes.merge!(error_code: nil, error_message: nil)) + end + end + + def handle_scrape_error(response) + Librato.increment('scrape.error') + + mark_fetched + + if response.timed_out? 
+ Librato.increment('scrape.error.timeout') + update_attributes!(error_message: "timeout") + elsif response.code == 0 + Librato.increment('scrape.error.unknown') + update_attributes!(error_message: response.return_message) + else + Librato.increment('scrape.error.http') + update_attributes!(error_message: nil, error_code: response.code) + end + end + + def scrape! + hydra.queue(request = new_request) + request.on_complete { |r| scraped!(r, :raise_errors) } + hydra.run + end + + def new_request + Typhoeus::Request.new(url, followlocation: true, timeout: ENV.fetch('HTTP_TIMEOUT', 20).to_i, headers: request_headers) + end + + def request_headers + { 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', + 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9', + 'Cache-Control' => 'max-age=0', 'Accept-Language' => 'en-US,en;q=0.8'} + end + + def mark_fetched + self.attributes = reset_attributes + + self.fetched = true + self.locked_at = nil + end + + def unlock! + self.locked_at = nil + save! + end + + private + + def parse_and_find_rg_links(body) + Librato.measure('parse_and_find_rg_links') do + PageParser.new(body).parse_and_find_rg_links + end + end +end diff --git a/app/views/layouts/application.html.haml b/app/views/layouts/application.html.haml new file mode 100644 index 0000000..1c34d31 --- /dev/null +++ b/app/views/layouts/application.html.haml @@ -0,0 +1,33 @@ +!!! 
5 +- application_name = Rails.application.class.to_s.split('::').first +%html(lang="en-US" class="#{controller_name}-#{action_name}") + %head + %title + - if content_for?(:title) + = yield (:title) + - else + = application_name + = stylesheet_link_tag "application", :media => "all" + = javascript_include_tag "application" + = csrf_meta_tags + %body + .navbar.navbar-fixed-top + .navbar-inner + .container + %a.brand{href: '/'}= application_name + .container-fluid + .row-fluid + .span8.offset3 + - flash.keys.each do |key| + .alert{ class: "alert-#{key}" } + %a.close{ href: '#', "data-dismiss" => "alert" } x + %h4.alert-heading= key.capitalize + - if flash[key].respond_to?(:each) + - flash[key].each do |msg| + = msg + %br + - else + = flash[key] + - flash.delete(key) + = yield + = yield(:page_javascript) if content_for? :page_javascript diff --git a/app/views/notification_mailer/notify_error.html.haml b/app/views/notification_mailer/notify_error.html.haml new file mode 100644 index 0000000..e69de29 diff --git a/app/views/notification_mailer/notify_success.html.haml b/app/views/notification_mailer/notify_success.html.haml new file mode 100644 index 0000000..e69de29 diff --git a/app/views/notification_mailer/report.html.haml b/app/views/notification_mailer/report.html.haml new file mode 100644 index 0000000..e69de29 diff --git a/app/views/static/index.html.haml b/app/views/static/index.html.haml new file mode 100644 index 0000000..52667f9 --- /dev/null +++ b/app/views/static/index.html.haml @@ -0,0 +1 @@ +%h1 Hello, link_scraper! 
# Application-wide Rails configuration for the trackback scraper.
module TrackbackScraper
  class Application < Rails::Application
    # Settings in config/environments/* take precedence over those specified here.
    # Application configuration should go into files in config/initializers
    # -- all .rb files in that directory are automatically loaded.

    # Set Time.zone default to the specified zone and make Active Record auto-convert to this zone.
    # Run "rake -D time" for a list of tasks for finding time zone names. Default is UTC.
    # config.time_zone = 'Central Time (US & Canada)'

    # The default locale is :en and all translations from config/locales/*.rb,yml are auto loaded.
    # config.i18n.load_path += Dir[Rails.root.join('my', 'locales', '*.{rb,yml}').to_s]
    # config.i18n.default_locale = :de

    # Autoload classes from lib/. The path is anchored to the application root
    # so that lookups do not depend on the process's current working directory.
    config.autoload_paths += %W(#{config.root}/lib)
  end
end
This slows down response time but is perfect for development + # since you don't have to restart the web server when you make code changes. + config.cache_classes = false + + # Do not eager load code on boot. + config.eager_load = false + + # Show full error reports and disable caching. + config.consider_all_requests_local = true + config.action_controller.perform_caching = false + + # Raise an error if the mailer can't send, and deliver mail over SMTP. + config.action_mailer.raise_delivery_errors = true + config.action_mailer.perform_deliveries = true + config.action_mailer.delivery_method = :smtp + + # Print deprecation notices to the Rails logger. + config.active_support.deprecation = :log + + # Raise an error on page load if there are pending migrations + config.active_record.migration_error = :page_load + + # Debug mode disables concatenation and preprocessing of assets. + # This option may cause significant delays in view rendering with a large + # number of complex assets. + config.assets.debug = true + + config.action_mailer.default_url_options = { host: 'link_scraper_dev.com' } +end diff --git a/config/environments/production.rb b/config/environments/production.rb new file mode 100644 index 0000000..0d9ebe9 --- /dev/null +++ b/config/environments/production.rb @@ -0,0 +1,80 @@ +TrackbackScraper::Application.configure do + # Settings specified here will take precedence over those in config/application.rb. + + # Code is not reloaded between requests. + config.cache_classes = true + + # Eager load code on boot. This eager loads most of Rails and + # your application in memory, allowing both thread web servers + # and those relying on copy on write to perform better. + # Rake tasks automatically ignore this option for performance. + config.eager_load = true + + # Full error reports are disabled and caching is turned on. 
+ config.consider_all_requests_local = false + config.action_controller.perform_caching = true + + # Enable Rack::Cache to put a simple HTTP cache in front of your application + # Add `rack-cache` to your Gemfile before enabling this. + # For large-scale production use, consider using a caching reverse proxy like nginx, varnish or squid. + # config.action_dispatch.rack_cache = true + + # Disable Rails's static asset server (Apache or nginx will already do this). + config.serve_static_assets = false + + # Compress JavaScripts and CSS. + config.assets.js_compressor = :uglifier + # config.assets.css_compressor = :sass + + # Do not fallback to assets pipeline if a precompiled asset is missed. + config.assets.compile = false + + # Generate digests for assets URLs. + config.assets.digest = true + + # Version of your assets, change this if you want to expire all your assets. + config.assets.version = '1.0' + + # Specifies the header that your server uses for sending files. + # config.action_dispatch.x_sendfile_header = "X-Sendfile" # for apache + # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for nginx + + # Force all access to the app over SSL, use Strict-Transport-Security, and use secure cookies. + # config.force_ssl = true + + # Set to :debug to see everything in the log. + config.log_level = :info + + # Prepend all log lines with the following tags. + # config.log_tags = [ :subdomain, :uuid ] + + # Use a different logger for distributed setups. + # config.logger = ActiveSupport::TaggedLogging.new(SyslogLogger.new) + + # Use a different cache store in production. + # config.cache_store = :mem_cache_store + + # Enable serving of images, stylesheets, and JavaScripts from an asset server. + # config.action_controller.asset_host = "http://assets.example.com" + + # Precompile additional assets. + # application.js, application.css, and all non-JS/CSS in app/assets folder are already added. 
+ # config.assets.precompile += %w( search.js ) + + # Ignore bad email addresses and do not raise email delivery errors. + # Set this to true and configure the email server for immediate delivery to raise delivery errors. + # config.action_mailer.raise_delivery_errors = false + + # Enable locale fallbacks for I18n (makes lookups for any locale fall back to + # the I18n.default_locale when a translation can not be found). + config.i18n.fallbacks = true + + # Send deprecation notices to registered listeners. + config.active_support.deprecation = :notify + + # Disable automatic flushing of the log to improve performance. + # config.autoflush_log = false + + # Use default logging formatter so that PID and timestamp are not suppressed. + config.log_formatter = ::Logger::Formatter.new +end diff --git a/config/environments/test.rb b/config/environments/test.rb new file mode 100644 index 0000000..4649674 --- /dev/null +++ b/config/environments/test.rb @@ -0,0 +1,36 @@ +TrackbackScraper::Application.configure do + # Settings specified here will take precedence over those in config/application.rb. + + # The test environment is used exclusively to run your application's + # test suite. You never need to work with it otherwise. Remember that + # your test database is "scratch space" for the test suite and is wiped + # and recreated between test runs. Don't rely on the data there! + config.cache_classes = true + + # Do not eager load code on boot. This avoids loading your whole application + # just for the purpose of running a single test. If you are using a tool that + # preloads Rails for running tests, you may have to set it to true. + config.eager_load = false + + # Configure static asset server for tests with Cache-Control for performance. + config.serve_static_assets = true + config.static_cache_control = "public, max-age=3600" + + # Show full error reports and disable caching. 
+ config.consider_all_requests_local = true + config.action_controller.perform_caching = false + + # Raise exceptions instead of rendering exception templates. + config.action_dispatch.show_exceptions = false + + # Disable request forgery protection in test environment. + config.action_controller.allow_forgery_protection = false + + # Tell Action Mailer not to deliver emails to the real world. + # The :test delivery method accumulates sent emails in the + # ActionMailer::Base.deliveries array. + config.action_mailer.delivery_method = :test + + # Print deprecation notices to the stderr. + config.active_support.deprecation = :stderr +end diff --git a/config/initializers/backtrace_silencers.rb b/config/initializers/backtrace_silencers.rb new file mode 100644 index 0000000..59385cd --- /dev/null +++ b/config/initializers/backtrace_silencers.rb @@ -0,0 +1,7 @@ +# Be sure to restart your server when you modify this file. + +# You can add backtrace silencers for libraries that you're using but don't wish to see in your backtraces. +# Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ } + +# You can also remove all the silencers if you're trying to debug a problem that might stem from framework code. +# Rails.backtrace_cleaner.remove_silencers! diff --git a/config/initializers/filter_parameter_logging.rb b/config/initializers/filter_parameter_logging.rb new file mode 100644 index 0000000..4a994e1 --- /dev/null +++ b/config/initializers/filter_parameter_logging.rb @@ -0,0 +1,4 @@ +# Be sure to restart your server when you modify this file. + +# Configure sensitive parameters which will be filtered from the log file. +Rails.application.config.filter_parameters += [:password] diff --git a/config/initializers/inflections.rb b/config/initializers/inflections.rb new file mode 100644 index 0000000..ac033bf --- /dev/null +++ b/config/initializers/inflections.rb @@ -0,0 +1,16 @@ +# Be sure to restart your server when you modify this file. 
+ +# Add new inflection rules using the following format. Inflections +# are locale specific, and you may define rules for as many different +# locales as you wish. All of these examples are active by default: +# ActiveSupport::Inflector.inflections(:en) do |inflect| +# inflect.plural /^(ox)$/i, '\1en' +# inflect.singular /^(ox)en/i, '\1' +# inflect.irregular 'person', 'people' +# inflect.uncountable %w( fish sheep ) +# end + +# These inflection rules are supported but not enabled by default: +# ActiveSupport::Inflector.inflections(:en) do |inflect| +# inflect.acronym 'RESTful' +# end diff --git a/config/initializers/mailer.rb b/config/initializers/mailer.rb new file mode 100644 index 0000000..3691357 --- /dev/null +++ b/config/initializers/mailer.rb @@ -0,0 +1,10 @@ +if ENV['MAILGUN_SMTP_LOGIN'].present? + ActionMailer::Base.smtp_settings = { + :authentication => :plain, + :address => ENV.fetch('MAILGUN_SMTP_SERVER'), + :port => ENV.fetch('MAILGUN_SMTP_PORT'), + :domain => 'link_scraper.mailgun.org', + :user_name => ENV.fetch('MAILGUN_SMTP_LOGIN'), + :password => ENV.fetch('MAILGUN_SMTP_PASSWORD') + } +end diff --git a/config/initializers/mime_types.rb b/config/initializers/mime_types.rb new file mode 100644 index 0000000..72aca7e --- /dev/null +++ b/config/initializers/mime_types.rb @@ -0,0 +1,5 @@ +# Be sure to restart your server when you modify this file. + +# Add new mime types for use in respond_to blocks: +# Mime::Type.register "text/richtext", :rtf +# Mime::Type.register_alias "text/html", :iphone diff --git a/config/initializers/secret_token.rb b/config/initializers/secret_token.rb new file mode 100644 index 0000000..d57f41e --- /dev/null +++ b/config/initializers/secret_token.rb @@ -0,0 +1,12 @@ +# Be sure to restart your server when you modify this file. + +# Your secret key is used for verifying the integrity of signed cookies. +# If you change this key, all old signed cookies will become invalid! 
+
+# Make sure the secret is at least 30 characters and all random,
+# no regular words or you'll be exposed to dictionary attacks.
+# You can use `rake secret` to generate a secure secret key.
+
+# Set SECRET_KEY_BASE (e.g. in .env) wherever the app is deployed. The fallback below
+# is committed to the repository, so treat it as public and use it for dev/test only.
+TrackbackScraper::Application.config.secret_key_base = ENV['SECRET_KEY_BASE'].presence || '74b5d9f645e136b55cd292f7d8920db52aca14e6ee7d2d4f8e946850abe11c05c6574b0a5bdd9a883a2041154def95913cc9109ce68b65fe517189d2e45654eb'
diff --git a/config/initializers/session_store.rb b/config/initializers/session_store.rb
new file mode 100644
index 0000000..b0284e5
--- /dev/null
+++ b/config/initializers/session_store.rb
@@ -0,0 +1,3 @@
+# Be sure to restart your server when you modify this file.
+
+TrackbackScraper::Application.config.session_store :cookie_store, key: '_link_scraper_session'
diff --git a/config/initializers/slug_extensions.rb b/config/initializers/slug_extensions.rb
new file mode 100644
index 0000000..72301a0
--- /dev/null
+++ b/config/initializers/slug_extensions.rb
@@ -0,0 +1,44 @@
+# Slug helpers added to String.
+# NOTE(review): ParsedLink compares to_slug output with Rap Genius URL paths, so
+# these rules presumably have to mirror that site's slug format - confirm before
+# changing any of them.
+class String
+  # Builds a hyphen-separated slug: strips, downcases and transliterates the
+  # string, rewrites punctuation, "&", "@" and "$", drops `'()*,.#´ characters
+  # and turns every remaining run of characters outside a-z, 0-9 and "-" into "-".
+  # NOTE(review): convert_smart_punctuation is not defined in this file -
+  # presumably it comes from the stringex gem listed in the Gemfile.
+  def to_slug
+    strip.
+      downcase.
+      transliterate.
+      convert_smart_punctuation.
+      convert_misc_characters.
+      convert_dollar_signs.
+      gsub(/[`'()*,.#´]/, '').
+      gsub(/[^a-z0-9\-]+/i, '-').
+      gsub(/\-{2,}/, '-').
+      gsub(/^\-|\-$/i, '').
+      to_s
+  end
+
+  # Replaces whitespace followed by "$" with " s" when a letter or digit comes
+  # next, e.g. "too $hort" becomes "too short".
+  def convert_dollar_signs
+    gsub(/\s+\$(?=[a-z0-9])/i, ' s')
+  end
+
+  # Spells out " & " as " and " and " @ " as " at ", and replaces runs of two or
+  # more dots with a space.
+  def convert_misc_characters
+    gsub(/\s+&\s+/, ' and ').
+      gsub(/\s+@\s+/, ' at ').
+      gsub(/[.]{2,}/, ' ')
+    # gsub(/(\d)%(\s|$)/, '\1 percent ')
+  end
+
+  # Approximates non-ASCII characters with ASCII via ActiveSupport::Inflector.
+  def transliterate
+    ActiveSupport::Inflector.transliterate(self)
+  end
+end
diff --git a/config/initializers/wrap_parameters.rb b/config/initializers/wrap_parameters.rb
new file mode 100644
index 0000000..33725e9
--- /dev/null
+++ b/config/initializers/wrap_parameters.rb
@@ -0,0 +1,14 @@
+# Be sure to restart your server when you modify this file.
+
+# This file contains settings for ActionController::ParamsWrapper which
+# is enabled by default.
+ +# Enable parameter wrapping for JSON. You can disable this by setting :format to an empty array. +ActiveSupport.on_load(:action_controller) do + wrap_parameters format: [:json] if respond_to?(:wrap_parameters) +end + +# To enable root element in JSON for ActiveRecord objects. +# ActiveSupport.on_load(:active_record) do +# self.include_root_in_json = true +# end diff --git a/config/locales/en.bootstrap.yml b/config/locales/en.bootstrap.yml new file mode 100644 index 0000000..c98d8d8 --- /dev/null +++ b/config/locales/en.bootstrap.yml @@ -0,0 +1,18 @@ +# Sample localization file for English. Add more files in this directory for other locales. +# See https://github.com/svenfuchs/rails-i18n/tree/master/rails%2Flocale for starting points. + +en: + helpers: + actions: "Actions" + links: + back: "Back" + cancel: "Cancel" + confirm: "Are you sure?" + destroy: "Delete" + new: "New" + edit: "Edit" + titles: + edit: "Edit %{model}" + save: "Save %{model}" + new: "New %{model}" + delete: "Delete %{model}" diff --git a/config/locales/en.yml b/config/locales/en.yml new file mode 100644 index 0000000..0653957 --- /dev/null +++ b/config/locales/en.yml @@ -0,0 +1,23 @@ +# Files in the config/locales directory are used for internationalization +# and are automatically loaded by Rails. If you want to use locales other +# than English, add the necessary files in this directory. +# +# To use the locales, use `I18n.t`: +# +# I18n.t 'hello' +# +# In views, this is aliased to just `t`: +# +# <%= t('hello') %> +# +# To use a different locale, set it with `I18n.locale`: +# +# I18n.locale = :es +# +# This would use the information in config/locales/es.yml. +# +# To learn more, please read the Rails Internationalization guide +# available at http://guides.rubyonrails.org/i18n.html. 
+ +en: + hello: "Hello world" diff --git a/config/routes.rb b/config/routes.rb new file mode 100644 index 0000000..17f02db --- /dev/null +++ b/config/routes.rb @@ -0,0 +1,57 @@ +TrackbackScraper::Application.routes.draw do + root to: 'static#index' + # The priority is based upon order of creation: first created -> highest priority. + # See how all your routes lay out with "rake routes". + + # You can have the root of your site routed with "root" + # root 'welcome#index' + + # Example of regular route: + # get 'products/:id' => 'catalog#view' + + # Example of named route that can be invoked with purchase_url(id: product.id) + # get 'products/:id/purchase' => 'catalog#purchase', as: :purchase + + # Example resource route (maps HTTP verbs to controller actions automatically): + # resources :products + + # Example resource route with options: + # resources :products do + # member do + # get 'short' + # post 'toggle' + # end + # + # collection do + # get 'sold' + # end + # end + + # Example resource route with sub-resources: + # resources :products do + # resources :comments, :sales + # resource :seller + # end + + # Example resource route with more complex sub-resources: + # resources :products do + # resources :comments + # resources :sales do + # get 'recent', on: :collection + # end + # end + + # Example resource route with concerns: + # concern :toggleable do + # post 'toggle' + # end + # resources :posts, concerns: :toggleable + # resources :photos, concerns: :toggleable + + # Example resource route within a namespace: + # namespace :admin do + # # Directs /admin/products/* to Admin::ProductsController + # # (app/controllers/admin/products_controller.rb) + # resources :products + # end +end diff --git a/db/migrate/20131227162259_create_pages.rb b/db/migrate/20131227162259_create_pages.rb new file mode 100644 index 0000000..bebb042 --- /dev/null +++ b/db/migrate/20131227162259_create_pages.rb @@ -0,0 +1,25 @@ +class CreatePages < ActiveRecord::Migration + def 
change + create_table :pages do |t| + t.string :url, limit: 2000 + t.text :links + t.integer :count_of_links_to_rg_song_pages + t.integer :count_of_links_with_rg_format + t.datetime :created_at + t.datetime :updated_at + t.integer :error_code + t.text :error_message + t.integer :total_links_to_rg + t.boolean :fetched, default: false, null: false + t.integer :count_of_links_with_text_ending_in_lyrics + t.integer :count_of_annotation_links + t.datetime :locked_at + t.integer :count_of_link_clumps + t.integer :largest_link_clump_size + t.integer :count_of_link_clumps_fuzzy_match + t.integer :largest_link_clump_size_fuzzy_match + end + + add_index :pages, :url, unique: true + end +end diff --git a/db/schema.rb b/db/schema.rb new file mode 100644 index 0000000..8e8f692 --- /dev/null +++ b/db/schema.rb @@ -0,0 +1,41 @@ +# encoding: UTF-8 +# This file is auto-generated from the current state of the database. Instead +# of editing this file, please use the migrations feature of Active Record to +# incrementally modify your database, and then regenerate this schema definition. +# +# Note that this schema.rb definition is the authoritative source for your +# database schema. If you need to create the application database on another +# system, you should be using db:schema:load, not running all the migrations +# from scratch. The latter is a flawed and unsustainable approach (the more migrations +# you'll amass, the slower it'll run and the greater likelihood for issues). +# +# It's strongly recommended that you check this file into your version control system. 
+ +ActiveRecord::Schema.define(version: 20131227162259) do + + # These are extensions that must be enabled in order to support this database + enable_extension "plpgsql" + + create_table "pages", force: true do |t| + t.string "url", limit: 2000 + t.text "links" + t.integer "count_of_links_to_rg_song_pages" + t.integer "count_of_links_with_rg_format" + t.datetime "created_at" + t.datetime "updated_at" + t.integer "error_code" + t.text "error_message" + t.integer "total_links_to_rg" + t.boolean "fetched", default: false, null: false + t.integer "count_of_links_with_text_ending_in_lyrics" + t.integer "count_of_annotation_links" + t.datetime "locked_at" + t.integer "count_of_link_clumps" + t.integer "largest_link_clump_size" + t.integer "count_of_link_clumps_fuzzy_match" + t.integer "largest_link_clump_size_fuzzy_match" + end + + add_index "pages", ["url"], name: "index_pages_on_url", unique: true, using: :btree + +end diff --git a/db/seeds.rb b/db/seeds.rb new file mode 100644 index 0000000..4edb1e8 --- /dev/null +++ b/db/seeds.rb @@ -0,0 +1,7 @@ +# This file should contain all the record creation needed to seed the database with its default values. +# The data can then be loaded with the rake db:seed (or created alongside the db with db:setup). 
+#
+# Examples:
+#
+#   cities = City.create([{ name: 'Chicago' }, { name: 'Copenhagen' }])
+#   Mayor.create(name: 'Emanuel', city: cities.first)
diff --git a/lib/assets/.keep b/lib/assets/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/lib/graceful_shutdown.rb b/lib/graceful_shutdown.rb
new file mode 100644
index 0000000..17f4fd4
--- /dev/null
+++ b/lib/graceful_shutdown.rb
@@ -0,0 +1,4 @@
+# Raised from the TERM signal handler installed by the `pages:work` rake task so
+# that an in-flight batch can release its page locks before the worker exits.
+class GracefulShutdown < StandardError
+end
diff --git a/lib/open_uri_scrape.rb b/lib/open_uri_scrape.rb
new file mode 100644
index 0000000..67f4fff
--- /dev/null
+++ b/lib/open_uri_scrape.rb
@@ -0,0 +1,120 @@
+require 'open-uri'
+
+# Scrapes a batch of Page records concurrently with open-uri and plain threads.
+#
+# Pages are reserved via Page.reserve_batch_for_scraping, fetched by worker
+# threads, and handed back to the calling thread, which records every result
+# with Page#scraped!.
+class OpenUriScrape
+  # Struct carrying the outcome of one fetch. NOTE(review): the name suggests it
+  # mirrors the part of Typhoeus::Response that Page#scraped! reads - confirm.
+  class FakeTyphoeusResponse < Struct.new(:success, :body, :code, :timed_out, :return_message, :total_time_ms)
+    alias_method :success?, :success
+    alias_method :timed_out?, :timed_out
+
+    # Elapsed time in seconds; total_time_ms holds milliseconds (nil counts as 0).
+    def total_time
+      (total_time_ms || 0) / 1000.0
+    end
+  end
+
+  attr_reader :batch_size, :queue, :processed, :concurrency
+
+  # batch_size - how many pages to reserve per #scrape_batch call.
+  #
+  # The number of worker threads is read from ENV['HTTP_CONCURRENCY'] (default 200).
+  def initialize(batch_size)
+    @batch_size = batch_size
+    @queue = Queue.new
+    @processed = Queue.new
+    # At least one worker, otherwise #scrape_batch would wait for results forever.
+    @concurrency = [ENV.fetch('HTTP_CONCURRENCY', 200).to_i, 1].max
+  end
+
+  # Reserves a batch of pages, fetches them concurrently and passes each response
+  # to Page#scraped!. Every response is also yielded to the optional block.
+  #
+  # On GracefulShutdown the pending queue is emptied and still-locked pages are
+  # unlocked before re-raising. Any other StandardError is logged, mailed when
+  # NotificationMailer is configured, and re-raised.
+  def scrape_batch
+    pages = Page.reserve_batch_for_scraping(batch_size).to_a
+    pages.each { |page| queue << page }
+
+    # Starting more threads than there are pages would only create idle threads.
+    [concurrency, pages.size].min.times { start_worker }
+
+    pages.size.times do
+      page, response = processed.pop
+
+      page.scraped!(response)
+      yield(response) if block_given?
+    end
+  rescue GracefulShutdown
+    queue.clear
+    # pages is still nil when the shutdown arrives while the batch is being reserved.
+    (pages || []).each { |page| page.unlock! if page.locked? }
+    raise
+  rescue => e
+    Rails.logger.error([e.message] + e.backtrace)
+    NotificationMailer.notify_error(e.message).deliver! if NotificationMailer.configured?
+
+    raise
+  end
+
+  private
+
+  # Starts one thread that takes pages off the queue until it is empty and pushes
+  # [page, response] pairs onto the processed queue.
+  def start_worker
+    Thread.new do
+      loop do
+        page =
+          begin
+            # Non-blocking pop: an `empty?` check followed by a blocking pop
+            # hangs the thread forever when another worker takes the last page
+            # in between the two calls.
+            queue.pop(true)
+          rescue ThreadError
+            break
+          end
+
+        processed << [page, fetch_response(page.url)]
+      end
+    end
+  end
+
+  # Fetches url and wraps the outcome in a FakeTyphoeusResponse. Timeouts, HTTP
+  # errors and every other StandardError are reported through the response
+  # fields instead of being raised.
+  #
+  # The request is limited to ENV['HTTP_TIMEOUT'] seconds (default 20).
+  def fetch_response(url)
+    result, error = nil, nil
+
+    total_time = Benchmark.ms do
+      begin
+        Timeout.timeout(ENV.fetch('HTTP_TIMEOUT', 20).to_i) do
+          # URI#open rather than Kernel#open: given a string starting with "|"
+          # Kernel#open runs it as a command, and a plain path opens a local file.
+          result = URI.parse(url).open
+        end
+      rescue => e
+        error = e
+      end
+    end
+
+    raise error if error
+
+    FakeTyphoeusResponse.new(true, result.read, result.status.first.to_i, false, '', total_time)
+  rescue Timeout::Error
+    FakeTyphoeusResponse.new(false, nil, 0, true, nil, total_time)
+  rescue OpenURI::HTTPError => e
+    FakeTyphoeusResponse.new(false, nil, e.message.split.first.to_i, false, e.message, total_time)
+  rescue => e
+    FakeTyphoeusResponse.new(false, nil, 0, false, e.inspect, total_time)
+  ensure
+    # Release the StringIO/Tempfile returned by open-uri.
+    result.close if result && !result.closed?
+  end
+end
diff --git a/lib/page_parser.rb b/lib/page_parser.rb
new file mode 100644
index 0000000..3efbf41
--- /dev/null
+++ b/lib/page_parser.rb
@@ -0,0 +1,95 @@
+# Collects statistics about the Rap Genius links found in a page's HTML.
+class PageParser
+  # body - HTML source of the page.
+  def initialize(body)
+    @body = body
+  end
+
+  # Inspects every <a> element and returns a Hash containing
+  #   :links             - link text => href for each Rap Genius link; links that
+  #                        share the same text overwrite each other
+  #   :total_links_to_rg - number of entries in :links
+  #   :count_of_...      - one counter per ParsedLink category
+  # merged with the clump statistics from #identify_link_clumps.
+  def parse_and_find_rg_links
+    aggregates = {count_of_links_to_rg_song_pages: 0,
+                  count_of_links_with_text_ending_in_lyrics: 0,
+                  count_of_links_with_rg_format: 0,
+                  count_of_annotation_links: 0}
+    links_to_rg = {}
+
+    doc = Nokogiri::HTML.fragment(body)
+    doc.css('a').each do |link|
+      parsed = ParsedLink.parse(link)
+      next unless parsed.rg_link?
+
+      links_to_rg[parsed.inner_text] = parsed.href
+      aggregates[:count_of_links_to_rg_song_pages] += 1 if parsed.song_page_link?
+      aggregates[:count_of_links_with_text_ending_in_lyrics] += 1 if parsed.lyrics_text_link?
+      aggregates[:count_of_links_with_rg_format] += 1 if parsed.rg_text_format?
+      aggregates[:count_of_annotation_links] += 1 if parsed.annotation_link?
+    end
+
+    {links: links_to_rg,
+     total_links_to_rg: links_to_rg.count}.merge(aggregates).merge(identify_link_clumps)
+  end
+
+  private
+  attr_reader :body
+
+  # Clump statistics for exact (rg_text_format?) and fuzzy (lyrics_text_link?) matches.
+  def identify_link_clumps
+    link_clumps_by_rg_text_format = identify_link_clumps_with(:rg_text_format?)
+    link_clumps_fuzzy_match = identify_link_clumps_with(:lyrics_text_link?)
+
+    {count_of_link_clumps: link_clumps_by_rg_text_format[:count],
+     largest_link_clump_size: link_clumps_by_rg_text_format[:largest],
+     count_of_link_clumps_fuzzy_match: link_clumps_fuzzy_match[:count],
+     largest_link_clump_size_fuzzy_match: link_clumps_fuzzy_match[:largest]}
+  end
+
+  # Returns the anchor that directly follows link if it satisfies the ParsedLink
+  # predicate named by method, nil otherwise. The neighbour is marked as seen in
+  # both cases so it is never used as the start of a later clump.
+  def adjacent_rg_text_format_link(link, method)
+    return unless next_link = next_anchor_sibling(link)
+    next_link['data-seen-already'] = 'true'
+    next_link if ParsedLink.parse(next_link).try(method)
+  end
+
+  # Counts clumps - runs of two or more adjacent anchors that all satisfy the
+  # ParsedLink predicate named by method - and reports the longest run.
+  def identify_link_clumps_with(method)
+    number_of_clumps, largest_clump_size = 0, 0
+
+    # A single pass in document order. Re-running the CSS query for the first
+    # unseen anchor after every link made this quadratic in the number of links.
+    Nokogiri::HTML.fragment(body).css('a').each do |link|
+      next if link['data-seen-already'] # consumed by an earlier clump
+      next unless ParsedLink.parse(link).try(method)
+
+      current_clump_size = 1
+      current_link = link
+      while current_link = adjacent_rg_text_format_link(current_link, method)
+        current_clump_size += 1
+      end
+
+      next unless current_clump_size > 1
+
+      number_of_clumps += 1
+      largest_clump_size = current_clump_size if current_clump_size > largest_clump_size
+    end
+
+    {count: number_of_clumps, largest: largest_clump_size}
+  end
+
+  # Next sibling of node if it is an <a>, skipping <br> elements in between.
+  # NOTE(review): text nodes are not skipped, so anchors separated by whitespace
+  # or a newline do not count as adjacent - confirm that this is intended.
+  def next_anchor_sibling(node)
+    sibling = node.next_sibling
+    sibling = sibling.next_sibling while sibling.try(:name).to_s == 'br'
+
+    sibling if sibling.try(:name) == 'a'
+  end
+end
diff --git a/lib/parsed_link.rb b/lib/parsed_link.rb
new file mode 100644
index 0000000..44a21b8
--- /dev/null
+++ b/lib/parsed_link.rb
@@ -0,0 +1,62 @@
+# Classifies one <a> element by the way it links to Rap Genius.
+#
+# Each predicate is nil unless the link qualifies:
+#   rg_link?          - host is rapgenius.com or one of its subdomains
+#   song_page_link?   - rg link whose path ends in "-lyrics"
+#   lyrics_text_link? - song page link whose text ends in " Lyrics"
+#   rg_text_format?   - text reads "Artist – Title Lyrics" and slugifies to the path
+#   annotation_link?  - rg link whose path starts with a numeric segment ("/123")
+class ParsedLink
+  include StringHelper
+
+  attr_accessor :rg_link, :song_page_link, :annotation_link, :lyrics_text_link, :rg_text_format, :inner_text, :href
+  alias_method :rg_link?, :rg_link
+  alias_method :song_page_link?, :song_page_link
+  alias_method :annotation_link?, :annotation_link
+  alias_method :lyrics_text_link?, :lyrics_text_link
+  alias_method :rg_text_format?, :rg_text_format
+
+  # link - element responding to [] (attribute lookup) and inner_text, such as
+  #        the Nokogiri nodes passed in by PageParser.
+  def initialize(link)
+    return unless self.href = link['href'].presence
+    self.href = coerce_to_utf8(href)
+
+    begin
+      parsed = Addressable::URI.parse(href)
+    rescue Addressable::URI::InvalidURIError
+      return
+    end
+
+    # The dot is escaped so that only rapgenius.com and its subdomains match.
+    return unless parsed.host =~ /(?:\A|\.)rapgenius\.com\z/i
+    self.rg_link = true
+
+    path, self.inner_text = coerce_to_utf8(parsed.path), coerce_to_utf8(link.inner_text)
+
+    if path =~ /-lyrics\z/i
+      self.song_page_link = true
+
+      if inner_text =~ /\sLyrics\z/
+        self.lyrics_text_link = true
+
+        # Work on the coerced text and path, like the checks above.
+        artist, title = inner_text.split(/\s+–\s+/)
+        return unless artist.present? && title.present?
+
+        title = title.chomp(" Lyrics")
+        # Strip the suffix case-insensitively, matching the /-lyrics\z/i test above.
+        song_path = path.sub(/-lyrics\z/i, '')
+
+        self.rg_text_format = true if "/#{artist.to_slug}-#{title.to_slug}".downcase == song_path.downcase
+      end
+    elsif path =~ %r(\A/\d+(\z|/))
+      self.annotation_link = true
+    end
+  end
+
+  # Returns a ParsedLink for link, or nil when link is nil.
+  def self.parse(link)
+    new(link) if link
+  end
+end
diff --git a/lib/tasks/.keep b/lib/tasks/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/lib/tasks/import_urls.rake b/lib/tasks/import_urls.rake
new file mode 100644
index 0000000..e485960
--- /dev/null
+++ b/lib/tasks/import_urls.rake
@@ -0,0 +1,12 @@
+desc 'import urls from a file'
+task 'urls:import' => :environment do
+  # One URL per line. Anything that is not an http(s) URL (a header row, a
+  # blank line) is skipped. Unconditionally dropping the first line lost the
+  # first URL of header-less files such as the bundled vendor/urls.txt.
+  File.foreach(Rails.root.join('vendor', 'urls.txt').to_s) do |line|
+    url = line.strip
+    next unless url =~ %r{\Ahttps?://}i
+
+    Page.find_or_create_by!(url: url)
+  end
+end
diff --git a/lib/tasks/scrape_urls.rake b/lib/tasks/scrape_urls.rake
new file mode 100644
index 0000000..ee22580
--- /dev/null
+++ b/lib/tasks/scrape_urls.rake
@@ -0,0 +1,43 @@
+desc 'scrape all urls'
+task 'pages:scrape', [:limit] => :environment do |_task, args|
+  # Rake passes task arguments as Strings, so convert before using it as a number.
+  limit = (args[:limit] || 20).to_i
+
+  done = 0
+
+  Page.scrape_batch(limit) do |_completed_response|
+    done += 1
+
+    puts "Completed #{done}" if done % 10 == 0
+  end
+
+  NotificationMailer.notify_success(limit).deliver! if NotificationMailer.configured?
+end
+
+
+desc 'delayed_job-like worker task for scraping pages'
+task 'pages:work' => :environment do
+  require 'graceful_shutdown'
+  # Turn TERM into an exception so a running batch can clean up before exiting.
+  trap('TERM') { raise GracefulShutdown }
+
+  batch_size = ENV.fetch('SCRAPE_BATCH_SIZE', 200).to_i
+
+  begin
+    loop do
+      start = Time.now
+      scraped = 0
+
+      Rails.logger.info "Scraping batch of #{batch_size}"
+      Page.scrape_batch(batch_size) { scraped += 1 }
+      Rails.logger.info "Done scraping batch of #{batch_size} at #{((Time.now.to_f - start.to_f) / batch_size.to_f).round(2)}seconds/page"
+
+      # Back off whenever a batch yielded no pages. The former Page.count.zero?
+      # check only slept on an empty table, so the worker spun without pause
+      # whenever a non-empty table had nothing left to scrape.
+      sleep 10 if scraped.zero?
+    end
+  rescue GracefulShutdown
+    # Expected way to stop the worker.
+  end
+end
diff --git a/lib/tasks/whois_records_query.rake b/lib/tasks/whois_records_query.rake
new file mode 100644
index 0000000..c0105f1
--- /dev/null
+++ b/lib/tasks/whois_records_query.rake
@@ -0,0 +1,43 @@
+desc 'scrape whois contacts'
+task 'whois:scrape' => :environment do
+  require 'csv'
+
+  rows = CSV.parse(File.read(Rails.root.join('vendor/domain_lookups.csv'))).drop(1)
+  # Blank rows parse to [] - skip them instead of calling strip on nil.
+  domains = rows.map { |row| row.first.to_s.strip }.reject(&:empty?)
+
+  w = Whois::Client.new
+
+  CSV.open(Rails.root.join('tmp', 'domain_lookups_output.csv'), 'wb') do |csv|
+    csv << %w(domain contact)
+
+    domains.each do |d|
+      # Dotted numbers (IP addresses) are looked up unchanged, host names are
+      # reduced to their last two labels. The pattern has no nested quantifier:
+      # the former /^(\d+\.?)+$/ backtracked exponentially on a long run of
+      # digits followed by anything else.
+      domain =
+        if d =~ /\A\d+(?:\.\d+)*\.?\z/
+          d
+        else
+          d.split('.').last(2).join('.')
+        end
+
+      tries = 0
+
+      begin
+        email = w.lookup(domain).technical_contact.try(:email)
+      rescue Whois::WebInterfaceError, Whois::NoInterfaceError
+        # Deliberately ignored: the domain is written out as 'Unknown'.
+      rescue Timeout::Error, Whois::ConnectionError
+        tries += 1
+        retry if tries <= 5
+      rescue => e
+        puts "Error:"
+        puts [d, e].inspect
+      end
+
+      csv << [d, email || 'Unknown']
+    end
+  end
+end
diff --git a/log/.keep b/log/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/public/404.html b/public/404.html
new file mode 100644
index 0000000..a0daa0c
--- /dev/null
+++ b/public/404.html
@@ -0,0 +1,58 @@
+
+
+
+ The page you were looking for doesn't exist (404)
+
+
+
+
+
+
+

The page you were looking for doesn't exist.

+

You may have mistyped the address or the page may have moved.

+
+

If you are the application owner check the logs for more information.

+ + diff --git a/public/422.html b/public/422.html new file mode 100644 index 0000000..fbb4b84 --- /dev/null +++ b/public/422.html @@ -0,0 +1,58 @@ + + + + The change you wanted was rejected (422) + + + + + +
+

The change you wanted was rejected.

+

Maybe you tried to change something you didn't have access to.

+
+

If you are the application owner check the logs for more information.

+ + diff --git a/public/500.html b/public/500.html new file mode 100644 index 0000000..e9052d3 --- /dev/null +++ b/public/500.html @@ -0,0 +1,57 @@ + + + + We're sorry, but something went wrong (500) + + + + + +
+

We're sorry, but something went wrong.

+
+

If you are the application owner check the logs for more information.

+ + diff --git a/public/favicon.ico b/public/favicon.ico new file mode 100644 index 0000000..e69de29 diff --git a/public/robots.txt b/public/robots.txt new file mode 100644 index 0000000..1a3a5e4 --- /dev/null +++ b/public/robots.txt @@ -0,0 +1,5 @@ +# See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file +# +# To ban all spiders from the entire site uncomment the next two lines: +# User-agent: * +# Disallow: / diff --git a/spec/controllers/static_controller_spec.rb b/spec/controllers/static_controller_spec.rb new file mode 100644 index 0000000..a3db7c0 --- /dev/null +++ b/spec/controllers/static_controller_spec.rb @@ -0,0 +1,5 @@ +require 'spec_helper' + +describe StaticController do + +end diff --git a/spec/helpers/static_helper_spec.rb b/spec/helpers/static_helper_spec.rb new file mode 100644 index 0000000..3df69da --- /dev/null +++ b/spec/helpers/static_helper_spec.rb @@ -0,0 +1,15 @@ +require 'spec_helper' + +# Specs in this file have access to a helper object that includes +# the StaticHelper. 
For example: +# +# describe StaticHelper do +# describe "string concat" do +# it "concats two strings with spaces" do +# expect(helper.concat_strings("this","that")).to eq("this that") +# end +# end +# end +describe StaticHelper do + pending "add some examples to (or delete) #{__FILE__}" +end diff --git a/spec/mailers/notification_spec.rb b/spec/mailers/notification_spec.rb new file mode 100644 index 0000000..77b73b9 --- /dev/null +++ b/spec/mailers/notification_spec.rb @@ -0,0 +1,5 @@ +require "spec_helper" + +describe Notification do + pending "add some examples to (or delete) #{__FILE__}" +end diff --git a/spec/models/page_spec.rb b/spec/models/page_spec.rb new file mode 100644 index 0000000..2cd1666 --- /dev/null +++ b/spec/models/page_spec.rb @@ -0,0 +1,5 @@ +require 'spec_helper' + +describe Page do + pending "add some examples to (or delete) #{__FILE__}" +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..943bc19 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,42 @@ +# This file is copied to spec/ when you run 'rails generate rspec:install' +ENV["RAILS_ENV"] ||= 'test' +require File.expand_path("../../config/environment", __FILE__) +require 'rspec/rails' +require 'rspec/autorun' + +# Requires supporting ruby files with custom matchers and macros, etc, +# in spec/support/ and its subdirectories. +Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f } + +# Checks for pending migrations before tests are run. +# If you are not using ActiveRecord, you can remove this line. +ActiveRecord::Migration.check_pending! 
if defined?(ActiveRecord::Migration) + +RSpec.configure do |config| + # ## Mock Framework + # + # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line: + # + # config.mock_with :mocha + # config.mock_with :flexmock + # config.mock_with :rr + + # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures + config.fixture_path = "#{::Rails.root}/spec/fixtures" + + # If you're not using ActiveRecord, or you'd prefer not to run each of your + # examples within a transaction, remove the following line or assign false + # instead of true. + config.use_transactional_fixtures = true + + # If true, the base class of anonymous controllers will be inferred + # automatically. This will be the default behavior in future versions of + # rspec-rails. + config.infer_base_class_for_anonymous_controllers = false + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. 
+ # --seed 1234 + config.order = "random" +end diff --git a/vendor/assets/javascripts/.keep b/vendor/assets/javascripts/.keep new file mode 100644 index 0000000..e69de29 diff --git a/vendor/assets/stylesheets/.keep b/vendor/assets/stylesheets/.keep new file mode 100644 index 0000000..e69de29 diff --git a/vendor/urls.txt b/vendor/urls.txt new file mode 100644 index 0000000..cd4e955 --- /dev/null +++ b/vendor/urls.txt @@ -0,0 +1,100 @@ +http://www.huffingtonpost.com/2013/09/10/kanye-west-ray-j-hit-it-first_n_3900572.html +http://www.huffingtonpost.com/2013/10/14/eminem-rap-god-debuts-marshall-mathers-lp-2_n_4098815.html +http://www.huffingtonpost.com/2013/09/16/rapper-common-chicago_n_3922182.html +http://www.refinedhype.com/hyped/entry/no-sense-lil-wayne-dedication-5 +http://thefreshheir.com/2013/11/26/new-music-sxmplelife-lloyd-banks-ft-50-cent-on-fire-bootleg/ +http://thefreshheir.com/2013/12/03/video-tupac-shakur-on-life-and-death-animated-interview-1994/ +http://www.huffingtonpost.com/2013/10/29/kanye-west-interview_n_4175351.html +http://www.mostlyjunkfood.com/the-many-nicknames-of-tyler-the-creator/ +http://www.huffingtonpost.com/2013/12/06/kanye-kid-china-photo_n_4399924.html +http://www.mostlyjunkfood.com/i-think-im-turning-japanese-raps-infatuation-with-benihana/ +http://thefreshheir.com/2013/11/26/rare-photo-of-kanye-west-as-a-child-surfaces/ +http://www.mostlyjunkfood.com/justin-timberlakes-the-2020-experience-album-art-tracklist/ +http://thefreshheir.com/2013/11/26/video-ne-hip-hop-interviews-natural/ +http://www.vibe.com/article/new-music-casey-veggies-and-rockie-fresh-aladdin +http://www.refinedhype.com/hyped/entry/classic-hate-juicy +http://www.refinedhype.com/hyped/entry/kanye-quote-post +http://www.mostlyjunkfood.com/review-drake-nothing/ +http://www.huffingtonpost.com/2013/11/25/childish-gambino-sweatpants_n_4338141.html +http://www.mostlyjunkfood.com/stream-commons-the-dreamer-the-believer-album-before-it-hits-stores-december-20th/ 
+http://www.huffingtonpost.com/2013/10/20/kanye-west-yeezus-tour-seattle-kendrick-lamar_n_4132372.html +http://www.huffingtonpost.com/2013/10/21/arcade-fire-afterlife-video_n_4139415.html +http://www.mostlyjunkfood.com/danny-brown-xxx-review/ +http://www.huffingtonpost.com/2013/12/20/lamborghini-huracan--lp-610-4-sports-car_n_4480302.html?utm +http://thefreshheir.com/2013/11/28/new-music-childish-gambino-telegraph-ave/ +http://www.huffingtonpost.com/2013/03/29/8-things-women-need-to-bitches-to-do-books_n_2979633.html +http://www.mostlyjunkfood.com/lists-suck-10-cosby-show-references-in-rap-songs/ +http://www.huffingtonpost.com/2013/12/12/kanye-west-returns-to-south-park_n_4433298.html +http://www.huffingtonpost.com/2013/06/24/kanye-west-i-am-a-god-fashion-week-diss_n_3490688.html +http://www.refinedhype.com/hyped/entry/fotw-drake-gets-emotional-chinese +http://www.mostlyjunkfood.com/pointcounterpoint-yellow-album-dom-kennedy-review/ +http://www.refinedhype.com/hyped/entry/craziest-lines-2-chainz-boats-2 +http://www.vibe.com/article/new-video-awkword-throw-away-key +http://www.huffingtonpost.com/2013/11/27/eminem-rap-god-video_n_4351130.html +http://www.refinedhype.com/hyped/entry/vagina-rap +http://www.huffingtonpost.com/2013/11/04/ben-stiller-something-about-mary-hair-gel_n_4212720.html +http://www.mostlyjunkfood.com/listen-kendrick-lamar-featuring-dr-dre-the-recipe/ +http://www.mostlyjunkfood.com/jay-electronica-call-of-duty-mw3-ft-mobb-deep/ +http://www.vibe.com/article/watch-t-pain-raps-without-auto-tune-name-drops-hello-kitty-work-video +http://www.refinedhype.com/hyped/entry/da-real-lambo-lebrons-mom +http://www.mostlyjunkfood.com/frank-ocean-acura-integurl-video/ +http://www.mostlyjunkfood.com/review-mellowhype-numbers/ +http://www.mostlyjunkfood.com/mp3-flux-pavilion-do-or-die-ft-childish-gambino/ +http://thefreshheir.com/2013/12/03/cant-tell-me-shit-remix/ +http://www.mostlyjunkfood.com/mp3-heems-killing-time/ 
+http://www.refinedhype.com/hyped/entry/fotw-paris-hilton-weezy +http://www.huffingtonpost.com/2013/10/27/paris-hilton-miley-cyrus-halloween_n_4167824.html +http://www.vibe.com/article/review-lupe-fiascos-peace-papercup-jayzus-proves-hes-best-lyricist-alive +http://www.mostlyjunkfood.com/drake-headlines/ +http://www.vibe.com/article/eminem-rap-god-lyrics +http://www.huffingtonpost.com/2013/11/21/law-and-order-svu-line_n_4316568.html +http://www.mostlyjunkfood.com/return-of-the-mac-top-10-steve-jobs-inspired-rap-lyrics/ +http://www.vibe.com/article/lecrae-talks-gravity-album-bridging-gaps-and-jeremy-lin +http://www.mostlyjunkfood.com/the-curious-case-of-sad-kanye-the-grey-sweatshirt/ +http://www.vibe.com/article/rick-ross-says-hes-good-standing-reebok +http://thefreshheir.com/2013/11/26/video-napoleon-lv-dreamcatcher/ +http://www.refinedhype.com/hyped/entry/bite-or-not-jay-z-biggie/ +http://www.mostlyjunkfood.com/lil-b-calls-the-game-irrelevant-video/ +http://www.refinedhype.com/hyped/entry/jay-z-pound-cake-wack +http://www.vibe.com/article/drakes-sophomore-album-too-emo-men +http://www.vibe.com/article/opinion-what-does-kendrick-lamar-vs-drake-mean-j-cole +http://www.huffingtonpost.com/2013/12/16/2-chainz-u-da-realest-video_n_4452612.html +http://thefreshheir.com/2013/12/04/new-music-love-mansuy-white/ +http://www.refinedhype.com/hyped/entry/migos-brokeanese +http://www.huffingtonpost.com/2013/12/09/ryan-seacrest-vh1-docu-series-white-female-rappers_n_4413515.html +http://www.mostlyjunkfood.com/drake-free-spirit-ft-rick-ross/ +http://www.mostlyjunkfood.com/soul-khan-speeding-bullets-video/ +http://www.mostlyjunkfood.com/the-outlawz-actually-smoked-tupacs-ashes-in-a-blunt-video/ +http://thefreshheir.com/2013/11/28/video-chance-the-rapper-performs-new-song-in-chicago/ +http://www.huffingtonpost.com/2013/03/28/rick-ross-uoeno-lyrics-rapper-responds_n_2974891.html +http://www.mostlyjunkfood.com/tyler-the-creator-goblin/ 
+http://www.mostlyjunkfood.com/the-roots-%e2%80%93-make-my-ft-big-k-r-i-t/ +http://www.huffingtonpost.com/2013/12/10/ariana-grande-santa-baby-liz-gillies_n_4417929.html +http://www.mostlyjunkfood.com/odd-future-at-hollywood-palladium-93011/ +http://www.huffingtonpost.com/2013/12/09/justin-bieber-confident-music-mondays-journals_n_4411851.html +http://www.refinedhype.com/hyped/entry/lil-dicky-white-dude +http://www.mostlyjunkfood.com/kendrick-lamar-rigamortus-ft-busta-rhymes-remix/ +http://www.mostlyjunkfood.com/five-music-tips-for-unsigned-artists/ +http://www.refinedhype.com/hyped/entry/rap-stats-rap-nerd-orgasm +http://thefreshheir.com/2013/11/20/video-martin-ky-polaroid/ +http://www.huffingtonpost.com/2013/10/10/kanye-west-gone-billboard-top-20-eight-years_n_4076965.html +http://www.refinedhype.com/hyped/entry/lupe-fiasco-put-em-down +http://thefreshheir.com/2013/11/26/video-wara-from-the-nbhd-98-rocafella/ +http://www.huffingtonpost.com/2012/11/15/robert-deniro-jay-z-actor-scolds-rapper_n_2137459.html +http://www.huffingtonpost.com/2013/02/21/emmett-tills-family-responds-lil-wayne-lyric-open-letter_n_2733765.html +http://www.huffingtonpost.com/2013/12/20/lamborghini-huracan--lp-610-4-sports-car_n_4480302.html +http://www.mostlyjunkfood.com/outkast-to-reunite-on-remixes-for-frank-oceans-pink-matter-t-i-s-sorry/ +http://www.mostlyjunkfood.com/ofwgkta-odd-future-tape-vol-2-concert-review/ +http://www.huffingtonpost.com/2013/08/13/kendrick-lamar-control-big-sean_n_3748466.html +http://www.huffingtonpost.com/2013/10/20/mase-now-we-even-rappers-first-album-in-years_n_4131795.html +http://www.huffingtonpost.com/2013/11/19/kanye-west-gigli-moment_n_4303434.html +http://www.refinedhype.com/hyped/entry/breaking-down-jay-z-100-bill-flow +http://www.huffingtonpost.com/2013/10/27/lorde-royals-youngest-artist-british-singles-chart_n_4168113.html +http://www.mostlyjunkfood.com/shotsfired-nothing-sames-hypothetical-hypothetical-subliminals/ 
+http://www.huffingtonpost.com/2013/09/15/drake-jay-z-pound-cake-paris-morton-music_n_3932410.html +http://www.refinedhype.com/hyped/entry/j-coles-control-response +http://www.vibe.com/article/take-look-adolescent-kanye-west-china-photo +http://www.huffingtonpost.com/2013/06/29/kanye-west-black-skinhead_n_3522146.html +http://www.refinedhype.com/hyped/entry/nmpa-rapgenius +http://thefreshheir.com/2013/11/25/new-music-dom-mclennon-wheredidmymindgo-demo/ +http://thefreshheir.com/2013/11/28/video-young-thug-some-more/