Skip to content

Commit

Permalink
Initial
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Warner committed Jan 2, 2014
0 parents commit 0c86123
Show file tree
Hide file tree
Showing 82 changed files with 1,734 additions and 0 deletions.
19 changes: 19 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# See https://help.github.com/articles/ignoring-files for more about ignoring files.
#
# If you find yourself ignoring temporary files generated by your text editor
# or operating system, you probably want to add a global ignore instead:
# git config --global core.excludesfile '~/.gitignore_global'

# Ignore bundler config.
/.bundle

# Ignore the default SQLite database.
/db/*.sqlite3
/db/*.sqlite3-journal

# Ignore all logfiles and tempfiles.
/log/*.log
/tmp
vendor/all_urls.txt
vendor/domain_lookups.csv
.env
1 change: 1 addition & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--color
1 change: 1 addition & 0 deletions .ruby-gemset
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
link_scraper
1 change: 1 addition & 0 deletions .ruby-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ruby-2.0.0-p195
35 changes: 35 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Gem source and dependencies for the application.
# Declaration order is kept as-is.
source 'https://rubygems.org'

# Runtime dependencies (all environments).
gem 'decent_exposure'
gem 'decent_generators'
gem 'dotenv-rails'
gem 'haml'
gem 'haml-rails'
gem 'librato-logreporter'
gem 'pg'
gem 'pry'
gem 'pry-rails'
gem 'twitter-bootstrap-rails'
gem 'addressable', require: 'addressable/uri'
gem 'coffee-rails', '~> 4.0.0'
gem 'jbuilder', '~> 1.2'
gem 'jquery-rails'
gem 'nokogiri'
gem 'rails'
gem 'sass-rails', '~> 4.0.0'
gem 'stringex'
gem 'turbolinks'
gem 'typhoeus'
gem 'uglifier', '>= 1.3.0'
gem 'whois'

# Available in both the test and development groups.
group :test, :development do
  gem 'factory_girl'
  gem 'fivemat'
  gem 'rspec-rails'
  gem 'rspec'
end

# Test group only.
group :test do
  gem 'shoulda-matchers'
end
193 changes: 193 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
GEM
remote: https://rubygems.org/
specs:
actionmailer (4.0.2)
actionpack (= 4.0.2)
mail (~> 2.5.4)
actionpack (4.0.2)
activesupport (= 4.0.2)
builder (~> 3.1.0)
erubis (~> 2.7.0)
rack (~> 1.5.2)
rack-test (~> 0.6.2)
activemodel (4.0.2)
activesupport (= 4.0.2)
builder (~> 3.1.0)
activerecord (4.0.2)
activemodel (= 4.0.2)
activerecord-deprecated_finders (~> 1.0.2)
activesupport (= 4.0.2)
arel (~> 4.0.0)
activerecord-deprecated_finders (1.0.3)
activesupport (4.0.2)
i18n (~> 0.6, >= 0.6.4)
minitest (~> 4.2)
multi_json (~> 1.3)
thread_safe (~> 0.1)
tzinfo (~> 0.3.37)
addressable (2.3.5)
arel (4.0.1)
atomic (1.1.14)
builder (3.1.4)
coderay (1.1.0)
coffee-rails (4.0.1)
coffee-script (>= 2.2.0)
railties (>= 4.0.0, < 5.0)
coffee-script (2.2.0)
coffee-script-source
execjs
coffee-script-source (1.6.3)
decent_exposure (2.3.0)
decent_generators (0.0.1)
rails (~> 4.0.0)
diff-lcs (1.2.5)
dotenv (0.9.0)
dotenv-rails (0.9.0)
dotenv (= 0.9.0)
erubis (2.7.0)
ethon (0.6.2)
ffi (>= 1.3.0)
mime-types (~> 1.18)
execjs (2.0.2)
factory_girl (4.3.0)
activesupport (>= 3.0.0)
ffi (1.9.3)
fivemat (1.2.1)
haml (4.0.4)
tilt
haml-rails (0.5.3)
actionpack (>= 4.0.1)
activesupport (>= 4.0.1)
haml (>= 3.1, < 5.0)
railties (>= 4.0.1)
hike (1.2.3)
i18n (0.6.9)
jbuilder (1.5.3)
activesupport (>= 3.0.0)
multi_json (>= 1.2.0)
jquery-rails (3.0.4)
railties (>= 3.0, < 5.0)
thor (>= 0.14, < 2.0)
json (1.8.1)
librato-logreporter (0.2.1)
mail (2.5.4)
mime-types (~> 1.16)
treetop (~> 1.4.8)
method_source (0.8.2)
mime-types (1.25.1)
mini_portile (0.5.2)
minitest (4.7.5)
multi_json (1.8.2)
nokogiri (1.6.1)
mini_portile (~> 0.5.0)
pg (0.17.1)
polyglot (0.3.3)
pry (0.9.12.4)
coderay (~> 1.0)
method_source (~> 0.8)
slop (~> 3.4)
pry-rails (0.3.2)
pry (>= 0.9.10)
rack (1.5.2)
rack-test (0.6.2)
rack (>= 1.0)
rails (4.0.2)
actionmailer (= 4.0.2)
actionpack (= 4.0.2)
activerecord (= 4.0.2)
activesupport (= 4.0.2)
bundler (>= 1.3.0, < 2.0)
railties (= 4.0.2)
sprockets-rails (~> 2.0.0)
railties (4.0.2)
actionpack (= 4.0.2)
activesupport (= 4.0.2)
rake (>= 0.8.7)
thor (>= 0.18.1, < 2.0)
rake (10.1.1)
rspec (2.14.1)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
rspec-core (2.14.7)
rspec-expectations (2.14.4)
diff-lcs (>= 1.1.3, < 2.0)
rspec-mocks (2.14.4)
rspec-rails (2.14.0)
actionpack (>= 3.0)
activesupport (>= 3.0)
railties (>= 3.0)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
sass (3.2.13)
sass-rails (4.0.1)
railties (>= 4.0.0, < 5.0)
sass (>= 3.1.10)
sprockets-rails (~> 2.0.0)
shoulda-matchers (2.4.0)
activesupport (>= 3.0.0)
slop (3.4.7)
sprockets (2.10.1)
hike (~> 1.2)
multi_json (~> 1.0)
rack (~> 1.0)
tilt (~> 1.1, != 1.3.0)
sprockets-rails (2.0.1)
actionpack (>= 3.0)
activesupport (>= 3.0)
sprockets (~> 2.8)
stringex (2.1.2)
thor (0.18.1)
thread_safe (0.1.3)
atomic
tilt (1.4.1)
treetop (1.4.15)
polyglot
polyglot (>= 0.3.1)
turbolinks (2.1.0)
coffee-rails
twitter-bootstrap-rails (2.2.8)
actionpack (>= 3.1)
execjs
rails (>= 3.1)
railties (>= 3.1)
typhoeus (0.6.7)
ethon (~> 0.6.2)
tzinfo (0.3.38)
uglifier (2.4.0)
execjs (>= 0.3.0)
json (>= 1.8.0)
whois (3.4.2)

PLATFORMS
ruby

DEPENDENCIES
addressable
coffee-rails (~> 4.0.0)
decent_exposure
decent_generators
dotenv-rails
factory_girl
fivemat
haml
haml-rails
jbuilder (~> 1.2)
jquery-rails
librato-logreporter
nokogiri
pg
pry
pry-rails
rails
rspec
rspec-rails
sass-rails (~> 4.0.0)
shoulda-matchers
stringex
turbolinks
twitter-bootstrap-rails
typhoeus
uglifier (>= 1.3.0)
whois
1 change: 1 addition & 0 deletions Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
worker: bundle exec rake pages:work
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
### Rap Genius Trackback Scraper

This is the tool we used to scrape 178k URLs in 15 minutes in order to find which pages were hosting potentially spammy Rap Genius links. Given a list of URLs to scrape, it creates aggregate information that identifies the spammiest sites for manual review.

For more details on the motivation and background for this repository, check out [the blog post on Rap Genius](http://news.rapgenius.com/Rap-genius-founders-rap-genius-is-back-on-google-lyrics).

### Setup

You can run the scrape process using a set of sample data in vendor/urls.txt. To get started:

```sh
$ bundle install && rake db:create db:migrate urls:import
$ gem install foreman
$ mkdir tmp
$ foreman start worker
```

Then, once the pages have all been scraped (i.e., `Page.unscraped.count == 0`):

```ruby
# from the console
Page.write_report!
```

### License
MIT
6 changes: 6 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.

# Load config/application.rb, resolved relative to this Rakefile, so that the
# TrackbackScraper::Application constant referenced below is defined.
require File.expand_path('../config/application', __FILE__)

# Ask the application to load its Rake tasks.
TrackbackScraper::Application.load_tasks
Empty file added app/assets/images/.keep
Empty file.
17 changes: 17 additions & 0 deletions app/assets/javascripts/application.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// This is a manifest file that'll be compiled into application.js, which will include all the files
// listed below.
//
// Any JavaScript/Coffee file within this directory, lib/assets/javascripts, vendor/assets/javascripts,
// or vendor/assets/javascripts of plugins, if any, can be referenced here using a relative path.
//
// It's not advisable to add code directly here, but if you do, it'll appear at the bottom of the
// compiled file.
//
// Read Sprockets README (https://github.com/sstephenson/sprockets#sprockets-directives) for details
// about supported directives.
//
//= require jquery
//= require jquery_ujs
//= require twitter/bootstrap
//= require turbolinks
//= require_tree .
3 changes: 3 additions & 0 deletions app/assets/javascripts/bootstrap.js.coffee
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Once the DOM is ready, activate Bootstrap popovers and tooltips on links
# carrying the matching rel token and on elements with the opt-in classes.
jQuery(document).ready ->
  $("a[rel~=popover], .has-popover").popover()
  $("a[rel~=tooltip], .has-tooltip").tooltip()
3 changes: 3 additions & 0 deletions app/assets/javascripts/static.js.coffee
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Place all the behaviors and hooks related to the matching controller here.
# All this logic will automatically be available in application.js.
# You can use CoffeeScript in this file: http://coffeescript.org/
13 changes: 13 additions & 0 deletions app/assets/stylesheets/application.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/*
* This is a manifest file that'll be compiled into application.css, which will include all the files
* listed below.
*
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
* or vendor/assets/stylesheets of plugins, if any, can be referenced here using a relative path.
*
* You're free to add application-wide styles to this file and they'll appear at the top of the
* compiled file, but it's generally better to create a new file per style scope.
*
*= require_self
*= require_tree .
*/
7 changes: 7 additions & 0 deletions app/assets/stylesheets/bootstrap_and_overrides.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/*
=require twitter-bootstrap-static/bootstrap
Use Font Awesome icons (default)
To use Glyphicons sprites instead of Font Awesome, replace with "require twitter-bootstrap-static/sprites"
=require twitter-bootstrap-static/fontawesome
*/
2 changes: 2 additions & 0 deletions app/assets/stylesheets/global.sass
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.container-fluid
margin-top: 50px
3 changes: 3 additions & 0 deletions app/assets/stylesheets/static.css.scss
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Place all the styles related to the static controller here.
// They will automatically be included in application.css.
// You can use Sass (SCSS) here: http://sass-lang.com/
5 changes: 5 additions & 0 deletions app/controllers/application_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Base controller for the application; StaticController inherits from it.
# The only behavior configured here is CSRF protection.
class ApplicationController < ActionController::Base
# Prevent CSRF attacks by raising an exception.
# For APIs, you may want to use :null_session instead.
protect_from_forgery with: :exception
end
Empty file added app/controllers/concerns/.keep
Empty file.
2 changes: 2 additions & 0 deletions app/controllers/static_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Controller with no actions of its own defined in this file; everything it
# does comes from ApplicationController.
class StaticController < ApplicationController
end
2 changes: 2 additions & 0 deletions app/helpers/application_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Empty helper module (app/helpers); no helper methods are defined here yet.
module ApplicationHelper
end
2 changes: 2 additions & 0 deletions app/helpers/static_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Empty helper module (app/helpers); no helper methods are defined here yet.
module StaticHelper
end
12 changes: 12 additions & 0 deletions app/helpers/string_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Utilities for normalizing the encoding of strings whose real encoding is
# unknown or mislabeled. Methods can be called directly on the module
# (StringHelper.coerce_to_utf8) or mixed in.
module StringHelper
  extend self

  # Returns a copy of +input+ that is guaranteed to be valid UTF-8.
  #
  # The bytes of +input+ are reinterpreted as UTF-8 regardless of the encoding
  # the string is currently tagged with. If they already form valid UTF-8 the
  # copy is returned unchanged. Otherwise each invalid byte sequence is
  # replaced with U+FFFD (the Unicode replacement character), while every
  # valid character -- including multi-byte ones -- is preserved.
  #
  # @param input [String] string in any encoding; it is never mutated
  # @return [String] a new, valid, UTF-8 encoded string
  def coerce_to_utf8(input)
    output = input.dup.force_encoding("UTF-8")

    return output if output.valid_encoding?

    # Transcode through UTF-16BE and back so that only the invalid sequences
    # are substituted. Re-tagging the whole string as BINARY would turn every
    # non-ASCII byte into U+FFFD, destroying valid characters as well. The
    # round-trip (rather than a UTF-8 -> UTF-8 encode or String#scrub) keeps
    # this working on Ruby 2.0, where same-encoding conversion is a no-op.
    output
      .encode("UTF-16BE", invalid: :replace, undef: :replace)
      .encode("UTF-8")
  end
end
Empty file added app/mailers/.keep
Empty file.
Loading

0 comments on commit 0c86123

Please sign in to comment.