Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Populate the 'authors' table with unique commit authors. #386

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
14 changes: 14 additions & 0 deletions barkeep_server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def ensure_required_params(*required_params)
permission = User.find(:permission => "admin").nil? ? "admin" : "normal"
User.new(:email => email, :name => email, :permission => permission).save
end
link_author_to_user(email)
redirect session[:login_started_url] || "/"
end
end
Expand Down Expand Up @@ -559,6 +560,19 @@ def get_openid_login_redirect(openid_provider_url)
end
end

# Links the author record for the given email to the user account that has the same email, by
# storing the user's id in the author's "user_id" field. Nothing is changed when there is no such
# author, when the author already has a user_id, or when no user has that email.
def link_author_to_user(email)
  matching_author = Author.first(:email => email)
  return unless matching_author && matching_author.user_id.nil?

  matching_user = User.first(:email => email)
  return unless matching_user

  matching_author.user_id = matching_user.id
  matching_author.save
end

def create_comment(repo_name, sha, filename, line_number_string, text)
commit = MetaRepo.instance.db_commit(repo_name, sha)
raise "No such commit." unless commit
Expand Down
1 change: 1 addition & 0 deletions lib/models.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# Auto-populate "created_at" and "updated_at" fields.
Sequel::Model.plugin :timestamps

require "models/author"
require "models/git_repo"
require "models/git_branch"
require "models/user"
Expand Down
53 changes: 53 additions & 0 deletions migrations/20130116081012_populate_authors_table.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
require "bundler/setup"
require "pathological"
require "migrations/migration_helper.rb"
require "grit"

# This migration populates the "authors" table by fetching all the commits from all the repos
# and storing the unique commit authors in the "authors" table. If it finds an identical email
# address in the "users" table then it also adds the "user_id" to the authors table entry.

# Number of commits requested from Grit per page while walking a repo's history.
PAGE_SIZE = 100

Sequel.migration do
  # Collects every distinct author email seen on the "master" branch of each repo, attaches the id
  # of the user with the same email when there is one, and inserts the emails that are not yet
  # present in the "authors" table.
  up do
    # email => { :name => ..., :user_id => ... }; the default block creates the inner hash on first use.
    authors_by_email = Hash.new { |hash, email| hash[email] = {} }

    DB[:git_repos].all.each do |repo_row|
      grit_repo = Grit::Repo.new(repo_row[:path])
      # Doubles as the running commit count and as the offset passed for the next page.
      fetched = 0
      loop do
        page = grit_repo.commits("master", PAGE_SIZE, fetched)
        # When an email shows up more than once, the name from the last commit processed wins.
        page.each { |commit| authors_by_email[commit.author.email][:name] = commit.author.name }
        fetched += page.length
        # Anything other than an exactly full page ends the walk for this repo.
        break unless page.length == PAGE_SIZE
      end
      puts "Processed #{fetched} commits from repo #{repo_row[:path]}."
    end

    # Attach the id of the user (if any) whose email matches the author's email.
    authors_by_email.each do |email, details|
      matching_user = DB[:users].first(:email => email)
      details[:user_id] = matching_user[:id] if matching_user
    end
    puts "Found #{authors_by_email.length} unique authors."

    # Insert only the emails that the "authors" table does not contain yet.
    inserted = 0
    authors_by_email.each do |email, details|
      next if DB[:authors].first(:email => email)
      DB[:authors].insert(:email => email, :name => details[:name], :user_id => details[:user_id])
      inserted += 1
    end
    puts "Inserted #{inserted} new authors."
  end

  # Deliberately a no-op: the inserted author rows are left in place on rollback.
  down do
  end
end
11 changes: 11 additions & 0 deletions migrations/20130116151408_add_author_id_to_commits.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
require "bundler/setup"
require "pathological"
require "migrations/migration_helper.rb"

# Adds an "author_id" column to the "commits" table as a foreign key referencing authors.id.
Sequel.migration do
  change do
    alter_table(:commits) { add_foreign_key :author_id, :authors, :key => :id }
  end
end
55 changes: 55 additions & 0 deletions migrations/20130116152425_populate_author_id_field.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
require "bundler/setup"
require "pathological"
require "migrations/migration_helper.rb"
require "grit"

# This migration populates the "author_id" field in the "commits" table.

Sequel.migration do
  # Sets commits.author_id on every commit row where it is still NULL. The author is looked up by
  # the email Grit reports for the commit; emails missing from the "authors" table are inserted
  # first (linked to the user with the same email, when there is one).
  up do
    # Fetch all the authors (expected to be small, say less than 1000) so that we don't have to do
    # an extra SQL query for every commit to check if the author email exists. Then create a mapping
    # from email -> author.id
    authors = {}
    DB[:authors].all.each { |row| authors[row[:email]] = row[:id] }

    # Fetch all the git repos (also small, less than 100) and create a mapping from
    # git_repo_id -> Grit::Repo
    repos = {}
    DB[:git_repos].each { |row| repos[row[:id]] = Grit::Repo.new(row[:path]) }

    total_updates = 0
    new_authors = 0
    DB[:commits].filter(:author_id => nil).all.each do |row|
      # A commit row whose git_repo_id has no matching git_repos row cannot be resolved; skip it
      # instead of aborting the whole migration with a NoMethodError on nil.
      repo = repos[row[:git_repo_id]]
      next unless repo

      # Likewise skip commits that Grit cannot find in the repo.
      commit = repo.commit(row[:sha])
      next unless commit

      email = commit.author.email
      author_id = authors[email]
      # If the author is not in our db, then add it.
      if author_id.nil?
        # Check if the same email exists in the users table.
        user = DB[:users].first(:email => email)
        user_id = user ? user[:id] : nil
        DB[:authors].insert(:email => email, :name => commit.author.name, :user_id => user_id)

        # Read the new row back to get its id and cache it for later commits by the same author.
        author_id = DB[:authors].first(:email => email)[:id]
        authors[email] = author_id
        new_authors += 1
      end

      DB[:commits].filter(:id => row[:id]).update(:author_id => author_id)
      total_updates += 1
    end
    puts "New authors: #{new_authors}"
    puts "Updated commits: #{total_updates}"
  end

  # We don't need to undo this.
  down do
  end
end
5 changes: 5 additions & 0 deletions models/author.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# A commit author, looked up by email elsewhere in the app, and optionally linked to a user
# through the "user_id" column.
class Author < Sequel::Model
  # This is really one_to_one, but Sequel requires the table containing the foreign key to be many_to_one.
  many_to_one :user

  # All commits attributed to this author; the foreign key (author_id) lives on the commits table.
  one_to_many :commits

  # Kept for backward compatibility. It exposes only a single commit even though an author can
  # have many; prefer the "commits" association above.
  one_to_one :commit
end
3 changes: 3 additions & 0 deletions models/commit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ class Commit < Sequel::Model
one_to_many :comments
many_to_one :approved_by_user, :class => User

# Several commits can share one author, so the commits table holds the foreign key (author_id).
many_to_one :author

add_association_dependencies :comments => :destroy, :commit_files => :destroy

add_filter(:message) { |message| StringFilter.escape_html(message) }
Expand Down
1 change: 1 addition & 0 deletions models/user.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
class User < Sequel::Model
one_to_many :saved_searches, :order => [:user_order.desc]
one_to_many :comments
one_to_one :author

ONE_YEAR = 365

Expand Down
16 changes: 16 additions & 0 deletions resque_jobs/db_commit_ingest.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def self.perform(repo_name, remote_name)

page_of_rows_to_insert = commits.map do |commit|
next if existing_shas.include?(commit.sha)
author_id = insert_author_if_new(commit)

{
:git_repo_id => db_repo.id,
Expand All @@ -52,6 +53,7 @@ def self.perform(repo_name, remote_name)
# NOTE(caleb): For some reason, the commit object you get from a remote returns nil for #date (but
# it does have #authored_date and #committed_date. Bug?
:date => commit.authored_date,
:author_id => author_id,
}
end
page_of_rows_to_insert.compact!
Expand All @@ -78,4 +80,18 @@ def self.perform(repo_name, remote_name)
Resque.enqueue(GenerateTaggedDiffs, repo_name, row[:sha])
end
end

# Finds the author of the given commit by email in the authors table, inserting a new row when
# none exists yet. A newly inserted author gets the id of the user with the same email, if there
# is one. Returns the author's id in both cases.
def self.insert_author_if_new(commit)
  email = commit.author.email
  known_author = Author.first(:email => email)
  return known_author.id if known_author

  matching_user = User.first(:email => email)
  Author.insert(
    :email => email,
    :name => commit.author.name,
    :user_id => matching_user ? matching_user.id : nil
  )
  # Read the freshly written row back to obtain its id.
  Author.first(:email => email).id
end
end