From d57af88cee3b3885b8c929ead4c6af2ff6948520 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Wed, 16 Jan 2013 13:49:10 -0800 Subject: [PATCH 1/7] Populate the 'authors' table with unique commit authors. --- .../20130116081012_populate_authors_table.rb | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 migrations/20130116081012_populate_authors_table.rb diff --git a/migrations/20130116081012_populate_authors_table.rb b/migrations/20130116081012_populate_authors_table.rb new file mode 100644 index 00000000..b8584a83 --- /dev/null +++ b/migrations/20130116081012_populate_authors_table.rb @@ -0,0 +1,53 @@ +require "bundler/setup" +require "pathological" +require "migrations/migration_helper.rb" +require "grit" + +# This migration populates the "authors" table by fetching all the commits from all the repos +# and storing the unique commit authors in the "authors" table. If it finds an identical email +# address in the "users" table then it also adds the "user_id" to the authors table entry. + +# This is the number of commits that we fetch at a time. +PAGE_SIZE = 100 + +Sequel.migration do + up do + repos = DB[:git_repos].all + + # Find all the unique authors + authors = Hash.new { |hash, key| hash[key] = {} } + repos.each do |repo| + grit_repo = Grit::Repo.new(repo[:path]) + total = 0 + num = 0 + begin + commits = grit_repo.commits("master", PAGE_SIZE, total) + commits.each { |commit| authors[commit.author.email][:name] = commit.author.name } + num = commits.length + total += num + end while num == PAGE_SIZE + puts "Processed #{total} commits from repo #{repo[:path]}." + end + + # Find matching users (by email) in the "users" table. + authors.keys.each do |email| + user = DB[:users].first(:email => email) + authors[email][:user_id] = user[:id] if user + end + puts "Found #{authors.length} unique authors." + + # Fill in the "authors" table. 
+ num_inserts = 0 + authors.each do |key, value| + row = DB[:authors].first(:email => key) + next if row + DB[:authors].insert(:email => key, :name => value[:name], :user_id => value[:user_id]) + num_inserts += 1 + end + puts "Inserted #{num_inserts} new authors." + end + + # We don't need to remove the author entries. + down do + end +end From 3a01ed36d71b06c156c9fec9e838cfff8b9bb580 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Wed, 16 Jan 2013 15:20:06 -0800 Subject: [PATCH 2/7] Add author_id field to 'commits' table. --- migrations/20130116151408_add_author_id_to_commits.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 migrations/20130116151408_add_author_id_to_commits.rb diff --git a/migrations/20130116151408_add_author_id_to_commits.rb b/migrations/20130116151408_add_author_id_to_commits.rb new file mode 100644 index 00000000..263f1730 --- /dev/null +++ b/migrations/20130116151408_add_author_id_to_commits.rb @@ -0,0 +1,11 @@ +require "bundler/setup" +require "pathological" +require "migrations/migration_helper.rb" + +Sequel.migration do + change do + alter_table :commits do + add_foreign_key :author_id, :authors, :key => :id + end + end +end From 999e0b554af3abc9bbaa414de98fad5dcb383bc6 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Wed, 16 Jan 2013 16:25:37 -0800 Subject: [PATCH 3/7] Populate the author_id field in the commits table. 
--- ...20130116152425_populate_author_id_field.rb | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 migrations/20130116152425_populate_author_id_field.rb diff --git a/migrations/20130116152425_populate_author_id_field.rb b/migrations/20130116152425_populate_author_id_field.rb new file mode 100644 index 00000000..7de5d713 --- /dev/null +++ b/migrations/20130116152425_populate_author_id_field.rb @@ -0,0 +1,62 @@ +require "bundler/setup" +require "pathological" +require "migrations/migration_helper.rb" +require "grit" + +# This migration populates the "author_id" field in the "commits" table. + +# This is the number of commits that we fetch at a time. +PAGE_SIZE = 100 + +Sequel.migration do + up do + repos = DB[:git_repos].all + + # Fetch all the authors (expected to be small, say less than 1000) so that we don't have to do + # an extra SQL query for every commit to check if the author email exists. Then create a mapping + # from email -> author.id + rows = DB[:authors].all + puts "Read #{rows.length} rows from the 'authors' table." + authors = {} + rows.each do |row| + authors[row[:email]] = row[:id] + end + + # Create a mapping from commit sha -> author email + shas = Hash.new { |hash, key| hash[key] = {} } + repos.each do |repo| + grit_repo = Grit::Repo.new(repo[:path]) + total = 0 + num = 0 + begin + commits = grit_repo.commits("master", PAGE_SIZE, total) + commits.each do |commit| + email = commit.author.email + author_id = authors[email] + if author_id.nil? + # This shouldn't happen. We should already have the author in our db. But if not, + # add the author to the db. + user = DB[:users].first(:email => email) + user_id = user[:id] if user + DB[:authors].insert(:email => email, :name => commit.author.name, :user_id => user_id) + + # Get the author_id and add it to the hash. + author = DB[:authors].first(:email => email) + author_id = author[:id] + authors[email] = author_id + end + + # Update the author_id field. 
+ DB[:commits].filter(:sha => commit.sha).update(:author_id => author_id) + end + num = commits.length + total += num + end while num == PAGE_SIZE + puts "Processed #{total} commits from repo #{repo[:path]}." + end + end + + # We don't need to undo this. + down do + end +end From 2203c968a988a2c81435d7be38e4e9bdfffce9b4 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Wed, 16 Jan 2013 17:55:30 -0800 Subject: [PATCH 4/7] Insert new commit authors into 'authors' table when new commits are ingested. This also fills in the "author_id" field for new commits. --- lib/models.rb | 1 + models/author.rb | 4 ++++ models/user.rb | 1 + resque_jobs/db_commit_ingest.rb | 16 ++++++++++++++++ 4 files changed, 22 insertions(+) create mode 100644 models/author.rb diff --git a/lib/models.rb b/lib/models.rb index 09adbc3c..52b8c35e 100644 --- a/lib/models.rb +++ b/lib/models.rb @@ -11,6 +11,7 @@ # Auto-populate "created_at" and "updated_at" fields. Sequel::Model.plugin :timestamps +require "models/author" require "models/git_repo" require "models/git_branch" require "models/user" diff --git a/models/author.rb b/models/author.rb new file mode 100644 index 00000000..883bf424 --- /dev/null +++ b/models/author.rb @@ -0,0 +1,4 @@ +class Author < Sequel::Model + # This is really one_to_one, but Sequel requires the table containing the foreign key to be many_to_one. 
+ many_to_one :user +end diff --git a/models/user.rb b/models/user.rb index 37ae46e6..6ade5607 100644 --- a/models/user.rb +++ b/models/user.rb @@ -12,6 +12,7 @@ class User < Sequel::Model one_to_many :saved_searches, :order => [:user_order.desc] one_to_many :comments + one_to_one :author ONE_YEAR = 365 diff --git a/resque_jobs/db_commit_ingest.rb b/resque_jobs/db_commit_ingest.rb index 5186645b..afcdf87c 100644 --- a/resque_jobs/db_commit_ingest.rb +++ b/resque_jobs/db_commit_ingest.rb @@ -44,6 +44,7 @@ def self.perform(repo_name, remote_name) page_of_rows_to_insert = commits.map do |commit| next if existing_shas.include?(commit.sha) + author_id = insert_author_if_new(commit) { :git_repo_id => db_repo.id, @@ -52,6 +53,7 @@ def self.perform(repo_name, remote_name) # NOTE(caleb): For some reason, the commit object you get from a remote returns nil for #date (but # it does have #authored_date and #committed_date. Bug? :date => commit.authored_date, + :author_id => author_id, } end page_of_rows_to_insert.compact! @@ -78,4 +80,18 @@ def self.perform(repo_name, remote_name) Resque.enqueue(GenerateTaggedDiffs, repo_name, row[:sha]) end end + + # Given a new commit, insert the author into the authors table if the author does not exist. + # In any case, returns the author id. + def self.insert_author_if_new(commit) + email = commit.author.email + author = Author.first(:email => email) + return author.id if author + + user = User.first(:email => email) + user_id = user.id if user + Author.insert(:email => email, :name => commit.author.name, :user_id => user_id) + author = Author.first(:email => email) + author.id + end end From 3d56362c640062fc21ff668e6b13b3a34dbf7bb9 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Thu, 17 Jan 2013 11:13:35 -0800 Subject: [PATCH 5/7] Link authors to users when they sign in. 
--- barkeep_server.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/barkeep_server.rb b/barkeep_server.rb index d2176e7e..a001fb61 100644 --- a/barkeep_server.rb +++ b/barkeep_server.rb @@ -247,6 +247,7 @@ def ensure_required_params(*required_params) permission = User.find(:permission => "admin").nil? ? "admin" : "normal" User.new(:email => email, :name => email, :permission => permission).save end + link_author_to_user(email) redirect session[:login_started_url] || "/" end end @@ -559,6 +560,19 @@ def get_openid_login_redirect(openid_provider_url) end end + # If the email matches an entry in both the authors table and the users table, then create + # a link between them by updating the "user_id" field in the authors table. + def link_author_to_user(email) + author = Author.first(:email => email) + if author && author.user_id.nil? + user = User.first(:email => email) + if user + author.user_id = user.id + author.save + end + end + end + def create_comment(repo_name, sha, filename, line_number_string, text) commit = MetaRepo.instance.db_commit(repo_name, sha) raise "No such commit." unless commit From 31198ce1b5eb8cf3af067f9dcabcbbbe7d8a12aa Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Thu, 17 Jan 2013 13:07:08 -0800 Subject: [PATCH 6/7] Modified the migration that populates the author_id field to iterate over the db commits (instead of Grit). --- ...20130116152425_populate_author_id_field.rb | 67 +++++++++---------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/migrations/20130116152425_populate_author_id_field.rb b/migrations/20130116152425_populate_author_id_field.rb index 7de5d713..35800f4c 100644 --- a/migrations/20130116152425_populate_author_id_field.rb +++ b/migrations/20130116152425_populate_author_id_field.rb @@ -5,55 +5,48 @@ # This migration populates the "author_id" field in the "commits" table. -# This is the number of commits that we fetch at a time. 
-PAGE_SIZE = 100 - Sequel.migration do up do - repos = DB[:git_repos].all - # Fetch all the authors (expected to be small, say less than 1000) so that we don't have to do # an extra SQL query for every commit to check if the author email exists. Then create a mapping # from email -> author.id rows = DB[:authors].all - puts "Read #{rows.length} rows from the 'authors' table." authors = {} rows.each do |row| authors[row[:email]] = row[:id] end - # Create a mapping from commit sha -> author email - shas = Hash.new { |hash, key| hash[key] = {} } - repos.each do |repo| - grit_repo = Grit::Repo.new(repo[:path]) - total = 0 - num = 0 - begin - commits = grit_repo.commits("master", PAGE_SIZE, total) - commits.each do |commit| - email = commit.author.email - author_id = authors[email] - if author_id.nil? - # This shouldn't happen. We should already have the author in our db. But if not, - # add the author to the db. - user = DB[:users].first(:email => email) - user_id = user[:id] if user - DB[:authors].insert(:email => email, :name => commit.author.name, :user_id => user_id) - - # Get the author_id and add it to the hash. - author = DB[:authors].first(:email => email) - author_id = author[:id] - authors[email] = author_id - end - - # Update the author_id field. - DB[:commits].filter(:sha => commit.sha).update(:author_id => author_id) - end - num = commits.length - total += num - end while num == PAGE_SIZE - puts "Processed #{total} commits from repo #{repo[:path]}." + # Fetch all the git repos (also small, less than 100) and create a mapping from + # git_repo_id -> Grit::Repo + repos = {} + DB[:git_repos].each { |row| repos[row[:id]] = Grit::Repo.new(row[:path]) } + + total_updates = 0 + new_authors = 0 + commits = DB[:commits].filter(:author_id => nil).all + commits.each do |row| + commit = repos[row[:git_repo_id]].commit(row[:sha]) + next unless commit + email = commit.author.email + author_id = authors[email] + # If the author is not in our db, then add it. 
+ if author_id.nil? + # Check if the same email exists in the users table. + user = DB[:users].first(:email => email) + user_id = user[:id] if user + DB[:authors].insert(:email => email, :name => commit.author.name, :user_id => user_id) + + # Get the author_id and add it to the hash. + author = DB[:authors].first(:email => email) + author_id = author[:id] + authors[email] = author_id + new_authors += 1 + end + total_updates += 1 + DB[:commits].filter(:id => row[:id]).update(:author_id => authors[email]) end + puts "New authors: #{new_authors}" + puts "Updated commits: #{total_updates}" end # We don't need to undo this. From f850dd5e1ea22348444b5a82c3fdfe73ac829704 Mon Sep 17 00:00:00 2001 From: Jack Veenstra Date: Thu, 17 Jan 2013 17:31:48 -0800 Subject: [PATCH 7/7] Add Sequel associations between Author and Commit. --- models/author.rb | 1 + models/commit.rb | 3 +++ 2 files changed, 4 insertions(+) diff --git a/models/author.rb b/models/author.rb index 883bf424..2af9e257 100644 --- a/models/author.rb +++ b/models/author.rb @@ -1,4 +1,5 @@ class Author < Sequel::Model # This is really one_to_one, but Sequel requires the table containing the foreign key to be many_to_one. many_to_one :user + one_to_one :commit end diff --git a/models/commit.rb b/models/commit.rb index a50da71e..de9dbc32 100644 --- a/models/commit.rb +++ b/models/commit.rb @@ -12,6 +12,9 @@ class Commit < Sequel::Model one_to_many :comments many_to_one :approved_by_user, :class => User + # Each commit has exactly one author, while an author may have many commits; the author_id foreign key lives on the commits table. + many_to_one :author + add_association_dependencies :comments => :destroy, :commit_files => :destroy add_filter(:message) { |message| StringFilter.escape_html(message) }