Skip to content

Commit

Permalink
Merge pull request #897 from Ensembl/feature/factory_integration_to_h…
Browse files Browse the repository at this point in the history
…ive_pipeline

Integration of genome factory for alphafold pipeline and recent chang…
  • Loading branch information
vinay-ebi authored Mar 20, 2024
2 parents b98df7c + 77cc897 commit 5cc46e6
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 107 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,12 @@
-species Production name of species to process. For a collection DB, any species from the DB
-species_list List of species production names. For a single species, a list
with just this species. For a collection DB, list of all species in the DB
-db_dir Path to the uniparc-to-uniprot DB and the uniprot-to-alpha DB, both in LevelDB format
-rest_server GIFTS rest server to fetch the perfect matches data from.
-db_dir Path to the uniparc-to-uniprot DB and the uniprot-to-alpha DB, both in KyotoCabinet format
-cs_version Needed for GIFTS, otherwise optional. Name of assembly, like 'GRCh38'.
- gifts_host The GIFTS DB hostname
- gifts_db The GIFTS DB schema name
- gifts_user The GIFTS DB user name
- gifts_pass The GIFTS DB password
=head1 EXAMPLE USAGE
Expand All @@ -69,8 +72,8 @@
-species homo_sapiens
-species_list "['homo_sapiens']"
-db_dir /hps/scratch/...
-rest_server 'https://www.ebi.ac.uk/gifts/api/'
-registry my_reg.pm
-gifts_pass 'ThePassword'
=cut

Expand All @@ -85,17 +88,16 @@ use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::ProteinFeature;
use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Utils::Exception qw(throw info);
use Bio::EnsEMBL::GIFTS::DB qw(fetch_latest_uniprot_enst_perfect_matches);
use Tie::LevelDB;
use Fcntl qw(:flock);
use DBD::Pg; # For GIFTS
use KyotoCabinet;

sub fetch_input {
my $self = shift;

$self->param_required('species');
$self->param_required('rest_server');
$self->param_required('db_dir');
$self->param_required('species_list');
$self->param_required('gifts_pass');

return 1;
}
Expand All @@ -107,8 +109,8 @@ sub run {
# Bio::EnsEMBL::Utils::Exception::verbose('INFO');

my $db_path = $self->param_required('db_dir');
my $idx_dir_al = $db_path . '/uniprot-to-alpha.leveldb';
my $idx_dir_up = $db_path . '/uniparc-to-uniprot.leveldb';
my $idx_dir_al = $db_path . '/uniprot-to-alphafold/uniprot-to-alphafold.kch';
my $idx_dir_up = $db_path . '/uniparc-to-uniprot/uniparc-to-uniprot.kch';

my $species = $self->param_required('species');
my $species_list = $self->param_required('species_list');
Expand All @@ -129,21 +131,33 @@ sub run {

info("Initiating and creating the analysis object for species $species\n");

dblock($db_path);

my %alpha_db;
tie(%alpha_db, 'Tie::LevelDB', $idx_dir_al)
or die "Error trying to tie Tie::LevelDB $idx_dir_al: $!";

my $al_string;
while (my ($k, $v) = each %alpha_db) {
$al_string = $v;
my $alpha_version = 0;
my $mapsize_gb = 4 << 30;
my $db = tie(my %db, 'KyotoCabinet::DB', "$idx_dir_al#msiz=$mapsize_gb#opts=l");
# traverse records by iterator
while (my ($key, $value) = each(%db)) {
$alpha_version = (split ',', $value)[-1];
chomp($alpha_version);
last;
}
untie(%alpha_db);

my $alpha_version = (split ',', $al_string)[-1];
$alpha_version //= 0;
untie %db;
untie $db;
die "Unable to extract AlphaFold version" unless $alpha_version;

my $alpha_db = new KyotoCabinet::DB;
# Set 4 GB mmap size
$alpha_db->open("$idx_dir_al#msiz=$mapsize_gb#opts=l",
$alpha_db->OREADER
) or die "Error opening DB: " . $alpha_db->error();

my $gifts_host = $self->param('gifts_host') // 'pgsql-hlvm-051.ebi.ac.uk';
my $gifts_db = $self->param('gifts_db') // 'gtigftpro';
my $gifts_user = $self->param('gifts_user') // 'giftsrw';
my $gifts_pass = $self->param_required('gifts_pass');
my $gifts_dbh = DBI->connect("dbi:Pg:dbname=$gifts_db;host=$gifts_host", $gifts_user, $gifts_pass,
{AutoCommit => 0, RaiseError => 1, PrintError => 0}
);

my $analysis = new Bio::EnsEMBL::Analysis(
-logic_name => 'alphafold',
Expand All @@ -164,24 +178,24 @@ sub run {

if (defined $self->param('cs_version')) {
my $assembly = $self->param('cs_version');
my $gifts_dir = $self->param_required('gifts_dir');
opendir(my $dh, $gifts_dir) or die "Error opening dir $gifts_dir: $!";
for my $file (readdir($dh)) {
next unless $file =~ /^(.*?)-(.*)-rel(.*?)\.csv$/;
info("Found GIFTS file $file, species: $1, assembly: $2, release: $3");
if ($2 eq $assembly) {
$mappings = read_gifts_data("$gifts_dir/$file");
info("Read data for GIFTS from file $gifts_dir/file, " . (scalar (keys ($mappings))) . " entries");
# grab GIFTS species / assembly data
my $gifts_data = read_gifts_species($gifts_dbh);
for my $row (@$gifts_data) {
my ($g_species_id, $g_species, $g_assembly, $g_release) = @$row;
info("Found species in GIFTS DB: species: $g_species, assembly: $g_assembly, release: $g_release");
if ($g_assembly eq $assembly) {
$mappings = read_gifts_data($gifts_dbh, $g_species_id);
info("Read data for GIFTS, " . (scalar (keys %$mappings)) . " entries");
last;
}
}
closedir($dh);
}
$gifts_dbh->disconnect();

my $no_uniparc = 0;
my $no_uniprot = 0;
my $no_dbid = 0;
my $protein_count = 0;
my $no_alpha = 0;
if (! $mappings or ! %$mappings) {

# If we don't have data in $mappings, this species is not in GIFTS.
Expand All @@ -190,38 +204,54 @@ sub run {
# alphafold data using the uniprot id and insert a protein feature using
# the dbid.


tie(my %uniprot_db, 'Tie::LevelDB', $idx_dir_up)
or die "Error trying to tie Tie::LevelDB $idx_dir_up: $!";
my $uniprot_db = new KyotoCabinet::DB;
# Set 4 GB mmap size
my $mapsize_gb = 4 << 30;
$uniprot_db->open("$idx_dir_up#msiz=$mapsize_gb#opts=l",
$uniprot_db->OREADER
) or die "Error opening DB: " . $uniprot_db->error();

# We currently have the same uniparc accession tied to the same
# translation_id but in different versions (xref pipeline run
# 'xrefchecksum' and 'uniparc_checksum')
my $sql = <<SQL;
SELECT xr.dbprimary_acc as uniparc_id, tr.stable_id, tr.translation_id
FROM xref xr, object_xref ox, translation tr
where external_db_id = (SELECT external_db_id FROM external_db where db_name = 'UniParc')
and xr.xref_id = ox.xref_id
and ox.ensembl_id = tr.translation_id
and ox.ensembl_object_type = 'Translation'
group by uniparc_id, tr.stable_id, tr.translation_id
SQL
my $sql = "
SELECT xr.dbprimary_acc as uniparc_id, tr.stable_id, tr.translation_id
FROM xref xr, object_xref ox, translation tr
where external_db_id = (SELECT external_db_id FROM external_db where db_name = 'UniParc')
and xr.xref_id = ox.xref_id
and ox.ensembl_id = tr.translation_id
and ox.ensembl_object_type = 'Translation'
group by uniparc_id, tr.stable_id, tr.translation_id
";

my $sth = $dbc->prepare($sql);
$sth->execute;
while ( my @row = $sth->fetchrow_array ) {
while (my @row = $sth->fetchrow_array) {
$protein_count++;
my ($uniparc_id, $stable_id, $dbid) = @row;
my $uniprot_id = $uniprot_db{$uniparc_id};
unless ($uniprot_id) {
my $uniprot_str = $uniprot_db->get($uniparc_id);
unless ($uniprot_str) {
$no_uniparc++;
next;
}
push @{$mappings->{$uniprot_id}}, {'uniparc' => $uniparc_id, 'dbid' => $dbid, 'ensid' => $stable_id};
my @uniprots = split("\t", $uniprot_str);

my $at_least_one_alpha_mapping = 0;
for my $uniprot_id (@uniprots) {
my $alpha_data = $alpha_db->get($uniprot_id);
next unless $alpha_data;
push @{$mappings->{$uniprot_id}},
{'uniparc' => $uniparc_id, 'dbid' => $dbid, 'ensid' => $stable_id, 'alpha' => $alpha_data};
$at_least_one_alpha_mapping = 1;
}
unless ($at_least_one_alpha_mapping) {
$no_alpha++;
info("No alphafold data for: UniParc acc $uniparc_id, stable_id $stable_id, translation_id $dbid");
}
}
info("Num proteins in DB $protein_count, no uniparc $no_uniparc");

untie %uniprot_db;
$uniprot_db->close() or die "Error closing DB: " . $uniprot_db->error();

} else {

Expand Down Expand Up @@ -254,22 +284,27 @@ SQL
my ($dbid, $stable_id) = @row;

unless (exists $rev_mappings{$stable_id}) {
info("No entry in GIFTS for stable ID $stable_id");
$no_dbid++;
info("No entry in GIFTS for stable ID $stable_id");
next;
}
my @rmap = @{$rev_mappings{$stable_id}};

my $at_least_one_alpha_mapping = 0;
for my $uniprot_id (@rmap) {
unless ($uniprot_id) {
$no_uniprot++;
next;
}
push @{$mappings->{$uniprot_id}}, {'dbid' => $dbid, 'ensid' => $stable_id};
my $alpha_data = $alpha_db->get($uniprot_id);
next unless $alpha_data;
push @{$mappings->{$uniprot_id}}, {'dbid' => $dbid, 'ensid' => $stable_id, 'alpha' => $alpha_data};
$at_least_one_alpha_mapping = 1;
info("Mapping found for $stable_id: $alpha_data");
}
unless (scalar @rmap) {
info("No mapping data for stable ID $stable_id");
$no_dbid++;
info("No mapping data for stable ID $stable_id");
}
if (@rmap and ! $at_least_one_alpha_mapping) {
$no_alpha++;
info("No alphafold data for: stable_id $stable_id, translation_id $dbid, uniprot accessions: @rmap");
}
}
}
Expand All @@ -278,27 +313,23 @@ SQL
die(sprintf("No matches for species %s found in core DB %s\n", $species, $dbc->dbname()));
}

tie(%alpha_db, 'Tie::LevelDB', $idx_dir_al)
or die "Error trying to tie Tie::LevelDB $idx_dir_al: $!";

my $pfa = $core_dba->get_ProteinFeatureAdaptor();

my $good = 0;
my $no_alpha = 0;

info("Unique uniprot accessions for species after mapping: " . scalar (keys %$mappings));
info("Unique UniProt accessions for species after mapping: " . scalar (keys %$mappings));
for my $uniprot (keys %$mappings) {
for my $entry (@{$mappings->{$uniprot}}) {

my $uniparc = $entry->{'uniparc'};
my $ensid = $entry->{'ensid'};
my $translation_id = $entry->{'dbid'};
my $alpha_data = $alpha_db{$uniprot};
my $alpha_data = $entry->{'alpha'};

unless ($alpha_data) {
$no_alpha++;
info("No alphafold data for: $uniprot, $translation_id");
next;
die("There is an entry in the mappings hash, but alphafold data is missing for entry: " .
"uniprot $uniprot, stable_id $ensid, translation_id $translation_id");
}
$good++;

Expand All @@ -308,9 +339,10 @@ SQL

my $comment = 'Mapped ';
if ($uniparc) {
$comment .= "direct from UniParc $uniparc to UniProt $uniprot, Ensembl stable ID $ensid";
$comment .= "direct from UniParc $uniparc to UniProt $uniprot, " .
"Ensembl stable ID $ensid, alpha_version $alpha_version)";
} else {
$comment .= "using GIFTS DB (UniProt $uniprot, Ensembl stable ID $ensid)";
$comment .= "using GIFTS DB (UniProt $uniprot, Ensembl stable ID $ensid), alpha_version $alpha_version)";
}

info("Protein feature: start $al_start, end $al_end, $alpha_accession: $comment");
Expand All @@ -331,40 +363,64 @@ SQL
}
}

untie(%alpha_db);

dbunlock();
$alpha_db->close() or die "Error closing DB: " . $alpha_db->error();

# Info line to be stored in the hive DB
$self->warning("Inserted $good OK. Num of proteins for species: $protein_count, no uniparc mapping: $no_uniparc, no uniprot mapping: $no_uniprot, no stable ID match: $no_dbid, no alphafold data: $no_alpha. Species: $log_species");
}

my $lock_fh;

sub dblock {
my $path = shift;

info("Waiting for dblock");
open($lock_fh, ">", "$path/dblock") or die "Failed to open or create lock file: $!";
flock ($lock_fh, LOCK_EX) or die "Unable to lock $path/dblock: $!";
info("Locked dblock OK");
my $covered_proteins = $protein_count - $no_dbid - $no_uniparc - $no_alpha;
$self->warning(
"Inserted $good protein features. Num of proteins for species: $protein_count " .
"with at least one mapping for $covered_proteins. ".
"No UniParc to UniProt mappings: $no_uniparc; No GIFTS mappings for stable ID: $no_dbid; " .
"No link from UniProt to Alphafold (for any UniProt accession found through UniParc): $no_alpha. " .
"Species: $log_species"
);
}

sub dbunlock {
flock $lock_fh, LOCK_UN;
close $lock_fh;
# Reads info about available species from GIFTS. Will use the latest release for
# which GIFTS data is available. This is probably older than the release the
# pipeline is running with. Since the species in GIFTS are unlikely to see
# updates in the Core DBs, this is likely not an issue.
# Returns an arrayref like: [[ensembl_species_history_id, species, assembly, ensembl_release], ...]
sub read_gifts_species {
my $dbh = shift;

my $sql = "
select ensembl_species_history_id, species, assembly_accession as assembly, ensembl_release as release
from ensembl_gifts.ensembl_species_history
where ensembl_release = (select max(ensembl_release) from ensembl_gifts.ensembl_species_history)
order by ensembl_species_history_id
";

my $sth = $dbh->prepare($sql);
$sth->execute();
return $sth->fetchall_arrayref();
}

# Read a GIFTS CSV dump file, return a hash-ref like:
# Read GIFTS data from DB, return a hash-ref like:
# { 'A0A0G2K0H5' => [ 'ENSRNOT00000083658' ], 'B5DEL8' => [ 'ENSRNOT00000036389' ], ...}
sub read_gifts_data {
my $path = shift;
open (my $fh, '<', $path) or die "GIFTS dump file '$path' not readable: $!";
my $dbh = shift;
my $ensembl_species_history_id = shift;

my $sql = "
select et.enst_id, ue.uniprot_acc
from ensembl_gifts.alignment al
inner join ensembl_gifts.alignment_run alr ON alr.alignment_run_id = al.alignment_run_id
inner join ensembl_gifts.release_mapping_history rmh ON rmh.release_mapping_history_id = alr.release_mapping_history_id
inner join ensembl_gifts.ensembl_species_history esh ON esh.ensembl_species_history_id = rmh.ensembl_species_history_id
inner join ensembl_gifts.ensembl_transcript et ON et.transcript_id = al.transcript_id
inner join ensembl_gifts.uniprot_entry ue ON ue.uniprot_id = al.uniprot_id
where score1_type = 'perfect_match'
and alr.ensembl_release = esh.ensembl_release
and rmh.ensembl_species_history_id = ?
";

my $sth = $dbh->prepare($sql);
$sth->execute($ensembl_species_history_id);

my %uniprot_ensid;
while (my $line = <$fh>) {
chomp $line;
(undef, undef, my $ensid, my $uniprot) = split ",", $line;
while (my $row = $sth->fetchrow_arrayref()) {
my ($ensid, $uniprot) = @$row;
# A protein may have isoforms. In Uniprot, their reference looks like
# e.g. F1RA39-1. Alphafold currently only has data for the canonical
# sequence. Here, we just remove the isoform reference and always refer
Expand All @@ -386,7 +442,10 @@ sub cleanup_protein_features {

if (defined($analysis)) {
my $analysis_id = $analysis->dbID();
info(sprintf("Found alphafold analysis (ID: $analysis_id) for species %s. Deleting it.\n", $self->param('species')));
info(sprintf(
"Found alphafold analysis (ID: $analysis_id) for species %s. Deleting it.\n", $self->param('species')
)
);

my $pfa = $core_dba->get_ProteinFeatureAdaptor();
$pfa->remove_by_analysis_id($analysis_id);
Expand Down
Loading

0 comments on commit 5cc46e6

Please sign in to comment.