Skip to content

Commit

Permalink
Remove ortholog and topup clustering modes
Browse files Browse the repository at this point in the history
  • Loading branch information
twalsh-ebi committed Oct 27, 2024
1 parent 66d7171 commit de123c3
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 244 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ sub default_options {
'mafft_runtime' => 7200,
'treebest_threshold_n_residues' => 10000,
'treebest_threshold_n_genes' => 400,
'update_threshold_trees' => 0.2,

# alignment filtering options
'threshold_n_genes' => 20,
Expand Down Expand Up @@ -984,7 +983,7 @@ sub core_pipeline_analyses {
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
-parameters => {
member_type => 'protein',
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung copy raxml_update filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree )],
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree)],
},
-flow_into => [ 'cluster_tagging_factory' ],
},
Expand Down
171 changes: 16 additions & 155 deletions modules/Bio/EnsEMBL/Compara/PipeConfig/ProteinTrees_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ sub default_options {
'examl_ptiles' => 16,
'treebest_threshold_n_residues' => 10000,
'treebest_threshold_n_genes' => 400,
'update_threshold_trees' => 0.2,

# sequence type used on the phylogenetic inferences
# It has to be set to 1 for the strains
Expand Down Expand Up @@ -236,10 +235,6 @@ sub default_options {
'loadmembers_capacity' => 30,
'HMMer_classifyPantherScore_capacity' => 1000,
'HMMer_search_capacity' => 8000,
'copy_trees_capacity' => 50,
'copy_alignments_capacity' => 50,
'mafft_update_capacity' => 50,
'raxml_update_capacity' => 50,
'ortho_stats_capacity' => 10,
'cafe_capacity' => 50,

Expand Down Expand Up @@ -297,10 +292,8 @@ sub default_options {
# How will the pipeline create clusters (families) ?
# Possible values: 'blastp' (default), 'hmm', 'hybrid'
# 'blastp' means that the pipeline will run an all-vs-all blastp comparison of the proteins and run hcluster to create clusters. This can take a *lot* of compute
# 'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
# 'hmm' means that the pipeline will run an HMM classification
# 'hybrid' is like "hmm" except that the unclustered proteins go to an all-vs-all blastp + hcluster stage
# 'topup' means that the HMM classification is reused from prev_rel_db, and topped-up with the updated / new species >> UNIMPLEMENTED <<
'clustering_mode' => 'hybrid',

# List of species some genes have been projected from
Expand Down Expand Up @@ -393,11 +386,11 @@ sub pipeline_checks_pre_init {

my %reuse_modes = (clusters => 1, members => 1);
die "'reuse_level' must be set to one of: ".join(", ", keys %reuse_modes) unless $self->o('reuse_level') and $reuse_modes{$self->o('reuse_level')};
my %clustering_modes = (blastp => 1, ortholog => 1, hmm => 1, hybrid => 1, topup => 1);
my %clustering_modes = (blastp => 1, hmm => 1, hybrid => 1);
die "'clustering_mode' must be set to one of: ".join(", ", keys %clustering_modes) unless $self->o('clustering_mode') and $clustering_modes{$self->o('clustering_mode')};

# In HMM mode the library must exist
if (($self->o('clustering_mode') ne 'blastp') and ($self->o('clustering_mode') ne 'ortholog')) {
if (($self->o('clustering_mode') eq 'hmm') or ($self->o('clustering_mode') eq 'hybrid')) {
my $lib = $self->o('hmm_library_basedir');
if ($self->o('hmm_library_version') == 2){
die "'$lib' does not seem to be a valid HMM library (Panther-style)\n" unless ((-d $lib) && (-d "$lib/books") && (-d "$lib/globals") && (-s "$lib/globals/con.Fasta"));
Expand Down Expand Up @@ -597,15 +590,6 @@ sub core_pipeline_analyses {
'output_clusterset_id' => $self->o('use_notung') ? 'raxml' : 'default',
'input_clusterset_id' => 'raxml_parsimony',
);
my %raxml_update_parameters = (
'raxml_pthread_exe_sse3' => $self->o('raxml_pthread_exe_sse3'),
'raxml_pthread_exe_avx' => $self->o('raxml_pthread_exe_avx'),
'raxml_exe_sse3' => $self->o('raxml_exe_sse3'),
'raxml_exe_avx' => $self->o('raxml_exe_avx'),
'treebest_exe' => $self->o('treebest_exe'),
'input_clusterset_id' => 'copy',
'output_clusterset_id' => 'raxml_update',
);

my %raxml_bl_parameters = (
'raxml_pthread_exe_sse3' => $self->o('raxml_pthread_exe_sse3'),
Expand Down Expand Up @@ -968,35 +952,6 @@ sub core_pipeline_analyses {
%hc_analysis_params,
},

{ -logic_name => 'copy_trees_from_previous_release',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB',
-parameters => {
'input_clusterset_id' => 'default',
'output_clusterset_id' => 'copy',
'branch_for_new_tree' => '3',
'branch_for_wiped_out_trees' => '4',
'branch_for_update_threshold_trees' => '5',
'update_threshold_trees' => $self->o('update_threshold_trees'),
},
-flow_into => {
1 => [ 'copy_alignments_from_previous_release' ],
3 => [ 'alignment_entry_point' ],
4 => [ 'alignment_entry_point' ],
5 => [ 'alignment_entry_point' ],
},
-hive_capacity => $self->o('copy_trees_capacity'),
-rc_name => '8Gb_job',
},

{ -logic_name => 'copy_alignments_from_previous_release',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::CopyAlignmentsFromDB',
-parameters => {
'input_clusterset_id' => 'default',
},
-flow_into => [ 'mafft_update' ],
-hive_capacity => $self->o('copy_alignments_capacity'),
-rc_name => '8Gb_job',
},
# ---------------------------------------------[reuse members]-----------------------------------------------------------------------

{ -logic_name => 'member_copy_factory',
Expand Down Expand Up @@ -1260,16 +1215,6 @@ sub core_pipeline_analyses {
}
},

{
-logic_name => 'flag_update_clusters',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::FlagUpdateClusters',
-parameters => {
'update_threshold_trees' => $self->o('update_threshold_trees'),
},
-rc_name => '16Gb_job',
},


# -------------------------------------------------[BuildHMMprofiles pipeline]-------------------------------------------------------

{ -logic_name => 'dump_unannotated_members',
Expand Down Expand Up @@ -1554,8 +1499,7 @@ sub core_pipeline_analyses {
-flow_into => {
'1->A' => WHEN(
'#clustering_mode# eq "blastp"' => 'prepare_blastdb',
'#clustering_mode# eq "ortholog"' => 'ortholog_cluster',
ELSE 'load_InterproAnnotation', # hmm, hybrid, topup
ELSE 'load_InterproAnnotation', # hmm, hybrid
),
'A->1' => [ 'expand_clusters_with_projections' ],
},
Expand All @@ -1572,15 +1516,6 @@ sub core_pipeline_analyses {
},
},

{ -logic_name => 'ortholog_cluster',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-parameters => {
'sort_clusters' => 1,
},
-rc_name => '4Gb_job',
-hive_capacity => $self->o('reuse_capacity'),
},

{ -logic_name => 'hcluster_dump_input_per_genome',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterPrepare',
-parameters => {
Expand Down Expand Up @@ -1667,7 +1602,7 @@ sub core_pipeline_analyses {
{ -logic_name => 'create_additional_clustersets',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
-parameters => {
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs copy raxml_update )],
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs)],
},
},

Expand Down Expand Up @@ -1713,9 +1648,6 @@ sub core_pipeline_analyses {
1 => [
'create_additional_clustersets',
'cluster_tagging_factory',
WHEN(
'#clustering_mode# eq "topup"' => 'flag_update_clusters',
),
],
},
},
Expand All @@ -1740,10 +1672,7 @@ sub core_pipeline_analyses {
'inputquery' => 'SELECT root_id AS gene_tree_id, COUNT(seq_member_id) AS tree_num_genes FROM gene_tree_root JOIN gene_tree_node USING (root_id) WHERE tree_type = "tree" AND clusterset_id="default" GROUP BY root_id',
},
-flow_into => {
'2->A' => WHEN(
'#clustering_mode# eq "topup"' => 'copy_trees_from_previous_release',
ELSE 'alignment_entry_point',
),
'2->A' => [ 'alignment_entry_point' ],
'1->A' => [ 'join_panther_subfam' ],
'A->1' => [ 'global_tree_processing' ],
},
Expand Down Expand Up @@ -1912,16 +1841,6 @@ sub core_pipeline_analyses {
},
},

{ -logic_name => 'mafft_update',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MafftUpdate',
-parameters => {
'mafft_exe' => $self->o('mafft_exe'),
},
-hive_capacity => $self->o('mafft_update_capacity'),
-rc_name => '2Gb_job',
-flow_into => [ 'raxml_update_decision' ],
},

{ -logic_name => 'mcoffee_himem',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MCoffee',
-parameters => {
Expand Down Expand Up @@ -1985,32 +1904,32 @@ sub core_pipeline_analyses {

{ -logic_name => 'exon_boundaries_prep',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
-parameters => {
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
},
-flow_into => {
-1 => 'exon_boundaries_prep_himem',
1 => WHEN(
'#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
ELSE 'aln_tagging',
),
1 => 'treebreak_decision',
},
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
},

{ -logic_name => 'exon_boundaries_prep_himem',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
-flow_into => [ 'treebreak_decision' ],
-rc_name => '2Gb_job',
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
},

{ -logic_name => 'treebreak_decision',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
-parameters => {
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
},
-flow_into => WHEN(
'#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
ELSE 'aln_tagging',
),
-rc_name => '2Gb_job',
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
%decision_analysis_params,
},

{ -logic_name => 'aln_tagging',
Expand Down Expand Up @@ -2708,64 +2627,6 @@ sub core_pipeline_analyses {
}
},

{ -logic_name => 'raxml_update_decision',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::LoadTags',
-parameters => {
'tags' => {
#The default value matches the default dataflow we want: _8_cores analysis.
'gene_count' => 0,
},
},
-flow_into => {
1 => WHEN(
'(#tree_gene_count# <= 500)' => 'raxml_update',
'(#tree_gene_count# > 500) && (#tree_gene_count# <= 1000)' => 'raxml_update_8',
'(#tree_gene_count# > 1000) && (#tree_gene_count# <= 2000)' => 'raxml_update_16',
'(#tree_gene_count# > 3000)' => 'raxml_update_32',
),
},
%decision_analysis_params,
},

{ -logic_name => 'raxml_update',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '8Gb_job',
},

{ -logic_name => 'raxml_update_8',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 8,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '16Gb_8c_job',
},

{ -logic_name => 'raxml_update_16',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 16,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '16Gb_16c_job',
},

{ -logic_name => 'raxml_update_32',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 32,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '32Gb_32c_job',
},

{ -logic_name => 'treebest_small_families',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML',
-parameters => {
Expand Down
18 changes: 2 additions & 16 deletions modules/Bio/EnsEMBL/Compara/PipeConfig/ncRNAtrees_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@ sub default_options {

# How will the pipeline create clusters (families) ?
# Possible values: 'rfam' (default)
# 'blastp' means that the pipeline will clusters genes according to their RFAM accession
# 'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
# 'rfam' means that the pipeline will cluster genes according to their RFAM accession
'clustering_mode' => 'rfam',

'master_db' => 'compara_master',
Expand Down Expand Up @@ -592,10 +591,7 @@ sub core_pipeline_analyses {
'type' => 'infernal',
'skip_consensus' => 1,
},
-flow_into => WHEN(
'#clustering_mode# eq "ortholog"' => 'ortholog_cluster',
ELSE 'rfam_classify',
),
-flow_into => [ 'rfam_classify' ],
},

# ---------------------------------------------[run RFAM classification]--------------------------------------------------------------
Expand Down Expand Up @@ -637,16 +633,6 @@ sub core_pipeline_analyses {
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::PerGenomeGroupsetQC',
},

{ -logic_name => 'ortholog_cluster',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-parameters => {
'sort_clusters' => 1,
'add_model_id' => 1,
},
-rc_name => '2Gb_job',
-flow_into => 'expand_clusters_with_projections',
},

{ -logic_name => 'expand_clusters_with_projections',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::ExpandClustersWithProjections',
-flow_into => [ 'cluster_qc_factory' ],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,9 @@ limitations under the License.
Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB
=head1 DESCRIPTION
=head1 DEPRECATION NOTICE
1) Used to copy all the trees from a previous database.
2) It identifies the genes in compara_dba that have been updated, deleted or added
when compared to the reuse_compara_dba.
3) It disavows all the genes that were flagged by FlagUpdateClusters.pm and stored
in gene_tree_root_tag as "updated_genes_list", "added_genes_list" and "deleted_genes_list"
This runnable is deprecated, and may be removed in a future release.
=cut

Expand All @@ -44,6 +38,8 @@ use base ('Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::StoreTree');
sub fetch_input {
my $self = shift @_;

$self->warning("RunnableDB::GeneTrees::CopyTreesFromDB is deprecated, and may be removed in a future release");

#get compara_dba adaptor
$self->param( 'compara_dba', $self->compara_dba );

Expand Down
Loading

0 comments on commit de123c3

Please sign in to comment.