Remove ortholog and topup clustering modes

Ensembl · Oct 27, 2024 · de123c3 · de123c3
1 parent 66d7171
commit de123c3
Show file tree

Hide file tree

Showing 10 changed files with 52 additions and 244 deletions.
diff --git a/modules/Bio/EnsEMBL/Compara/PipeConfig/CreateHmmProfiles_conf.pm b/modules/Bio/EnsEMBL/Compara/PipeConfig/CreateHmmProfiles_conf.pm
@@ -115,7 +115,6 @@ sub default_options {
         'mafft_runtime'             => 7200,
         'treebest_threshold_n_residues' => 10000,
         'treebest_threshold_n_genes'    => 400,
-        'update_threshold_trees'    => 0.2,
 
     # alignment filtering options
         'threshold_n_genes'       => 20,
@@ -984,7 +983,7 @@ sub core_pipeline_analyses {
             -module             => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
             -parameters         => {
                 member_type     => 'protein',
-                'additional_clustersets'    => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung copy raxml_update filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree )],
+                'additional_clustersets'    => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree)],
             },
             -flow_into          => [ 'cluster_tagging_factory' ],
         },

diff --git a/modules/Bio/EnsEMBL/Compara/PipeConfig/ProteinTrees_conf.pm b/modules/Bio/EnsEMBL/Compara/PipeConfig/ProteinTrees_conf.pm
@@ -140,7 +140,6 @@ sub default_options {
         'examl_ptiles'            => 16,
         'treebest_threshold_n_residues' => 10000,
         'treebest_threshold_n_genes'    => 400,
-        'update_threshold_trees'    => 0.2,
 
     # sequence type used on the phylogenetic inferences
     # It has to be set to 1 for the strains
@@ -236,10 +235,6 @@ sub default_options {
         'loadmembers_capacity'      =>  30,
         'HMMer_classifyPantherScore_capacity'   => 1000,
         'HMMer_search_capacity'     => 8000,
-        'copy_trees_capacity'       => 50,
-        'copy_alignments_capacity'  => 50,
-        'mafft_update_capacity'     => 50,
-        'raxml_update_capacity'     => 50,
         'ortho_stats_capacity'      => 10,
         'cafe_capacity'             => 50,
 
@@ -297,10 +292,8 @@ sub default_options {
         # How will the pipeline create clusters (families) ?
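-        # Possible values: 'blastp' (default), 'hmm', 'hybrid'
-        #   'blastp' means that the pipeline will run a all-vs-all blastp comparison of the proteins and run hcluster to create clusters. This can take a *lot* of compute
+        # Possible values: 'blastp', 'hmm', 'hybrid' (default)
+        #   'blastp' means that the pipeline will run an all-vs-all blastp comparison of the proteins and run hcluster to create clusters. This can take a *lot* of compute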
-        #   'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
         #   'hmm' means that the pipeline will run an HMM classification
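-        #   'hybrid' is like "hmm" except that the unclustered proteins go to a all-vs-all blastp + hcluster stage
+        #   'hybrid' is like "hmm" except that the unclustered proteins go to an all-vs-all blastp + hcluster stage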
-        #   'topup' means that the HMM classification is reused from prev_rel_db, and topped-up with the updated / new species  >> UNIMPLEMENTED <<
         'clustering_mode'           => 'hybrid',
 
         # List of species some genes have been projected from
@@ -393,11 +386,11 @@ sub pipeline_checks_pre_init {
 
     my %reuse_modes = (clusters => 1, members => 1);
     die "'reuse_level' must be set to one of: ".join(", ", keys %reuse_modes) unless $self->o('reuse_level') and $reuse_modes{$self->o('reuse_level')};
-    my %clustering_modes = (blastp => 1, ortholog => 1, hmm => 1, hybrid => 1, topup => 1);
+    my %clustering_modes = (blastp => 1, hmm => 1, hybrid => 1);
     die "'clustering_mode' must be set to one of: ".join(", ", keys %clustering_modes) unless $self->o('clustering_mode') and $clustering_modes{$self->o('clustering_mode')};
 
     # In HMM mode the library must exist
-    if (($self->o('clustering_mode') ne 'blastp') and ($self->o('clustering_mode') ne 'ortholog')) {
+    if (($self->o('clustering_mode') eq 'hmm') or ($self->o('clustering_mode') eq 'hybrid')) {
         my $lib = $self->o('hmm_library_basedir');
             if ($self->o('hmm_library_version') == 2){
                 die "'$lib' does not seem to be a valid HMM library (Panther-style)\n" unless ((-d $lib) && (-d "$lib/books") && (-d "$lib/globals") && (-s "$lib/globals/con.Fasta"));
@@ -597,15 +590,6 @@ sub core_pipeline_analyses {
         'output_clusterset_id'      => $self->o('use_notung') ? 'raxml' : 'default',
         'input_clusterset_id'       => 'raxml_parsimony',
     );
-    my %raxml_update_parameters = (
-        'raxml_pthread_exe_sse3'    => $self->o('raxml_pthread_exe_sse3'),
-        'raxml_pthread_exe_avx'     => $self->o('raxml_pthread_exe_avx'),
-        'raxml_exe_sse3'            => $self->o('raxml_exe_sse3'),
-        'raxml_exe_avx'             => $self->o('raxml_exe_avx'),
-        'treebest_exe'              => $self->o('treebest_exe'),
-		'input_clusterset_id'	    => 'copy',
-        'output_clusterset_id'      => 'raxml_update',
-    );
 
     my %raxml_bl_parameters = (
         'raxml_pthread_exe_sse3'    => $self->o('raxml_pthread_exe_sse3'),
@@ -968,35 +952,6 @@ sub core_pipeline_analyses {
             %hc_analysis_params,
         },
 
-        {   -logic_name => 'copy_trees_from_previous_release',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB',
-            -parameters => {
-                'input_clusterset_id'               => 'default',
-                'output_clusterset_id'              => 'copy',
-                'branch_for_new_tree'               => '3',
-                'branch_for_wiped_out_trees'        => '4',
-                'branch_for_update_threshold_trees' => '5',
-                'update_threshold_trees'            => $self->o('update_threshold_trees'),
-            },
-            -flow_into  => {
-                 1 => [ 'copy_alignments_from_previous_release' ],
-                 3 => [ 'alignment_entry_point' ],
-                 4 => [ 'alignment_entry_point' ],
-                 5 => [ 'alignment_entry_point' ],
-            },
-            -hive_capacity        => $self->o('copy_trees_capacity'),
-            -rc_name => '8Gb_job',
-        },
-
-        {   -logic_name => 'copy_alignments_from_previous_release',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::CopyAlignmentsFromDB',
-            -parameters => {
-                'input_clusterset_id'   => 'default',
-            },
-            -flow_into  			=> [ 'mafft_update' ],
-            -hive_capacity          => $self->o('copy_alignments_capacity'),
-            -rc_name => '8Gb_job',
-        },
 # ---------------------------------------------[reuse members]-----------------------------------------------------------------------
 
         {   -logic_name => 'member_copy_factory',
@@ -1260,16 +1215,6 @@ sub core_pipeline_analyses {
                 }
             },
 
-        {
-            -logic_name     => 'flag_update_clusters',
-            -module         => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::FlagUpdateClusters',
-			-parameters     => {
-                'update_threshold_trees' => $self->o('update_threshold_trees'),
-			},
-            -rc_name => '16Gb_job',
-        },
-
-
 # -------------------------------------------------[BuildHMMprofiles pipeline]-------------------------------------------------------
 
         {   -logic_name => 'dump_unannotated_members',
@@ -1554,8 +1499,7 @@ sub core_pipeline_analyses {
             -flow_into => {
                 '1->A' => WHEN(
                     '#clustering_mode# eq "blastp"'     => 'prepare_blastdb',
-                    '#clustering_mode# eq "ortholog"'   => 'ortholog_cluster',
-                    ELSE                                   'load_InterproAnnotation',   # hmm, hybrid, topup
+                    ELSE                                   'load_InterproAnnotation',   # hmm, hybrid
                 ),
                 'A->1' => [ 'expand_clusters_with_projections' ],
             },
@@ -1572,15 +1516,6 @@ sub core_pipeline_analyses {
             },
         },
 
-        {   -logic_name => 'ortholog_cluster',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-            -parameters => {
-                'sort_clusters'         => 1,
-            },
-            -rc_name    => '4Gb_job',
-            -hive_capacity => $self->o('reuse_capacity'),
-        },
-
         {   -logic_name => 'hcluster_dump_input_per_genome',
             -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterPrepare',
             -parameters => {
@@ -1667,7 +1602,7 @@ sub core_pipeline_analyses {
         {   -logic_name         => 'create_additional_clustersets',
             -module             => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
             -parameters         => {
-                'additional_clustersets'    => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs copy raxml_update )],
+                'additional_clustersets'    => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs)],
             },
         },
 
@@ -1713,9 +1648,6 @@ sub core_pipeline_analyses {
                 1 => [
                     'create_additional_clustersets',
                     'cluster_tagging_factory',
-                    WHEN(
-                        '#clustering_mode# eq "topup"' => 'flag_update_clusters',
-                    ),
                 ],
             },
         },
@@ -1740,10 +1672,7 @@ sub core_pipeline_analyses {
                 'inputquery'        => 'SELECT root_id AS gene_tree_id, COUNT(seq_member_id) AS tree_num_genes FROM gene_tree_root JOIN gene_tree_node USING (root_id) WHERE tree_type = "tree" AND clusterset_id="default" GROUP BY root_id',
             },
             -flow_into  => {
-                '2->A'  => WHEN(
-                    '#clustering_mode# eq "topup"' => 'copy_trees_from_previous_release',
-                    ELSE 'alignment_entry_point',
-                ),
+                '2->A' => [ 'alignment_entry_point' ],
                 '1->A' => [ 'join_panther_subfam' ],
                 'A->1' => [ 'global_tree_processing' ],
             },
@@ -1912,16 +1841,6 @@ sub core_pipeline_analyses {
             },
         },
 
-        {   -logic_name => 'mafft_update',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MafftUpdate',
-            -parameters => {
-                'mafft_exe'                  => $self->o('mafft_exe'),
-            },
-            -hive_capacity        => $self->o('mafft_update_capacity'),
-            -rc_name    => '2Gb_job',
-            -flow_into      => [ 'raxml_update_decision' ],
-        },
-
         {   -logic_name => 'mcoffee_himem',
             -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MCoffee',
             -parameters => {
@@ -1985,32 +1904,32 @@ sub core_pipeline_analyses {
 
         {   -logic_name     => 'exon_boundaries_prep',
             -module         => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
-            -parameters => {
-                'treebreak_gene_count'      => $self->o('treebreak_gene_count'),
-            },
             -flow_into      => {
                 -1 => 'exon_boundaries_prep_himem',
-                1 => WHEN(
-                    '#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
-                    ELSE 'aln_tagging',
-                ),
+                1 => 'treebreak_decision',
             },
             -hive_capacity  => $self->o('split_genes_capacity'),
             -batch_size     => 20,
         },
 
         {   -logic_name     => 'exon_boundaries_prep_himem',
             -module         => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
+            -flow_into      => [ 'treebreak_decision' ],
+            -rc_name        => '2Gb_job',
+            -hive_capacity  => $self->o('split_genes_capacity'),
+            -batch_size     => 20,
+        },
+
+        {   -logic_name => 'treebreak_decision',
+            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
             -parameters => {
-                'treebreak_gene_count'      => $self->o('treebreak_gene_count'),
+                'treebreak_gene_count' => $self->o('treebreak_gene_count'),
             },
             -flow_into      => WHEN(
                 '#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
                 ELSE 'aln_tagging',
             ),
-            -rc_name    => '2Gb_job',
-            -hive_capacity  => $self->o('split_genes_capacity'),
-            -batch_size     => 20,
+            %decision_analysis_params,
         },
 
         {   -logic_name     => 'aln_tagging',
@@ -2708,64 +2627,6 @@ sub core_pipeline_analyses {
             }
         },
 
-        {   -logic_name => 'raxml_update_decision',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::LoadTags',
-            -parameters => {
-                'tags'  => {
-                    #The default value matches the default dataflow we want: _8_cores analysis.
-                    'gene_count'          => 0,
-                },
-            },
-            -flow_into  => {
-                1 => WHEN(
-                    '(#tree_gene_count# <= 500)'                                => 'raxml_update',
-                    '(#tree_gene_count# > 500)  && (#tree_gene_count# <= 1000)' => 'raxml_update_8',
-                    '(#tree_gene_count# > 1000) && (#tree_gene_count# <= 2000)' => 'raxml_update_16',
-                    '(#tree_gene_count# > 3000)'                                => 'raxml_update_32',
-                ),
-            },
-            %decision_analysis_params,
-        },
-
-        {   -logic_name => 'raxml_update',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-            -parameters => {
-                %raxml_update_parameters,
-            },
-            -hive_capacity        => $self->o('raxml_update_capacity'),
-            -rc_name    => '8Gb_job',
-        },
-
-        {   -logic_name => 'raxml_update_8',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-            -parameters => {
-                %raxml_update_parameters,
-                'raxml_number_of_cores'     => 8,
-            },
-            -hive_capacity        => $self->o('raxml_update_capacity'),
-            -rc_name 	=> '16Gb_8c_job',
-        },
-
-        {   -logic_name => 'raxml_update_16',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-            -parameters => {
-                %raxml_update_parameters,
-                'raxml_number_of_cores'     => 16,
-            },
-            -hive_capacity        => $self->o('raxml_update_capacity'),
-            -rc_name    => '16Gb_16c_job',
-        },
-
-        {   -logic_name => 'raxml_update_32',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-            -parameters => {
-                %raxml_update_parameters,
-                'raxml_number_of_cores'     => 32,
-            },
-            -hive_capacity        => $self->o('raxml_update_capacity'),
-            -rc_name    => '32Gb_32c_job',
-        },
-
         {   -logic_name => 'treebest_small_families',
             -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML',
             -parameters => {

diff --git a/modules/Bio/EnsEMBL/Compara/PipeConfig/ncRNAtrees_conf.pm b/modules/Bio/EnsEMBL/Compara/PipeConfig/ncRNAtrees_conf.pm
@@ -64,8 +64,7 @@ sub default_options {
 
             # How will the pipeline create clusters (families) ?
-            # Possible values: 'rfam' (default) or 'ortholog'
+            # Possible values: 'rfam' (default)
-            #   'blastp' means that the pipeline will clusters genes according to their RFAM accession
-            #   'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
+            #   'rfam' means that the pipeline will cluster genes according to their RFAM accession
             'clustering_mode'           => 'rfam',
 
         'master_db'   => 'compara_master',
@@ -592,10 +591,7 @@ sub core_pipeline_analyses {
                                'type'              => 'infernal',
                                'skip_consensus'    => 1,
                               },
-            -flow_into     => WHEN(
-                                   '#clustering_mode# eq "ortholog"' => 'ortholog_cluster',
-                                   ELSE 'rfam_classify',
-                               ),
+            -flow_into     => [ 'rfam_classify' ],
         },
 
 # ---------------------------------------------[run RFAM classification]--------------------------------------------------------------
@@ -637,16 +633,6 @@ sub core_pipeline_analyses {
                 -module     => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::PerGenomeGroupsetQC',
             },
 
-        {   -logic_name => 'ortholog_cluster',
-            -module     => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-            -parameters => {
-                'sort_clusters'         => 1,
-                'add_model_id'          => 1,
-            },
-            -rc_name    => '2Gb_job',
-            -flow_into  => 'expand_clusters_with_projections',
-        },
-
         {   -logic_name         => 'expand_clusters_with_projections',
             -module             => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::ExpandClustersWithProjections',
             -flow_into          => [ 'cluster_qc_factory' ],

diff --git a/modules/Bio/EnsEMBL/Compara/RunnableDB/GeneTrees/CopyTreesFromDB.pm b/modules/Bio/EnsEMBL/Compara/RunnableDB/GeneTrees/CopyTreesFromDB.pm
@@ -21,15 +21,9 @@ limitations under the License.
 
 Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB
 
-=head1 DESCRIPTION
+=head1 DEPRECATION NOTICE
 
-1) Used to copy all the trees from a previous database.
-
-2) It identifies the genes in compara_dba that have been updated, deleted or added
-when compared to the reuse_compara_dba.
-
-3) It disavows all the genes that were flagged by FlagUpdateClusters.pm and stored
-in gene_tree_root_tag as "updated_genes_list", "added_genes_list" and "deleted_genes_list"
+This runnable is deprecated, and may be removed in a future release.
 
 =cut
 
@@ -44,6 +38,8 @@ use base ('Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::StoreTree');
 sub fetch_input {
     my $self = shift @_;
 
+        $self->warning("RunnableDB::GeneTrees::CopyTreesFromDB is deprecated, and may be removed in a future release");
+
         #get compara_dba adaptor
         $self->param( 'compara_dba', $self->compara_dba );