Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove ortholog and topup clustering modes #873

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ sub default_options {
'mafft_runtime' => 7200,
'treebest_threshold_n_residues' => 10000,
'treebest_threshold_n_genes' => 400,
'update_threshold_trees' => 0.2,

# alignment filtering options
'threshold_n_genes' => 20,
Expand Down Expand Up @@ -984,7 +983,7 @@ sub core_pipeline_analyses {
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
-parameters => {
member_type => 'protein',
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung copy raxml_update filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree )],
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung filter_level_1 filter_level_2 filter_level_3 filter_level_4 fasttree)],
},
-flow_into => [ 'cluster_tagging_factory' ],
},
Expand Down
171 changes: 16 additions & 155 deletions modules/Bio/EnsEMBL/Compara/PipeConfig/ProteinTrees_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ sub default_options {
'examl_ptiles' => 16,
'treebest_threshold_n_residues' => 10000,
'treebest_threshold_n_genes' => 400,
'update_threshold_trees' => 0.2,

# sequence type used on the phylogenetic inferences
# It has to be set to 1 for the strains
Expand Down Expand Up @@ -236,10 +235,6 @@ sub default_options {
'loadmembers_capacity' => 30,
'HMMer_classifyPantherScore_capacity' => 1000,
'HMMer_search_capacity' => 8000,
'copy_trees_capacity' => 50,
'copy_alignments_capacity' => 50,
'mafft_update_capacity' => 50,
'raxml_update_capacity' => 50,
'ortho_stats_capacity' => 10,
'cafe_capacity' => 50,

Expand Down Expand Up @@ -297,10 +292,8 @@ sub default_options {
# How will the pipeline create clusters (families) ?
# Possible values: 'blastp', 'hmm', 'hybrid' (default)
# 'blastp' means that the pipeline will run an all-vs-all blastp comparison of the proteins and run hcluster to create clusters. This can take a *lot* of compute
# 'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
# 'hmm' means that the pipeline will run an HMM classification
# 'hybrid' is like "hmm" except that the unclustered proteins go to an all-vs-all blastp + hcluster stage
# 'topup' means that the HMM classification is reused from prev_rel_db, and topped-up with the updated / new species >> UNIMPLEMENTED <<
'clustering_mode' => 'hybrid',

# List of species some genes have been projected from
Expand Down Expand Up @@ -393,11 +386,11 @@ sub pipeline_checks_pre_init {

my %reuse_modes = (clusters => 1, members => 1);
die "'reuse_level' must be set to one of: ".join(", ", keys %reuse_modes) unless $self->o('reuse_level') and $reuse_modes{$self->o('reuse_level')};
my %clustering_modes = (blastp => 1, ortholog => 1, hmm => 1, hybrid => 1, topup => 1);
my %clustering_modes = (blastp => 1, hmm => 1, hybrid => 1);
die "'clustering_mode' must be set to one of: ".join(", ", keys %clustering_modes) unless $self->o('clustering_mode') and $clustering_modes{$self->o('clustering_mode')};

# In HMM mode the library must exist
if (($self->o('clustering_mode') ne 'blastp') and ($self->o('clustering_mode') ne 'ortholog')) {
if (($self->o('clustering_mode') eq 'hmm') or ($self->o('clustering_mode') eq 'hybrid')) {
my $lib = $self->o('hmm_library_basedir');
if ($self->o('hmm_library_version') == 2){
die "'$lib' does not seem to be a valid HMM library (Panther-style)\n" unless ((-d $lib) && (-d "$lib/books") && (-d "$lib/globals") && (-s "$lib/globals/con.Fasta"));
Expand Down Expand Up @@ -597,15 +590,6 @@ sub core_pipeline_analyses {
'output_clusterset_id' => $self->o('use_notung') ? 'raxml' : 'default',
'input_clusterset_id' => 'raxml_parsimony',
);
my %raxml_update_parameters = (
'raxml_pthread_exe_sse3' => $self->o('raxml_pthread_exe_sse3'),
'raxml_pthread_exe_avx' => $self->o('raxml_pthread_exe_avx'),
'raxml_exe_sse3' => $self->o('raxml_exe_sse3'),
'raxml_exe_avx' => $self->o('raxml_exe_avx'),
'treebest_exe' => $self->o('treebest_exe'),
'input_clusterset_id' => 'copy',
'output_clusterset_id' => 'raxml_update',
);

my %raxml_bl_parameters = (
'raxml_pthread_exe_sse3' => $self->o('raxml_pthread_exe_sse3'),
Expand Down Expand Up @@ -999,35 +983,6 @@ sub core_pipeline_analyses {
%hc_analysis_params,
},

{ -logic_name => 'copy_trees_from_previous_release',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB',
-parameters => {
'input_clusterset_id' => 'default',
'output_clusterset_id' => 'copy',
'branch_for_new_tree' => '3',
'branch_for_wiped_out_trees' => '4',
'branch_for_update_threshold_trees' => '5',
'update_threshold_trees' => $self->o('update_threshold_trees'),
},
-flow_into => {
1 => [ 'copy_alignments_from_previous_release' ],
3 => [ 'alignment_entry_point' ],
4 => [ 'alignment_entry_point' ],
5 => [ 'alignment_entry_point' ],
},
-hive_capacity => $self->o('copy_trees_capacity'),
-rc_name => '8Gb_job',
},

{ -logic_name => 'copy_alignments_from_previous_release',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::CopyAlignmentsFromDB',
-parameters => {
'input_clusterset_id' => 'default',
},
-flow_into => [ 'mafft_update' ],
-hive_capacity => $self->o('copy_alignments_capacity'),
-rc_name => '8Gb_job',
},
# ---------------------------------------------[reuse members]-----------------------------------------------------------------------

{ -logic_name => 'member_copy_factory',
Expand Down Expand Up @@ -1299,16 +1254,6 @@ sub core_pipeline_analyses {
}
},

{
-logic_name => 'flag_update_clusters',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::FlagUpdateClusters',
-parameters => {
'update_threshold_trees' => $self->o('update_threshold_trees'),
},
-rc_name => '16Gb_job',
},


# -------------------------------------------------[BuildHMMprofiles pipeline]-------------------------------------------------------

{ -logic_name => 'dump_unannotated_members',
Expand Down Expand Up @@ -1595,8 +1540,7 @@ sub core_pipeline_analyses {
-flow_into => {
'1->A' => WHEN(
'#clustering_mode# eq "blastp"' => 'prepare_blastdb',
'#clustering_mode# eq "ortholog"' => 'ortholog_cluster',
ELSE 'load_InterproAnnotation', # hmm, hybrid, topup
ELSE 'load_InterproAnnotation', # hmm, hybrid
),
'A->1' => [ 'clustering_funnel_check' ],
},
Expand All @@ -1613,15 +1557,6 @@ sub core_pipeline_analyses {
},
},

{ -logic_name => 'ortholog_cluster',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-parameters => {
'sort_clusters' => 1,
},
-rc_name => '4Gb_job',
-hive_capacity => $self->o('reuse_capacity'),
},

{ -logic_name => 'hcluster_dump_input_per_genome',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterPrepare',
-parameters => {
Expand Down Expand Up @@ -1714,7 +1649,7 @@ sub core_pipeline_analyses {
{ -logic_name => 'create_additional_clustersets',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CreateClustersets',
-parameters => {
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs copy raxml_update )],
'additional_clustersets' => [qw(treebest phyml-aa phyml-nt nj-dn nj-ds nj-mm raxml raxml_parsimony raxml_bl notung treerecs)],
},
},

Expand Down Expand Up @@ -1766,9 +1701,6 @@ sub core_pipeline_analyses {
1 => [
'create_additional_clustersets',
'cluster_tagging_factory',
WHEN(
'#clustering_mode# eq "topup"' => 'flag_update_clusters',
),
],
},
},
Expand All @@ -1793,10 +1725,7 @@ sub core_pipeline_analyses {
'inputquery' => 'SELECT root_id AS gene_tree_id, COUNT(seq_member_id) AS tree_num_genes FROM gene_tree_root JOIN gene_tree_node USING (root_id) WHERE tree_type = "tree" AND clusterset_id="default" GROUP BY root_id',
},
-flow_into => {
'2->A' => WHEN(
'#clustering_mode# eq "topup"' => 'copy_trees_from_previous_release',
ELSE 'alignment_entry_point',
),
'2->A' => [ 'alignment_entry_point' ],
'1->A' => [ 'join_panther_subfam' ],
'A->1' => [ 'global_tree_processing' ],
},
Expand Down Expand Up @@ -1965,16 +1894,6 @@ sub core_pipeline_analyses {
},
},

{ -logic_name => 'mafft_update',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MafftUpdate',
-parameters => {
'mafft_exe' => $self->o('mafft_exe'),
},
-hive_capacity => $self->o('mafft_update_capacity'),
-rc_name => '2Gb_job',
-flow_into => [ 'raxml_update_decision' ],
},

{ -logic_name => 'mcoffee_himem',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::MCoffee',
-parameters => {
Expand Down Expand Up @@ -2038,32 +1957,32 @@ sub core_pipeline_analyses {

{ -logic_name => 'exon_boundaries_prep',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
-parameters => {
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
},
-flow_into => {
-1 => 'exon_boundaries_prep_himem',
1 => WHEN(
'#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
ELSE 'aln_tagging',
),
1 => 'treebreak_decision',
},
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
},

{ -logic_name => 'exon_boundaries_prep_himem',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ObjectStore::GeneTreeAlnExonBoundaries',
-flow_into => [ 'treebreak_decision' ],
-rc_name => '2Gb_job',
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
},

{ -logic_name => 'treebreak_decision',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
-parameters => {
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
'treebreak_gene_count' => $self->o('treebreak_gene_count'),
},
-flow_into => WHEN(
'#use_quick_tree_break# and (#tree_num_genes# > #treebreak_gene_count#)' => 'quick_tree_break',
ELSE 'aln_tagging',
),
-rc_name => '2Gb_job',
-hive_capacity => $self->o('split_genes_capacity'),
-batch_size => 20,
%decision_analysis_params,
},

{ -logic_name => 'aln_tagging',
Expand Down Expand Up @@ -2767,64 +2686,6 @@ sub core_pipeline_analyses {
}
},

{ -logic_name => 'raxml_update_decision',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::LoadTags',
-parameters => {
'tags' => {
#The default value matches the default dataflow we want: _8_cores analysis.
'gene_count' => 0,
},
},
-flow_into => {
1 => WHEN(
'(#tree_gene_count# <= 500)' => 'raxml_update',
'(#tree_gene_count# > 500) && (#tree_gene_count# <= 1000)' => 'raxml_update_8',
'(#tree_gene_count# > 1000) && (#tree_gene_count# <= 2000)' => 'raxml_update_16',
'(#tree_gene_count# > 3000)' => 'raxml_update_32',
),
},
%decision_analysis_params,
},

{ -logic_name => 'raxml_update',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '8Gb_job',
},

{ -logic_name => 'raxml_update_8',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 8,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '16Gb_8c_job',
},

{ -logic_name => 'raxml_update_16',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 16,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '16Gb_16c_job',
},

{ -logic_name => 'raxml_update_32',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::RAxML_update',
-parameters => {
%raxml_update_parameters,
'raxml_number_of_cores' => 32,
},
-hive_capacity => $self->o('raxml_update_capacity'),
-rc_name => '32Gb_32c_job',
},

{ -logic_name => 'treebest_small_families',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::NJTREE_PHYML',
-parameters => {
Expand Down
18 changes: 2 additions & 16 deletions modules/Bio/EnsEMBL/Compara/PipeConfig/ncRNAtrees_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@ sub default_options {

# How will the pipeline create clusters (families) ?
# Possible values: 'rfam' (default)
# 'blastp' means that the pipeline will clusters genes according to their RFAM accession
# 'ortholog' means that the pipeline will use previously inferred orthologs to perform a cluster projection
# 'rfam' means that the pipeline will cluster genes according to their RFAM accession
'clustering_mode' => 'rfam',

'master_db' => 'compara_master',
Expand Down Expand Up @@ -653,10 +652,7 @@ sub core_pipeline_analyses {
'type' => 'infernal',
'skip_consensus' => 1,
},
-flow_into => WHEN(
'#clustering_mode# eq "ortholog"' => 'ortholog_cluster',
ELSE 'rfam_classify',
),
-flow_into => [ 'rfam_classify' ],
},

# ---------------------------------------------[run RFAM classification]--------------------------------------------------------------
Expand Down Expand Up @@ -704,16 +700,6 @@ sub core_pipeline_analyses {
%hc_params,
},

{ -logic_name => 'ortholog_cluster',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::OrthologClusters',
-parameters => {
'sort_clusters' => 1,
'add_model_id' => 1,
},
-rc_name => '2Gb_job',
-flow_into => 'expand_clusters_with_projections',
},

{ -logic_name => 'expand_clusters_with_projections',
-module => 'Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::ExpandClustersWithProjections',
-flow_into => [ 'cluster_qc_factory' ],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,9 @@ limitations under the License.

Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::CopyTreesFromDB

=head1 DESCRIPTION
=head1 DEPRECATION NOTICE

1) Used to copy all the trees from a previous database.

2) It identifies the genes in compara_dba that have been updated, deleted or added
when compared to the reuse_compara_dba.

3) It disavows all the genes that were flagged by FlagUpdateClusters.pm and stored
in gene_tree_root_tag as "updated_genes_list", "added_genes_list" and "deleted_genes_list"
This runnable is deprecated, and may be removed in a future release.

=cut

Expand All @@ -44,6 +38,8 @@ use base ('Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::StoreTree');
sub fetch_input {
my $self = shift @_;

$self->warning("RunnableDB::GeneTrees::CopyTreesFromDB is deprecated, and may be removed in a future release");

#get compara_dba adaptor
$self->param( 'compara_dba', $self->compara_dba );

Expand Down
Loading