Skip to content

Commit

Permalink
add panaroo tool
Browse files Browse the repository at this point in the history
  • Loading branch information
mthang committed Jul 30, 2024
1 parent c42b6e3 commit 5eefab3
Show file tree
Hide file tree
Showing 18 changed files with 90,924 additions and 0 deletions.
10 changes: 10 additions & 0 deletions tools/panaroo/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: panaroo
owner: galaxy-australia
categories:
- Pangenome
description: A Bacterial Pangenome Analysis Pipeline
homepage_url: https://gthlab.au/panaroo/#/
long_description: |
a graph-based pangenome clustering tool that is able to account for many of the sources of error introduced during the annotation of prokaryotic genome assemblies.
remote_repository_url: https://github.com/usegalaxy-au/tools-au/tree/master/tools/panaroo
type: unrestricted
69 changes: 69 additions & 0 deletions tools/panaroo/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<macros>
<token name="@TOOL_VERSION@">1.5.0</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="edam_ontology">
<edam_topics>
<edam_topic>topic_0194</edam_topic>
</edam_topics>
</xml>
<xml name="biotools">
<xrefs>
<xref type="bio.tools">panaroo</xref>
</xrefs>
</xml>
<xml name="requirements">
<requirements>
<requirement type="package" version="@TOOL_VERSION@">panaroo</requirement>
<requirement type="package" version="170427">prank</requirement>
</requirements>
</xml>
<xml name="clean_mode">
<option value="strict">strict</option>
<option value="moderate">moderate</option>
<option value="sensitive">sensitive</option>
</xml>
<xml name="genetic_code">
<option value="1">1. Standard</option>
<option value="2">2. Vertebrate Mitochondrial</option>
<option value="3">3. Yeast Mitochondrial</option>
<option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
<option value="5">5. Invertebrate Mitochondrial</option>
<option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
<option value="9">9. Echinoderm Mitochondrial</option>
<option value="10">10. Euplotid Nuclear</option>
<option value="11" selected="True">11. Bacteria and Archaea</option>
<option value="12">12. Alternative Yeast Nuclear</option>
<option value="13">13. Ascidian Mitochondrial</option>
<option value="14">14. Flatworm Mitochondrial</option>
<option value="15">15. Blepharisma Macronuclear</option>
<option value="16">16. Chlorophycean Mitochondrial</option>
<option value="21">21. Trematode Mitochondrial</option>
<option value="22">22. Scenedesmus obliquus mitochondrial</option>
<option value="23">23. Thraustochytrium Mitochondrial</option>
<option value="24">24. Pterobranchia mitochondrial</option>
<option value="25">25. Candidate Division SR1 and Gracilibacteria Code</option>
<option value="26">26. Pachysolen tannophilus Nuclear Code</option>
<option value="27">27. Karyorelict Nuclear Code</option>
<option value="28">28. Condylostoma Nuclear Code</option>
<option value="29">29. Mesodinium Nuclear Code</option>
<option value="30">30. Peritrich Nuclear Code</option>
<option value="31">31. Blastocrithidia Nuclear Code</option>
<option value="33">33. Cephalodiscidae Mitochondrial UAA-Tyr Code</option>
</xml>
<xml name="refind_mode_option">
<option value="default" selected="True">default</option>
<option value="strict">strict</option>
<option value="off">off</option>
</xml>
<xml name="gene_alignment">
<option value="None" selected="True">None</option>
<option value="core">core</option>
<option value="pan">pan</option>
</xml>
<xml name="gene_aligner">
<option value="mafft" selected="True">mafft</option>
<option value="prank">prank</option>
<option value="clustal">clustal</option>
</xml>
</macros>
235 changes: 235 additions & 0 deletions tools/panaroo/panaroo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
<tool id="panaroo" name="Panaroo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>A Bacterial Pangenome Analysis Pipeline</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="edam_ontology"/>
<expand macro="biotools"/>
<expand macro="requirements"/>
<stdio>
<exit_code range="1:" />
<regex match="System..*Exception"
source="both"
level="fatal"
description="Error encountered" />
</stdio>
<command><![CDATA[
mkdir outdir &&
#import re
#set input_directory = 'input_directory'
mkdir $input_directory &&
#for $gff in $gff_input_collection:
#set identifier = re.sub('[^\s\w\-\\.]','_',str($gff.element_identifier))
ln -fs '$gff' '$input_directory/$identifier' &&
#end for
panaroo
-t \${GALAXY_SLOTS:-2}
#if str($gen_code) != 'None':
--codon-table $gen_code
#end if
#if str($advanced.adv_options_selector) == "set":
#if $advanced.remove_invalid_gene
$advanced.remove_invalid_gene
#end if
-c '$advanced.matching_option.seq_threshold'
-f '$advanced.matching_option.peptide_threshold'
--len_dif_percent '$advanced.matching_option.length_diff_cutoff'
$advanced.matching_option.merge_paralogs
--search_radius '$advanced.refind_option.search_radius'
--refind_prop_match '$advanced.refind_option.refind_prop_match'
--refind-mode '$advanced.refind_option.refind_mode'
--min_trailing_support '$advanced.graph_correction_option.min_trailing_support'
--trailing_recursive '$advanced.graph_correction_option.trailing_recursive'
--edge_support_threshold '$advanced.graph_correction_option.edge_support_threshold'
--remove_by_consensus '$advanced.graph_correction_option.remove_by_consensus'
--high_var_flag '$advanced.graph_correction_option.high_var_flag'
--min_edge_support_sv '$advanced.graph_correction_option.min_edge_support_sv'
$advanced.graph_correction_option.all_seq_in_graph
$advanced.graph_correction_option.no_clean_edges
#if $advanced.gene_alignment_option.a != 'None'
-a '$advanced.gene_alignment_option.a'
#end if
#if '$advanced.gene_alignment_option.aligner' == 'mafft'
--aligner mafft
#else
--aligner '$advanced.gene_alignment_option.aligner'
#end if
#if $advanced.gene_alignment_option.core_subset != ''
--core_subset $advanced.gene_alignment_option.core_subset
#end if
#end if
-i $input_directory/*.gff
-o outdir
--clean-mode $mode
> '$log' &&
mv outdir/gene_presence_absence.Rtab outdir/gene_presence_absence_rtab.Rtab &&
2>&1
]]></command>
<inputs>
<param name="gff_input_collection" type="data_collection" format="gff" collection_type="list" label="GFF Input Collection" help="A list of gff files (i.e prokka)"/>
<param name="mode" type="select" label="The stringency mode at which to run panaroo" help="--clean-mode">
<expand macro="clean_mode"/>
</param>
<param name="gen_code" type="select" label="the codon table user for translation" help="default: 11">
<expand macro="genetic_code"/>
</param>
<conditional name="advanced">
<param name="adv_options_selector" type="select" label="Set advanced options?" help="Provides additional controls">
<option value="set">Set</option>
<option value="do_not_set" selected="True">Do not set</option>
</param>
<when value="set">
<param name="remove_invalid_gene" argument="--remove-invalid-genes" type="boolean" truevalue="--remove-invalid-genes" falsevalue="" label="removes annotations that do not conform to the expected Prokka format such as those including premature stop codons" help="--remove-invalid-genes"/>

<section name="matching_option" title="Matching" expanded="false">
<param name="seq_threshold" argument="--threshold" type="float" value="0.98" label="sequence identity threshold" help="default: 0.98"/>
<param name="peptide_threshold" argument="--family_threshold" type="float" value="0.7" label="protein family sequence identity threshold" help="default: 0.7"/>
<param name="length_diff_cutoff" argument="--len_dif_percent" type="float" value="0.98" label="length difference cutoff" help="default: 0.98"/>
<param name="merge_paralogs" type="boolean" truevalue="--merge_paralogs" falsevalue="" checked="false" label="do not split paralogs" help="--merge_paralogs"/>
</section>

<section name="refind_option" title="Refind" expanded="false">
<param argument="--search_radius" type="integer" value="5000" label="Search radius" help="--search_radius (default: 5000)"/>
<param argument="--refind_prop_match" type="float" value="0.75" label="Gene proportion match" help="default: 0.75"/>
<param argument="--refind_mode" type="select" label="The stringency mode at which to re-find genes" help="default: default">
<expand macro="refind_mode_option"/>
</param>
</section>

<section name="graph_correction_option" title="Graph Correction" expanded="false">
<param argument="--min_trailing_support" type="integer" value="2" label="Minimum cluster size to keep a gene called at the end of a contig" help="--min_traiiing_support [relexed mode : 2 is used]"/>
<param argument="--trailing_recursive" type="integer" value="1" label="Number of times to perform recursive trimming of low support nodes near the end of contigs" help="--trailing_recursive [relaxed mode: 1 is used]"/>
<param name="edge_support_threshold" type="integer" value="1" label="Edge support threshold" help="--edge_support_threshold [ Minimal edge 1 is used ]"/>
<param name="len_outlier_proportion" type="float" value="0.01" label="Length outlier support proportion" help="--length_outlier_support_proportion [default: 0.01]"/>
<param name="remove_by_consensus" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove consensus" help="--remove_by_consensus [default: False]"/>
<param name="high_var_flag" type="integer" value="5" label="Highly variable gene region" help="--high_var_flag [default: 5]"/>
<param name="min_edge_support_sv" type="integer" value="2" label="Minimum edge support structural variants" help="--min_edge_support_sv [relaxed mode: 2 is used]"/>
<param argument="--all_seq_in_graph" type="boolean" truevalue="--all_seq_in_graph" falsevalue="" label="Retains all DNA sequence" help="--all_seq_in_graph [default: off]"/>
<param argument="--no_clean_edges" type="boolean" truevalue="--no_clean_edges" falsevalue="" label="Edge filtering in the final output graph" help="--no_clean_edges [default: off]"/>
</section>

<section name="gene_alignment_option" title="Gene Alignment" expanded="false">
<param argument="-a" type="select" label="Output alignments of core genes or all genes." help="-a [optional: core or pan; default: None">
<expand macro="gene_alignment"/>
</param>
<param argument="--aligner" type="select" label="Specify an aligner" help="--aligner [mafft|prank|clustal][default: mafft]">
<expand macro="gene_aligner"/>
</param>
<param name="codons" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate codon alignments" help="--codons"/>
<param name="core_threshold" type="float" value="0.95" label="Core-genome sample threshold" help="--core_threshold [default: 0.95]"/>
<param argument="--core_subset" type="integer" value="" optional="true" label="Subset of the core genome to these many genes" help="--core_subset [default: all]"/>
<param name="core_entropy" type="float" value="0.1" label="Set the Block Mapping and Gathering with Entropy" help="--core_entropy_filter (threshold can be between 0.0 and 1.0) [default: Tukey outlier method]"/>
</section>
</when>
<when value="do_not_set"/>
</conditional>
</inputs>
<outputs>
<collection name="output" type="list" label="${tool.name} on ${on_string}: Pangenome output">
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;clstr)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;txt)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;gml)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;Rtab)" directory="outdir" format="tabular" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;csv)" directory="outdir" format="csv" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fasta)" directory="outdir" format="fasta" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fa)" directory="outdir" format="fasta" visible="false" />
<filter>advanced['gene_alignment_option']['a'] == 'None' </filter>
</collection>
<collection name="output_pangenome" type="list" label="${tool.name} on ${on_string}: Pangenome alignment output">
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;clstr)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;txt)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;gml)" directory="outdir" format="txt" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;Rtab)" directory="outdir" format="tabular" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;csv)" directory="outdir" format="csv" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fasta)" directory="outdir" format="fasta" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fa)" directory="outdir" format="fasta" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;aln)" directory="outdir" format="aln" visible="false" />
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;embl)" directory="outdir" format="embl" visible="false" />
<filter>advanced['gene_alignment_option']['a'] != 'None' </filter>
</collection>
<collection name="output_pangenome_fasta" type="list" label="${tool.name} on ${on_string}: Pangenom alignment fasta">
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fas)" directory="outdir/aligned_gene_sequences" format="fasta" visible="false" />
<filter>advanced['gene_alignment_option']['a'] != 'None' </filter>
</collection>

<data name="log" format="txt" label="${tool.name} on ${on_string}: log"/>
</outputs>
<tests>
<!-- run panaroo with default parameters (i.e panaroo -t 2 -i *.gff -o default \-\-clean-mode strict \-\-remove-invalid-genes) -->
<test expect_num_outputs="2">
<param name="gen_code" value="11"/>
<param name="mode" value="strict"/>
<param name="adv_options_selector" value="set"/>
<param name="a" value="None"/>
<param name="gff_input_collection">
<collection type="list">
<element name="gff10.gff" value="10_small.gff"/>
<element name="gff11.gff" value="11_small.gff"/>
</collection>
</param>
<output_collection name="output" count="13"/>
<output name="log">
<assert_contents>
<has_text text="pre-processing gff3 files..."/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="3">
<param name="gen_code" value="11"/>
<param name="mode" value="strict"/>
<param name="adv_options_selector" value="set"/>
<param name="a" value="core"/>
<param name="gff_input_collection">
<collection type="list">
<element name="gff10.gff" value="10_small.gff"/>
<element name="gff11.gff" value="11_small.gff"/>
</collection>
</param>
<output_collection name="output_pangenome" count="18"/>
<output_collection name="output_pangenome_fasta" count="251"/>
<output name="log">
<assert_contents>
<has_text text="pre-processing gff3 files..."/>
</assert_contents>
</output>
</test>
</tests>
<help><![CDATA[
Panaroo_ is A Bacterial Pangenome Analysis Pipeline.
**INPUTS**
Panaroo now supports multiple input formats. To use non-standard GFF3 files you must profile the input file as a list in a text file (one per line). Separate GFF and FASTA files can be provided per isolate by providing each file delimited by a space or a tab. Genbank file formats are also supported with extensions '.gbk', '.gb' or '.gbff'. These must compliant with Genbank/ENA/DDJB. This can be forced in Prokka by specifying the --compliance parameter.
- data file in gff format
**OUTPUTS**
- combined_protein_cdhit_out.txt
- combined_protein_cdhit_out.txt.clstr
- pre_filt_graph.gml
- gene_data.csv
- combined_protein_CDS.fasta
- combined_DNA_CDS.fasta
- gene_presence_absence.Rtab
- gene_presence_absence_roary.csv
- gene_presence_absence.csv
- summary_statistics.txt
- pan_genome_reference.fa
- struct_presence_absence.Rtab
- final_graph.gml
.. _Panaroo: https://gthlab.au/panaroo/#/gettingstarted/quickstart
]]></help>
<citations>
<citation type="doi">10.1186/s13059-020-02090-4</citation>
</citations>
</tool>

Loading

0 comments on commit 5eefab3

Please sign in to comment.