Initial basic test. No advanced params yet

galaxy-genome-annotation · Nov 2, 2023 · d172a83 · d172a83
1 parent b86f302
commit d172a83
Show file tree

Hide file tree

Showing 3 changed files with 183 additions and 24 deletions.
diff --git a/tools/repeatexplorer2/repex_full_clustering.xml b/tools/repeatexplorer2/repex_full_clustering.xml
@@ -9,42 +9,46 @@
       export PYTHONHASHSEED=0 &&
 
       /repex_tarean/seqclust
+      '$paired'
       #if $subsample_size:
         --sample '${subsample_size}'
       #end if
-      --output_dir=tarean_output
-      --logfile='${log}'
-      --cleanup '$paired'
       --taxon '$taxon'
+      --output_dir=output
+      --cpu \${GALAXY_SLOTS:-1}
+      ${FastaFile}  
+
+      &&
+
+      tar -cvf '${ReportArchive}' --directory=output .
+
 
-      #if $advanced_options.advanced:
-      --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering  -D $advanced_options.blastx.options_blastx
-      --assembly_min $advanced_options.assembly_min_cluster_size
+      ## #if $advanced_options.advanced:
+      ## --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering  -D $advanced_options.blastx.options_blastx
+      ## --assembly_min $advanced_options.assembly_min_cluster_size
 
-        #if $advanced_options.comparative.options_comparative:
-          --prefix_length $advanced_options.comparative.prefix_length
-        #end if
+      ##   #if $advanced_options.comparative.options_comparative:
+      ##     --prefix_length $advanced_options.comparative.prefix_length
+      ##   #end if
       
-        #if $advanced_options.custom_library.options_custom_library:
-       	  -d $advanced_options.custom_library.library extra_database
-        #end if
+      ##   #if $advanced_options.custom_library.options_custom_library:
+      ##  	  -d $advanced_options.custom_library.library extra_database
+      ##   #end if
         
-        #if $advanced_options.options.options:
-         -opt $advanced_options.options.options
-        #end if 
-      #end if
-      ${FastaFile}  
+      ##   #if $advanced_options.options.options:
+      ##    -opt $advanced_options.options.options
+      ##   #end if 
+      ## #end if
       ]]></command>
   <inputs>
     <param name="FastaFile" label="NGS reads" type="data" format="fasta" help="Input file must contain FASTA-formatted NGS reads. Illumina paired-end reads are recommended."/>
-    <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="If paired-end reads are used, left- and right-hand reads must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/>
-    <param name="subsample_size" type="integer" optional="true" value="" label="Subsample reads (number)" help="Use an integer &gt; 1 to select a specific number of reads to use."/>
+    <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="If paired-end reads are used, they must be interleaved and all pairs must be complete. Example of the correct format is provided in the help below."/>
+    <param name="subsample_size" type="integer" optional="true" value="" label="Subsample reads (number)" help="Use an integer &gt; 1 to select a specific number of reads to use. Leave this field blank to use the entire dataset"/>
     <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
       <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
       <option value="VIRIDIPLANTAE2.2" selected="true">Viridiplantae version 2.2</option>
       <option value="METAZOA3.0">Metazoa version 3.0</option>
       <option value="METAZOA2.0">Metazoa version 2.0</option>
-      <!-- Modify setting in config.py accordingly -->
     </param>
     <section name="advanced" title="Advanced options" expanded="false">
       <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this options to analyze multiple samples simultaneously"/>
@@ -65,13 +69,40 @@
       <param name="keep_names" label="Keep original read names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default, reads are renamed using integers. Use this option to keep original names."/>
       <param name="assembly_min_cluster_size" type="integer" label="Minimal cluster size for assembly" value="5" min="2" max="100"/>
     </section>
-
   </inputs>
   <outputs>
-    <data name="log" format="txt" label="RepeatExplorer2 - log file"/>
-    <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
-    <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
+    <data name="ReportArchive" format="tar" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
+    <data name="ReportFile" format="html" from_work_dir="output/index.html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
   </outputs>
+  <tests>
+    <!-- test1: basic function -->
+    <test expect_num_outputs="2">
+      <param name="FastaFile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+      <param name="paired" value="True"/>
+      <param name="taxon" value="VIRIDIPLANTAE3.0"/>
+      <output name="ReportArchive" file="test1_out.tar">
+        <assert_contents>
+          <has_size value="33873920" delta="3000000"/>
+        </assert_contents>
+      </output>
+      <output name="ReportFile" file="test1_out.html">
+      </output>
+    </test>
+    <!-- test2: read subsample -->
+    <test expect_num_outputs="2">
+      <param name="FastaFile" value="LAS_paired_10k.fa.gz" ftype="fasta.gz"/>
+      <param name="paired" value="True"/>
+      <param name="subsample_size" value="5000"/>
+      <param name="taxon" value="VIRIDIPLANTAE3.0"/>
+      <output name="ReportArchive" file="test2_out.tar">
+        <assert_contents>
+          <has_size value="17981440" delta="3000000"/>
+        </assert_contents>
+      </output>
+      <output name="ReportFile" file="test2_out.html">
+    </output>
+    </test>
+  </tests>
   <help><![CDATA[
       **HELP**
       

diff --git a/tools/repeatexplorer2/test-data/test1_out.html b/tools/repeatexplorer2/test-data/test1_out.html
@@ -0,0 +1,64 @@
+
+<html xmlns:mml="http://www.w3.org/1998/Math/MathML">
+  <head>
+    <meta charset="utf-8"/>	
+    <title> Clustering summary </title>
+    <link rel="stylesheet" href="style1.css">
+  </head>
+
+ <h1 > Clustering Summary</h1>
+<a href="summary_histogram.png"> <img src="summary_histogram.png" width="700" border="1" > </a><p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>
+ <h2 > Run information:</h2>
+
+<p class='character'>Number of input reads: 10000</p>
+
+<p class='character'>Number of analyzed reads: 10000</p>
+
+<p class='character'>Proportion of reads in top clusters : 14 %</p>
+
+<p class='character'>Cluster merging: No</p>
+
+<p class='character'>Paired-end reads: Yes</p>
+
+ <h2 > Available analyses:</h2>
+<p> <a href="tarean_report.html">Tandem repeat analysis</a> </p><p> <a href="cluster_report.html">Cluster annotation</a> </p><p> <a href="supercluster_report.html">Supercluster annotation</a> </p><p> <a href="summarized_annotation.html">Repeat annotation summary</a> </p>
+ <h2 > Supplementary files:</h2>
+<p> <a href="CLUSTER_TABLE.csv">CLUSTER_TABLE.csv</a> </p><p> <a href="SUPERCLUSTER_TABLE.csv">SUPERCLUSTER_TABLE.csv</a> </p><p> <a href="contigs.fasta">contigs.fasta</a> </p><hr>
+
+<h3> How to cite </h3>
+<p>
+	Novak, P., Neumann, P., Pech, J., Steinhaisl, J., Macas, J. (2013) -
+	  <a href="http://bioinformatics.oxfordjournals.org/content/29/6/792">RepeatExplorer: a Galaxy-based web server for genome-wide characterization of eukaryotic repetitive elements from next generation sequence reads.</a> <i> Bioinformatics</i> <b>29</b>:792-793.
+</p>
+
+<p><i> Classification of repetitive elements using REXdb:</i></p>
+<p>Neumann, P., Novak, P., Hostakova, N., Macas, J. (2019) &#8211; <a href="https://mobilednajournal.biomedcentral.com/articles/10.1186/s13100-018-0144-1" target="_blank">Systematic survey of plant LTR-retrotransposons elucidates phylogenetic relationships of their polyprotein domains and provides a reference for element classification</a>. <em>Mobile DNA</em> <b>10</b>:1.</p>
+
+</p>
+<i>The principle of repeat identification implemented in the RepeatExplorer:</i>
+<p>
+	  Novak, P., Neumann, P., Macas, J. (2010) - <a href="http://www.biomedcentral.com/1471-2105/11/378">Graph-based clustering and characterization of repetitive sequences in next-generation sequencing data.</a> <i>BMC Bioinformatics</i> <b>11</b>:378.
+</p>
+<i>Using TAREAN for satellite repeat detection and characterization:</i>
+<p>
+  Novak, P., Robledillo, L.A.,Koblizkova, A., Vrbova, I., Neumann, P., Macas, J. (2017) -
+    <a href="https://doi.org/10.1093/nar/gkx257"> TAREAN: a computational tool for identification and characterization of satellite DNA from unassembled short reads.</a> <i> Nucleic Acid Research </i> <b>45</b>:e111
+</p>
+<br><hr>
+ <h3 > Details:</h3>
+<pre>
+--------------------------------------------------------------------------
+PIPELINE VERSION         : devel-0.3.8-2917(e753f81)
+
+PROTEIN DATABASE VERSION : protein_database_viridiplantae_v3.0.fasta
+            md5 checksum : a36362f4e8b024f1ce97589aac1e6f1a
+
+DNA DATABASE VERSION     : dna_database_masked.fasta
+            md5 checksum : 86bab7cdd3e70374cd756de13680240d
+--------------------------------------------------------------------------
+</pre>
+<p class='character'>Minimal number of reads in cluster to be considered top cluster : 20</p>
+
+<p class='character'>Reserved Memory : 31G</p>
+
+<p class='character'>Maximum number of processable reads with the reserved memory : 1557523</p>
diff --git a/tools/repeatexplorer2/test-data/test2_out.html b/tools/repeatexplorer2/test-data/test2_out.html
@@ -0,0 +1,64 @@
+
+<html xmlns:mml="http://www.w3.org/1998/Math/MathML">
+  <head>
+    <meta charset="utf-8"/>	
+    <title> Clustering summary </title>
+    <link rel="stylesheet" href="style1.css">
+  </head>
+
+ <h1 > Clustering Summary</h1>
+<a href="summary_histogram.png"> <img src="summary_histogram.png" width="700" border="1" > </a><p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>
+ <h2 > Run information:</h2>
+
+<p class='character'>Number of input reads: 10000</p>
+
+<p class='character'>Number of analyzed reads: 5000</p>
+
+<p class='character'>Proportion of reads in top clusters : 8.3 %</p>
+
+<p class='character'>Cluster merging: No</p>
+
+<p class='character'>Paired-end reads: Yes</p>
+
+ <h2 > Available analyses:</h2>
+<p> <a href="tarean_report.html">Tandem repeat analysis</a> </p><p> <a href="cluster_report.html">Cluster annotation</a> </p><p> <a href="supercluster_report.html">Supercluster annotation</a> </p><p> <a href="summarized_annotation.html">Repeat annotation summary</a> </p>
+ <h2 > Supplementary files:</h2>
+<p> <a href="CLUSTER_TABLE.csv">CLUSTER_TABLE.csv</a> </p><p> <a href="SUPERCLUSTER_TABLE.csv">SUPERCLUSTER_TABLE.csv</a> </p><p> <a href="contigs.fasta">contigs.fasta</a> </p><hr>
+
+<h3> How to cite </h3>
+<p>
+	Novak, P., Neumann, P., Pech, J., Steinhaisl, J., Macas, J. (2013) -
+	  <a href="http://bioinformatics.oxfordjournals.org/content/29/6/792">RepeatExplorer: a Galaxy-based web server for genome-wide characterization of eukaryotic repetitive elements from next generation sequence reads.</a> <i> Bioinformatics</i> <b>29</b>:792-793.
+</p>
+
+<p><i> Classification of repetitive elements using REXdb:</i></p>
+<p>Neumann, P., Novak, P., Hostakova, N., Macas, J. (2019) &#8211; <a href="https://mobilednajournal.biomedcentral.com/articles/10.1186/s13100-018-0144-1" target="_blank">Systematic survey of plant LTR-retrotransposons elucidates phylogenetic relationships of their polyprotein domains and provides a reference for element classification</a>. <em>Mobile DNA</em> <b>10</b>:1.</p>
+
+</p>
+<i>The principle of repeat identification implemented in the RepeatExplorer:</i>
+<p>
+	  Novak, P., Neumann, P., Macas, J. (2010) - <a href="http://www.biomedcentral.com/1471-2105/11/378">Graph-based clustering and characterization of repetitive sequences in next-generation sequencing data.</a> <i>BMC Bioinformatics</i> <b>11</b>:378.
+</p>
+<i>Using TAREAN for satellite repeat detection and characterization:</i>
+<p>
+  Novak, P., Robledillo, L.A.,Koblizkova, A., Vrbova, I., Neumann, P., Macas, J. (2017) -
+    <a href="https://doi.org/10.1093/nar/gkx257"> TAREAN: a computational tool for identification and characterization of satellite DNA from unassembled short reads.</a> <i> Nucleic Acid Research </i> <b>45</b>:e111
+</p>
+<br><hr>
+ <h3 > Details:</h3>
+<pre>
+--------------------------------------------------------------------------
+PIPELINE VERSION         : devel-0.3.8-2917(e753f81)
+
+PROTEIN DATABASE VERSION : protein_database_viridiplantae_v3.0.fasta
+            md5 checksum : a36362f4e8b024f1ce97589aac1e6f1a
+
+DNA DATABASE VERSION     : dna_database_masked.fasta
+            md5 checksum : 86bab7cdd3e70374cd756de13680240d
+--------------------------------------------------------------------------
+</pre>
+<p class='character'>Minimal number of reads in cluster to be considered top cluster : 20</p>
+
+<p class='character'>Reserved Memory : 31G</p>
+
+<p class='character'>Maximum number of processable reads with the reserved memory : 1557523</p>