dada2: upgrade to 1.12

- remove "deprecated" derep step - added script for the generation of (almost all) test data - few tweaks and bugfixes
galaxyproject · Oct 29, 2019 · b360e1c · b360e1c
1 parent d9c7658
commit b360e1c
Show file tree

Hide file tree

Showing 31 changed files with 493 additions and 541 deletions.
diff --git a/tools/dada2/README.md b/tools/dada2/README.md
@@ -1,7 +1,6 @@
 Wrappers for the core functionality of the dada2 package https://benjjneb.github.io/dada2/index.html. 
 
 - filterAndTrim
-- derep
 - learnErrors
 - dada
 - mergePairs
@@ -13,9 +12,8 @@ Datatypes
 
 The dada2 Galaxy wrappers use a few extra data types to ensure that only inputs of the correct type can be used, these datatypes are available from Galaxy release 19.05, for earlier releases they need to be added manually. 
 
-For the outputs of derep, dada, learnErrors, and mergePairs the following datatypes are used that derive from  Rdata (which contains the named list that is returned from the corresponding dada function):
+For the outputs of dada, learnErrors, and mergePairs the following datatypes are used that derive from  Rdata (which contains the named list that is returned from the corresponding dada function):
 
-- dada2_derep (Rdata: named list see docs for derep-class)
 - dada2_dada (Rdata: named list, see docs for dada-class)
 - dada2_errorrates (Rdata: named list, see docs for learnErrors)
 - dada2_mergepairs (Rdata: named list, see docs for mergePairs)
@@ -31,8 +29,13 @@ For the outputs of makeSequenceTable and removeBimeraDenovo the following data t
 
 Note the difference between the R and Galaxy representations! The main motivation is that the dada2_sequencetable is analogous to OTU tables as produced for instance by qiime (and it seemed natural to extend this to the uniques, which are essentially sequencetables of single samples).
 
+Test data
+=========
+
+Test data for `dada2_seqCounts` is generated using planemo's `--update_test_data` argument and manual
+inspection of the test files. In addition a run of the pipeline (using collections) is executed
+manually using `planemo serve` making sure that the entries of the tables are generated in a useful way.
 
-TODOs 
-=====
+In order to have the Collection unzip tool available use `planemo s --galaxy_root GALAXY_ROOT  --extra_tools GALAXY_ROOT/lib/galaxy/tools/`
 
-- implement getUniques tool to view intermediate results?
+All other test data is generated using the shell script (`gentest.sh`) in test-data.
diff --git a/tools/dada2/dada2_derepFastq.xml b/tools/dada2/dada2_derepFastq.xml
diff --git a/tools/dada2/dada2_filterAndTrim.xml b/tools/dada2/dada2_filterAndTrim.xml
@@ -69,17 +69,17 @@ filt.rev <- NULL
 #end if
 
 #if str($orientFwd) == ""
-	orientFwd <- NULL
+    orientFwd <- NULL
 #else
-	orientFwd <- "$orientFwd"
+    orientFwd <- "$orientFwd"
 #end if
 
 library(dada2, quietly=T)
 
 ftout <- filterAndTrim(fwd, filt.fwd, rev, filt.rev,  compress = TRUE,
     truncQ = truncQ, truncLen = truncLen, trimLeft = trimLeft, trimRight = trimRight, maxLen = maxLen,
-	minLen = minLen, maxN = maxN, minQ = minQ, maxEE = maxEE, rm.lowcomplex = $rmlowcomplex,
-	rm.phix = $rmPhiX, orient.fwd = orientFwd)
+    minLen = minLen, maxN = maxN, minQ = minQ, maxEE = maxEE, rm.lowcomplex = $rmlowcomplex,
+    rm.phix = $rmPhiX, orient.fwd = orientFwd)
 
 rownames(ftout) <- c( '$paired_cond.reads.element_identifier' )
 write.table(ftout, "$outtab", quote=F, sep="\t", col.names=NA)
@@ -229,7 +229,7 @@ Usage
 
 **Output** is a (paired) collection of filtered and trimmed paired FASTQ datasets (again one data set or pair per sample).
 
-Upstream dada2 tools are *dada2: derepFastq* and *dada2: learnErrorRates*. Note that these tools do not work on paired end data. So, if you have paired end data you need to split the generated paired collection into one containing the forward reads and one containing the reverse reads. This can be done by the *unzip collection* tool.
+Downstream dada2 tools are *dada2: learnErrorRates* and *dada2: dada*. Note that these tools do not work on paired end data. So, if you have paired end data you need to split the generated paired collection into one containing the forward reads and one containing the reverse reads. This can be done by the *unzip collection* tool.
 
 An additional tabular output gives the number of reads before and after trimming. This data set can be used as input for *dada2: sequence counts* to track the sequence counts for each sample through all dada2 pipeline steps.
 

diff --git a/tools/dada2/dada2_plotComplexity.xml b/tools/dada2/dada2_plotComplexity.xml
@@ -87,7 +87,7 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
             <when value="batch">
                 <expand macro="fastq_input" multiple="False" collection_type="paired" argument_fwd="fl" argument_rev="fl"/>
             </when>
-	</conditional>
+        </conditional>
         <param argument="kmerSize" type="integer" value="2" label="kmer size" help="kmer: also known as oligonucleotides words"/>
         <param argument="window" type="integer" value="" optional="true" label="width (nucleotides) of the moving window" help="If not specified (default) the whole sequence is used"/>
         <param argument="by" type="integer" value="5" label="step size (nucleotides)" help="between each moving window tested"/>
@@ -106,6 +106,7 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
         </data>
     </outputs>
     <tests>
+        <!-- all tests are against the same file using a delta that should ensure that the pdf contains a plot -->
         <!-- paired joint, no-aggregate -->
         <test expect_num_outputs="2">
             <param name="batch_cond|batch_select" value="joint"/>
@@ -121,8 +122,8 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
                 </collection>
             </param>
             <param name="batch_cond|aggregate" value="FALSE"/>
-            <output name="output_fwd" value="complexityMultiple.pdf" ftype="pdf"/>
-            <output name="output_rev" value="complexityMultiple_rev.pdf" ftype="pdf"/>
+            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
+            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
         <!-- paired-separate joint, no-aggregate (sim_size because element ids differ) -->
         <test expect_num_outputs="2">
@@ -131,8 +132,8 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|aggregate" value="FALSE"/>
-            <output name="output_fwd" value="complexityMultiple.pdf" ftype="pdf" compare="sim_size"/>
-            <output name="output_rev" value="complexityMultiple_rev.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
+            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
         <!-- single, non-batch, aggregate, small sample -->
         <test expect_num_outputs="1">
@@ -141,7 +142,7 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz,F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="n" value="10000"/>
             <param name="batch_cond|aggregate" value="TRUE"/>
-            <output name="output" value="complexitySmallSample.pdf" ftype="pdf"/>
+            <output name="output" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
 
         <!-- paired, batch -->
@@ -154,25 +155,25 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
                     <element name="reverse" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                 </collection>
             </param>
-            <output name="output_fwd" value="complexity.pdf" ftype="pdf"/>
-            <output name="output_rev" value="complexity_rev.pdf" ftype="pdf"/>
+            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
+            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
         <!-- paired-separate batch  (sim_size because element ids differ)-->
         <test expect_num_outputs="2">
             <param name="batch_cond|batch_select" value="batch"/>
             <param name="batch_cond|paired_cond|paired_select" value="separate"/>
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
-            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size"/>
-            <output name="output_rev" value="complexity_rev.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_fwd" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
+            <output name="output_rev" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
         <!-- single, batch -->
         <test expect_num_outputs="1">
             <param name="batch_cond|batch_select" value="batch"/>
             <param name="batch_cond|paired_cond|paired_select" value="single"/>
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="n" value="10000"/>
-            <output name="output" value="complexitySmallSample.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output" value="complexity.pdf" ftype="pdf" compare="sim_size" delta="200"/>
         </test>
     </tests>
     <help><![CDATA[

diff --git a/tools/dada2/dada2_plotQualityProfile.xml b/tools/dada2/dada2_plotQualityProfile.xml
@@ -96,6 +96,7 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
         </data>
     </outputs>
     <tests>
+        <!-- all tests are against the same file using a delta that should ensure that the pdf contains a plot -->
         <!-- paired joint, no-aggregate -->
         <test expect_num_outputs="2">
             <param name="batch_cond|batch_select" value="joint"/>
@@ -111,8 +112,8 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
                 </collection>
             </param>
             <param name="batch_cond|aggregate" value="FALSE"/>
-            <output name="output_fwd" value="qualityProfileMultiple.pdf" ftype="pdf"/>
-            <output name="output_rev" value="qualityProfileMultiple_rev.pdf" ftype="pdf"/>
+            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
+            <output name="output_rev" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
         <!-- paired-separate joint, no-aggregate (sim_size because element ids differ) -->
         <test expect_num_outputs="2">
@@ -121,8 +122,8 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|aggregate" value="FALSE"/>
-            <output name="output_fwd" value="qualityProfileMultiple.pdf" ftype="pdf" compare="sim_size"/>
-            <output name="output_rev" value="qualityProfileMultiple_rev.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
+            <output name="output_rev" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
         <!-- single, non-batch, aggregate, small sample -->
         <test expect_num_outputs="1">
@@ -131,7 +132,7 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz,F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="n" value="10000"/>
             <param name="batch_cond|aggregate" value="TRUE"/>
-            <output name="output" value="qualityProfileSmallSample.pdf" ftype="pdf"/>
+            <output name="output" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
 
         <!-- paired, batch -->
@@ -144,25 +145,25 @@ ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
                     <element name="reverse" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
                 </collection>
             </param>
-            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf"/>
-            <output name="output_rev" value="qualityProfile_rev.pdf" ftype="pdf"/>
+            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
+            <output name="output_rev" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
         <!-- paired-separate batch  (sim_size because element ids differ)-->
         <test expect_num_outputs="2">
             <param name="batch_cond|batch_select" value="batch"/>
             <param name="batch_cond|paired_cond|paired_select" value="separate"/>
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
-            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size"/>
-            <output name="output_rev" value="qualityProfile_rev.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
+            <output name="output_rev" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
         <!-- single, batch -->
         <test expect_num_outputs="1">
             <param name="batch_cond|batch_select" value="batch"/>
             <param name="batch_cond|paired_cond|paired_select" value="single"/>
             <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="n" value="10000"/>
-            <output name="output" value="qualityProfileSmallSample.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output" value="qualityProfile.pdf" ftype="pdf" compare="sim_size" delta="5000"/>
         </test>
     </tests>
     <help><![CDATA[

diff --git a/tools/dada2/dada2_removeBimeraDenovo.xml b/tools/dada2/dada2_removeBimeraDenovo.xml
@@ -24,7 +24,7 @@ seqtab.nochim <- removeBimeraDenovo(unqs, method = "$method")
 ##   in the latter case the named int matrix is stored as tabular (rows=samples, columns=ASVs)
 ## - otherwise uniques-vector, i.e. a named integer vector
 
-#if $unqs.is_of_type('dada2_derep') or $unqs.is_of_type('dada2_dada')
+#if $unqs.is_of_type('dada2_dada')
     write.data( seqtab.nochim, '$stable_uniques', "dada2_uniques" )
 #else if $unqs.is_of_type('dada2_sequencetable')
     write.data( seqtab.nochim, '$stable_sequencetable', "dada2_sequencetable" )
@@ -54,7 +54,7 @@ if(class(unqs)=="list"){
    <outputs>
         <!-- fix output filters in a later release https://github.com/galaxyproject/galaxy/issues/7464 -->
         <data name="stable_uniques" format="dada2_uniques" label="${tool.name} on ${on_string}" from_work_dir="nonchim.uniques">
-            <filter>unqs.ext == "dada2_derep" or unqs.ext == "dada2_dada"</filter>
+            <filter>unqs.ext == "dada2_dada"</filter>
         </data>
         <data name="stable_mergepairs" format="dada2_mergepairs" label="${tool.name} on ${on_string}" from_work_dir="nonchim.mergepairs">
             <filter>unqs.ext == "dada2_mergepairs"</filter>
@@ -68,11 +68,6 @@ if(class(unqs)=="list"){
             <param name="unqs" ftype="dada2_sequencetable" value="makeSequenceTable_F3D0.tab"/>
             <output name="stable_sequencetable" value="removeBimeraDenovo_F3D0.tab" ftype="dada2_sequencetable" />
         </test>
-        <!-- derep input -->
-        <test expect_num_outputs="1">
-            <param name="unqs" ftype="dada2_derep" value="derepFastq_F3D0_R1.Rdata"/>
-            <output name="stable_uniques" value="removeBimeraDenovo_F3D0_derep_uniques.tab" ftype="dada2_uniques" />
-        </test>
         <!-- dada input -->
         <test expect_num_outputs="1">
             <param name="unqs" ftype="dada2_dada" value="dada_F3D0_R1.Rdata"/>
@@ -99,7 +94,7 @@ Usage
 
 **Input**
 
-- the results of makeSequenceTable (note that also the results of derep, dada, and mergePairs are accepted)
+- the results of makeSequenceTable (note that the results of dada and mergePairs are also accepted)
 
 **Output**