Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dada2 #5

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
95f398a
dada2: initial commit of rudiment dada2 wrappers
bernt-matthias Feb 24, 2019
ff2621b
dada2: added README
bernt-matthias Feb 26, 2019
af371b6
dada2 readme addition
bernt-matthias Feb 27, 2019
42eb676
dada2: start to have different data types (not all Rdata)
bernt-matthias Mar 1, 2019
308edb2
dada2: add data manager, taxonomy, sequence count
bernt-matthias Mar 7, 2019
eec95cc
dada2: shed.yml and reference data
bernt-matthias Mar 7, 2019
d63c840
dada2: added all reference data
bernt-matthias Mar 8, 2019
9616b7e
dada2: dadamanager :)
bernt-matthias Mar 11, 2019
999a151
dada2: bugfixing and documenting
bernt-matthias Mar 12, 2019
d986bdd
dada2: more work on tests
bernt-matthias Mar 12, 2019
5b1603b
dada2: more testing
bernt-matthias Apr 5, 2019
9901926
dada2:
bernt-matthias May 5, 2019
72e7870
dada2: allow separate input of paired data
bernt-matthias May 7, 2019
d83173b
dada2: version bump
bernt-matthias May 9, 2019
2f47c03
dada2: working data manager
bernt-matthias May 10, 2019
7831a1e
dada2: allow fastq input
bernt-matthias May 24, 2019
76b9afb
dada2: allow fastq input
bernt-matthias May 27, 2019
977f221
dada2: fix bug in filter and trim (format source)
bernt-matthias May 27, 2019
a547707
dada2: tests, plotQualityProfile
bernt-matthias May 28, 2019
7532506
dada2: add overview to two tools
bernt-matthias May 28, 2019
f89a767
dada2: added oom stdio test
bernt-matthias Jun 3, 2019
11ea529
dada2: fix typo in tests data name
bernt-matthias Jun 6, 2019
5d83143
dada2: data manager more explicit UNITE option
bernt-matthias Jun 17, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions data_managers/data_manager_dada2/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: data_manager_dada2
owner: matthias
description: "Data manager to download DADA2 reference databases"
homepage_url: "https://benjjneb.github.io"
long_description: |
DADA2: Fast and accurate sample inference from amplicon data with single-nucleotide resolution
remote_repository_url: "https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2"
type: unrestricted
categories:
- Data Managers
181 changes: 181 additions & 0 deletions data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
<?xml version="1.0"?>
<tool id="dada_fetcher" name="dada2 data manager" tool_type="manage_data" version="0.0.7">
<description>Download reference databases</description>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/data_manager.py'
--out '$out_file'
#set dataset = str($db_cond.db_select) + '_' + str($db_cond.version_select)
--dataset '$dataset'
]]>
</command>
<inputs>
<conditional name="db_cond">
<param name="db_select" type="select" label="Taxonomic database">
<option value="silva">Silva</option>
<option value="rdp">RDP</option>
<option value="greengenes">GreenGenes</option>
<option value="unite">UNITE Fungi: General Fasta</option>
<!-- UNITE Eukaryotes not yet supported https://github.com/benjjneb/dada2/issues/702 -->
<option value="RefSeq_RDP">NCBI RefSeq 16S rRNA database supplemented by RDP</option>
<option value="gtdb">GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea)</option>
<option value="hitdb">HitDB (Human InTestinal 16S)</option>
<option value="silva_euk_18S">Silva Eukaryotic 18S</option>
<option value="PR2">Protist Ribosomal Reference database (PR2)</option>
</param>
<when value="silva">
<param name="version_select" type="select" label="Database version">
<option value="132">132</option>
<option value="128">128</option>
</param>
</when>
<when value="rdp">
<param name="version_select" type="select" label="Database version">
<option value="16">16</option>
<option value="14">14</option>
</param>
</when>
<when value="greengenes">
<param name="version_select" type="select" label="Database version">
<option value="13.84">13.84</option>
</param>
</when>
<when value="unite">
<param name="version_select" type="select" label="Database version">
<option value="8.0_fungi">release 8.0 for Fungi</option>
<option value="8.0_fungi_singletons">release 8.0 for Fungi including global and 97% singletons</option>
</param>
</when>
<when value="RefSeq_RDP">
<param name="version_select" type="select" label="Database version">
<option value="2018_05">05/2018</option>
</param>
</when>
<when value="gtdb">
<param name="version_select" type="select" label="Database version">
<option value="2018_11">11/2018</option>
</param>
</when>
<when value="hitdb">
<param name="version_select" type="select" label="Database version">
<option value="1">1</option>
</param>
</when>
<when value="silva_euk_18S">
<param name="version_select" type="select" label="Database version">
<option value="132">132</option>
</param>
</when>
<when value="PR2">
<param name="version_select" type="select" label="Database version">
<option value="4.11.1">4.11.1</option>
</param>
</when>
</conditional>
</inputs>
<outputs>
<data name="out_file" format="data_manager_json" />
</outputs>
<tests>
<test>
<param name="db_cond|db_select" value="silva"/>
<param name="db_cond|version_select" value="132"/>
<output name="out_file" file="silva132_json"/>
</test>
<test>
<param name="db_cond|db_select" value="rdp"/>
<param name="db_cond|version_select" value="16"/>
<output name="out_file" file="rdp16_json"/>
</test>
<test>
<param name="db_cond|db_select" value="greengenes"/>
<param name="db_cond|version_select" value="13.84"/>
<output name="out_file" file="greengenes13.84_json"/>
</test>
<test>
<param name="db_cond|db_select" value="unite"/>
<param name="db_cond|version_select" value="8.0_fungi"/>
<output name="out_file" file="unite8fungi_json"/>
</test>
<test>
<param name="db_cond|db_select" value="unite"/>
<param name="db_cond|version_select" value="8.0_fungi_singletons"/>
<output name="out_file" file="unite8fungisingletons_json"/>
</test>
<test>
<param name="db_cond|db_select" value="RefSeq_RDP"/>
<param name="db_cond|version_select" value="2018_05"/>
<output name="out_file" file="RefSeq_RDP2018_json"/>
</test>
<test>
<param name="db_cond|db_select" value="gtdb"/>
<param name="db_cond|version_select" value="2018_11"/>
<output name="out_file" file="gtdb2018_json"/>
</test>
<test>
<param name="db_cond|db_select" value="hitdb"/>
<param name="db_cond|version_select" value="1"/>
<output name="out_file" file="hitdb1_json"/>
</test>
<test>
<param name="db_cond|db_select" value="silva_euk_18S"/>
<param name="db_cond|version_select" value="132"/>
<output name="out_file" file="silvaeuk132_json"/>
</test>
<test>
<param name="db_cond|db_select" value="PR2"/>
<param name="db_cond|version_select" value="4.11.1"/>
<output name="out_file" file="PR24.11.1_json"/>
</test>
</tests>
<help><![CDATA[
Public Reference databases maintained by the DADA2 project
..........................................................

The following reference databases, which are described as maintained by the DADA2 project (https://benjjneb.github.io/dada2/training.html), are available:

- Silva (https://www.arb-silva.de/)
- RDP (http://rdp.cme.msu.edu/)
- GreenGenes (http://greengenes.secondgenome.com/)
- UNITE general FASTA (https://unite.ut.ee/repository.php)

While Silva and RDP contain reference databases for taxonomy and species assignment, the GreenGenes and UNITE databases only contain a reference database for taxonomy assignment.

For the Silva databases check the license information: http://www.arb-silva.de/silva-license-information.

Except for UNITE, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The UNITE databases are taken from the links provided on the UNITE website.

More detailed information on the reference databases can be found on the DADA2 website and the links it contains: https://benjjneb.github.io/dada2/training.html.

Further public Reference databases listed by the DADA2 project
..............................................................

Several contributed reference databases are listed on the DADA2 project website (https://benjjneb.github.io/dada2/training.html):

- RefSeq + RDP (NCBI RefSeq 16S rRNA database supplemented by RDP)
- GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/)
- HitDB version 1 (Human InTestinal 16S rRNA) (https://github.com/microbiome/HITdb)
- RDP fungi LSU
- Silva Eukaryotic 18S
- PR2 (https://github.com/pr2database/pr2database)

Except for PR2, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The PR2 database is taken from their github page.

More detailed information on the reference databases can be found on the DADA2 website and the links it contains: https://benjjneb.github.io/dada2/training.html.
]]></help>
<citations>
<!-- silva -->
<citation type="doi">10.1093/nar/gks1219</citation>
<!-- rdp -->
<citation type="doi">10.1093/nar/gkt1244</citation>
<!-- greengenes -->
<citation type="doi">10.1128/AEM.03006-05</citation>
<!-- unite -->
<citation type="doi">10.15156/BIO/786343</citation>
<!-- TODO gtdb ??? -->
<!-- hitdb -->
<citation type="doi">10.1186/s12864-015-2265-y</citation>
<!-- PR2 -->
<citation type="doi">10.1093/nar/gks1160</citation>
</citations>
</tool>

134 changes: 134 additions & 0 deletions data_managers/data_manager_dada2/data_manager/data_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import argparse
import json
import os
import shutil
import sys
import zipfile
try:
# For Python 3.0 and later
from urllib.request import Request, urlopen
except ImportError:
# Fall back to Python 2 imports
from urllib2 import Request, urlopen

# Comma separated taxonomy levels used for every data set that has no
# explicit entry in FILE2TAXLEVELS.
DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species"

# Maps a data set key ("<database>_<version>", as assembled by the tool's
# command block from db_select and version_select) to the display name that is
# written into the "name" column of the data table entry.
# NOTE(review): the gtdb entry contains the HTML entity "&amp;" inside a plain
# Python string; test-data/gtdb2018_json expects it verbatim -- confirm that
# this is intended and not a copy/paste from the tool XML.
FILE2NAME = {
"silva_132":"Silva version 132",
"silva_128":"Silva version 128",
"rdp_16":"RDP trainset 16",
"rdp_14":"RDP trainset 14",
"greengenes_13.84":"GreenGenes version 13.84",
"unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
"unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
"RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
"gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea) (11/2018)",
"hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)",
"silva_euk_18S_132": "Silva version 132 Eukaryotic 18S",
"PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1"
}

# Maps a data set key to the download URL of its taxonomy training set.
# Every key of FILE2NAME has an entry here. The UNITE URLs point to zip
# archives (handled specially in url_download), all others to gzipped fasta.
FILE2TAXURL = {
"silva_132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
"silva_128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
"rdp_16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
"rdp_14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
"unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
"unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
"greengenes_13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
"RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
"gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
"hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
"silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1",
"PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz"
}

# Maps a data set key to the download URL of its species assignment file.
# Only the Silva and RDP data sets have one; for all other keys no
# dada2_species entry is produced by remote_dataset.
FILE2SPECIESURL = {
"silva_132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
"silva_128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
"rdp_16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
"rdp_14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1"
}

# Per data set override of the taxonomy levels; data sets that are not listed
# fall back to DEFAULT_TAXLEVELS.
FILE2TAXLEVELS = {
"PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"
}

def url_download(url, fname, workdir):
    """
    Download url to workdir/fname.

    workdir is created (including parents) if it does not exist yet.

    UNITE data sets get a special treatment, triggered by fname starting with
    "unite": the download is a zip archive containing two fasta files
    (xyz.fasta and developer/xyz.fasta). The archive is extracted into
    workdir and the single top level fasta file is gzip compressed to
    workdir/fname, replacing the downloaded zip file.

    :param url: URL to fetch
    :param fname: name of the file to create inside workdir
    :param workdir: directory that receives the download
    :raises Exception: if an extracted UNITE archive does not contain exactly
        one top level file matching "*fasta"
    """
    file_path = os.path.join(workdir, fname)
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    # The object returned by urlopen is not a context manager on Python 2,
    # hence the explicit try/finally.
    src = urlopen(Request(url))
    try:
        with open(file_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        src.close()

    # special treatment of UNITE DBs: they are zip files containing two fasta
    # (xyz.fasta and developer/xyz.fasta)
    if fname.startswith("unite"):
        # glob and gzip are not imported at module level
        import glob
        import gzip

        # unzip download; the with block closes the archive even if the
        # extraction fails
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(workdir)

        # gzip top level fasta file
        fastas = glob.glob("%s/*fasta" % workdir)
        if len(fastas) != 1:
            msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas))
            raise Exception(msg)
        with open(fastas[0], 'rb') as f_in:
            with gzip.open(file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)


def remote_dataset(dataset, outjson):
    """
    Download the reference data of one data set and write the data manager
    JSON describing the new data table entries.

    outjson is first read as the parameter JSON: the directory given by
    params['output_data'][0]['extra_files_path'] is created and receives the
    downloads. Afterwards outjson is overwritten with the resulting data
    manager JSON. A dada2_taxonomy entry is always written, a dada2_species
    entry only if FILE2SPECIESURL has a URL for the data set.

    :param dataset: data set key, must be present in FILE2TAXURL and FILE2NAME
    :param outjson: path of the JSON file (input parameters, then output)
    :raises KeyError: if dataset is unknown
    :raises OSError: if the extra files directory already exists
    """
    with open(outjson) as jf:
        params = json.load(jf)

    workdir = params['output_data'][0]['extra_files_path']
    os.mkdir(workdir)
    url_download(FILE2TAXURL[dataset], dataset + ".taxonomy", workdir)

    data_manager_json = {"data_tables": {}}
    data_manager_entry = {}
    data_manager_entry['value'] = dataset
    data_manager_entry['name'] = FILE2NAME[dataset]
    data_manager_entry['path'] = dataset + ".taxonomy"
    data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS)
    data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry

    # species assignment data is only available for some data sets
    if FILE2SPECIESURL.get(dataset, False):
        url_download(FILE2SPECIESURL[dataset], dataset + ".species", workdir)
        data_manager_entry = {}
        data_manager_entry['value'] = dataset
        data_manager_entry['name'] = FILE2NAME[dataset]
        data_manager_entry['path'] = dataset + ".species"
        data_manager_json["data_tables"]["dada2_species"] = data_manager_entry

    # open() instead of the Python 2 only builtin file(), which raises a
    # NameError on Python 3
    with open(outjson, 'w') as jf:
        jf.write(json.dumps(data_manager_json))

if __name__ == '__main__':
    # Command line entry point: parse --out and --dataset and hand them over
    # to remote_dataset.
    cli = argparse.ArgumentParser(description='Create data manager json.')
    for flag, description in (('--out', 'JSON filename'),
                              ('--dataset', 'Download data set name')):
        cli.add_argument(flag, action='store', help=description)
    options = cli.parse_args()

    remote_dataset(dataset=options.dataset, outjson=options.out)
34 changes: 34 additions & 0 deletions data_managers/data_manager_dada2/data_manager_conf.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0"?>
<!-- Data manager configuration for the tool in data_manager/dada2_fetcher.xml.
     It declares the two data tables that are filled from the tool output
     "out_file": dada2_taxonomy and dada2_species. -->
<data_managers>
<!-- NOTE(review): the id here is "dada2_fetcher" while the tool in
     dada2_fetcher.xml declares id="dada_fetcher"; confirm that the
     difference is intended. -->
<data_manager tool_file="data_manager/dada2_fetcher.xml" id="dada2_fetcher">
<!-- Taxonomy training sets: value, name, path and taxlevels columns. -->
<data_table name="dada2_taxonomy">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file">
<!-- The file named by the path column is moved to dada2/ below
     GALAXY_DATA_MANAGER_DATA_PATH; the stored column value is then rewritten
     to that location and passed through abspath. -->
<move type="file" relativize_symlinks="True">
<source>${path}</source>
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
<column name="taxlevels" />
</output>
</data_table>
<!-- Species assignment sets: same columns and move rules as dada2_taxonomy,
     but without the taxlevels column. -->
<data_table name="dada2_species">
<output>
<column name="value" />
<column name="name" />
<column name="path" output_ref="out_file">
<move type="file" relativize_symlinks="True">
<source>${path}</source>
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
</move>
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
<value_translation type="function">abspath</value_translation>
</column>
</output>
</data_table>
</data_manager>
</data_managers>
1 change: 1 addition & 0 deletions data_managers/data_manager_dada2/test-data/PR24.11.1_json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "PR2_4.11.1.taxonomy", "name": "Protist Ribosomal Reference database (PR2) 4.11.1", "value": "PR2_4.11.1", "taxlevels": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "RefSeq_RDP_2018_05.taxonomy", "name": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "value": "RefSeq_RDP_2018_05", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}}
9 changes: 9 additions & 0 deletions data_managers/data_manager_dada2/test-data/dada2_species.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a sample file distributed with Galaxy that is used to define a
# list of dada2 reference data sets for species assignment, using three
# tab separated columns:
#
# <unique_build_id> <display_name> <fasta_file_path>
#
# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
9 changes: 9 additions & 0 deletions data_managers/data_manager_dada2/test-data/dada2_taxonomy.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a sample file distributed with Galaxy that is used to define a
# list of dada2 reference data sets for taxonomy assignment, using four
# tab separated columns:
#
# <unique_build_id> <display_name> <fasta_file_path> <taxlevels>
#
# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
#
# taxlevels is a comma separated list of taxonomy levels
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "greengenes_13.84.taxonomy", "name": "GreenGenes version 13.84", "value": "greengenes_13.84", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}}
1 change: 1 addition & 0 deletions data_managers/data_manager_dada2/test-data/gtdb2018_json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "gtdb_2018_11.taxonomy", "name": "GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea) (11/2018)", "value": "gtdb_2018_11", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}}
1 change: 1 addition & 0 deletions data_managers/data_manager_dada2/test-data/hitdb1_json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data_tables": {"dada2_taxonomy": {"path": "hitdb_1.taxonomy", "name": "HitDB version 1 (Human InTestinal 16S rRNA)", "value": "hitdb_1", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}}
Loading