-
Notifications
You must be signed in to change notification settings - Fork 441
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2483 from bernt-matthias/topic/dada2
dada2
Showing
79 changed files
with
5,722 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
name: data_manager_dada2 | ||
owner: iuc | ||
description: Data manager to download DADA2 reference databases | ||
homepage_url: https://benjjneb.github.io/dada2/index.html | ||
long_description: | | ||
"DADA2: Fast and accurate sample inference from amplicon data with single-nucleotide resolution" | ||
remote_repository_url: "https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_dada2" | ||
type: unrestricted | ||
categories: | ||
- Data Managers |
181 changes: 181 additions & 0 deletions
181
data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
<?xml version="1.0"?> | ||
<tool id="dada2_fetcher" name="dada2 data manager" tool_type="manage_data" version="0.0.7"> | ||
<description>Download reference databases</description> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python '$__tool_directory__/data_manager.py' | ||
--out '$out_file' | ||
#set dataset = str($db_cond.db_select) + '_' + str($db_cond.version_select) | ||
--dataset '$dataset' | ||
]]> | ||
</command> | ||
<inputs> | ||
<conditional name="db_cond"> | ||
<param name="db_select" type="select" label="Taxonomic database"> | ||
<option value="silva">Silva</option> | ||
<option value="rdp">RDP</option> | ||
<option value="greengenes">GreenGenes</option> | ||
<option value="unite">UNITE Fungi: General Fasta</option> | ||
<!-- UNITE Eukaryotes not yet supported https://github.com/benjjneb/dada2/issues/702 --> | ||
<option value="RefSeq_RDP">NCBI RefSeq 16S rRNA database supplemented by RDP</option> | ||
<option value="gtdb">GTDB: Genome Taxonomy Database (Bacteria &amp; Archaea)</option> | ||
<option value="hitdb">HitDB (Human InTestinal 16S)</option> | ||
<option value="silva_euk_18S">Silva Eukaryotic 18S</option> | ||
<option value="PR2">Protist Ribosomal Reference database (PR2)</option> | ||
</param> | ||
<when value="silva"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="132">132</option> | ||
<option value="128">128</option> | ||
</param> | ||
</when> | ||
<when value="rdp"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="16">16</option> | ||
<option value="14">14</option> | ||
</param> | ||
</when> | ||
<when value="greengenes"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="13.84">13.84</option> | ||
</param> | ||
</when> | ||
<when value="unite"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="8.0_fungi">release 8.0 for Fungi</option> | ||
<option value="8.0_fungi_singletons">release 8.0 for Fungi including global and 97% singletons</option> | ||
</param> | ||
</when> | ||
<when value="RefSeq_RDP"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="2018_05">05/2018</option> | ||
</param> | ||
</when> | ||
<when value="gtdb"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="2018_11">11/2018</option> | ||
</param> | ||
</when> | ||
<when value="hitdb"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="1">1</option> | ||
</param> | ||
</when> | ||
<when value="silva_euk_18S"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="132">132</option> | ||
</param> | ||
</when> | ||
<when value="PR2"> | ||
<param name="version_select" type="select" label="Database version"> | ||
<option value="4.11.1">4.11.1</option> | ||
</param> | ||
</when> | ||
</conditional> | ||
</inputs> | ||
<outputs> | ||
<data name="out_file" format="data_manager_json" /> | ||
</outputs> | ||
<tests> | ||
<test> | ||
<param name="db_cond|db_select" value="silva"/> | ||
<param name="db_cond|version_select" value="132"/> | ||
<output name="out_file" file="silva132_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="rdp"/> | ||
<param name="db_cond|version_select" value="16"/> | ||
<output name="out_file" file="rdp16_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="greengenes"/> | ||
<param name="db_cond|version_select" value="13.84"/> | ||
<output name="out_file" file="greengenes13.84_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="unite"/> | ||
<param name="db_cond|version_select" value="8.0_fungi"/> | ||
<output name="out_file" file="unite8fungi_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="unite"/> | ||
<param name="db_cond|version_select" value="8.0_fungi_singletons"/> | ||
<output name="out_file" file="unite8fungisingletons_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="RefSeq_RDP"/> | ||
<param name="db_cond|version_select" value="2018_05"/> | ||
<output name="out_file" file="RefSeq_RDP2018_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="gtdb"/> | ||
<param name="db_cond|version_select" value="2018_11"/> | ||
<output name="out_file" file="gtdb2018_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="hitdb"/> | ||
<param name="db_cond|version_select" value="1"/> | ||
<output name="out_file" file="hitdb1_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="silva_euk_18S"/> | ||
<param name="db_cond|version_select" value="132"/> | ||
<output name="out_file" file="silvaeuk132_json"/> | ||
</test> | ||
<test> | ||
<param name="db_cond|db_select" value="PR2"/> | ||
<param name="db_cond|version_select" value="4.11.1"/> | ||
<output name="out_file" file="PR24.11.1_json"/> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
Public Reference databases maintained by the DADA2 project | ||
.......................................................... | ||
The following reference databases, which are described as maintained by the DADA2 project (https://benjjneb.github.io/dada2/training.html), are available: | ||
- Silva (https://www.arb-silva.de/) | ||
- RDP (http://rdp.cme.msu.edu/) | ||
- GreenGenes (http://greengenes.secondgenome.com/) | ||
- UNITE general FASTA (https://unite.ut.ee/repository.php) | ||
While Silva and RDP contain reference databases for taxonomy and species assignment, the GreenGenes and UNITE databases only contain a reference database for taxonomy assignment. | ||
For the Silva databases check the license information: http://www.arb-silva.de/silva-license-information. | ||
Except for UNITE, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The UNITE databases are taken from the links provided on the UNITE website. | ||
More detailed information on the reference databases can be found on the DADA2 website and the links it contains: https://benjjneb.github.io/dada2/training.html. | ||
Further public Reference databases listed by the DADA2 project | ||
.............................................................. | ||
Several contributed reference databases are listed on the DADA2 project website (https://benjjneb.github.io/dada2/training.html): | ||
- RefSeq + RDP (NCBI RefSeq 16S rRNA database supplemented by RDP) | ||
- GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/) | ||
- HitDB version 1 (Human InTestinal 16S rRNA) (https://github.com/microbiome/HITdb) | ||
- RDP fungi LSU | ||
- Silva Eukaryotic 18S | ||
- PR2 (https://github.com/pr2database/pr2database) | ||
Except for PR2, all reference databases are downloaded from the corresponding zenodo links that are listed on the DADA2 website. The PR2 database is taken from their github page. | ||
More detailed information on the reference databases can be found on the DADA2 website and the links it contains: https://benjjneb.github.io/dada2/training.html. | ||
]]></help> | ||
<citations> | ||
<!-- silva --> | ||
<citation type="doi">10.1093/nar/gks1219</citation> | ||
<!-- rdp --> | ||
<citation type="doi">10.1093/nar/gkt1244</citation> | ||
<!-- greengenes --> | ||
<citation type="doi">10.1128/AEM.03006-05</citation> | ||
<!-- unite --> | ||
<citation type="doi">10.15156/BIO/786343</citation> | ||
<!-- TODO gtdb ??? --> | ||
<!-- hitdb --> | ||
<citation type="doi">10.1186/s12864-015-2265-y</citation> | ||
<!-- PR2 --> | ||
<citation type="doi">10.1093/nar/gks1160</citation> | ||
</citations> | ||
</tool> | ||
|
133 changes: 133 additions & 0 deletions
133
data_managers/data_manager_dada2/data_manager/data_manager.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import argparse | ||
import json | ||
import os | ||
try: | ||
# For Python 3.0 and later | ||
from urllib.request import Request, urlopen | ||
except ImportError: | ||
# Fall back to Python 2 imports | ||
from urllib2 import Request, urlopen | ||
|
||
DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" | ||
|
||
FILE2NAME = { | ||
"silva_132": "Silva version 132", | ||
"silva_128": "Silva version 128", | ||
"rdp_16": "RDP trainset 16", | ||
"rdp_14": "RDP trainset 14", | ||
"greengenes_13.84": "GreenGenes version 13.84", | ||
"unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", | ||
"unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", | ||
"RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", | ||
"gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", | ||
"hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", | ||
"silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", | ||
"PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" | ||
} | ||
|
||
FILE2TAXURL = { | ||
"silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", | ||
"silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", | ||
"rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", | ||
"rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", | ||
"unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", | ||
"unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", | ||
"greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", | ||
"RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", | ||
"gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", | ||
"hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", | ||
"silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", | ||
"PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" | ||
} | ||
|
||
FILE2SPECIESURL = { | ||
"silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", | ||
"silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", | ||
"rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", | ||
"rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" | ||
} | ||
|
||
FILE2TAXLEVELS = { | ||
"PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" | ||
} | ||
|
||
|
||
def url_download(url, fname, workdir):
    """
    Download ``url`` to ``workdir/fname``, creating ``workdir`` if needed.

    Downloads whose ``fname`` starts with "unite" get special treatment:
    they are zip archives, so the archive is extracted into ``workdir`` and
    the single top-level ``*fasta`` file is gzip-compressed into
    ``workdir/fname``, overwriting the downloaded zip.

    :param url: URL to fetch
    :param fname: name of the resulting file inside ``workdir``
    :param workdir: directory receiving the download (created if missing)
    :raises Exception: if a UNITE archive does not contain exactly one
        top-level fasta file
    """
    # imported locally, like the other modules only needed by this function
    import shutil

    file_path = os.path.join(workdir, fname)
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    src = None
    try:
        src = urlopen(Request(url))
        with open(file_path, 'wb') as dst:
            # streams the response in chunks; nothing is held fully in memory
            shutil.copyfileobj(src, dst)
    finally:
        # explicit close: urlopen() responses are not context managers on Python 2
        if src is not None:
            src.close()

    # special treatment of UNITE DBs: they are zip files containing two fasta
    # (xyz.fasta and developer/xyz.fasta)
    if fname.startswith("unite"):
        import glob
        import gzip
        import zipfile

        # unzip download; the context manager closes the archive even when
        # extraction fails
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(workdir)

        # gzip top level fasta file (the pattern does not descend into developer/)
        fastas = glob.glob("%s/*fasta" % workdir)
        if len(fastas) != 1:
            msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas))
            raise Exception(msg)
        with open(fastas[0], 'rb') as f_in:
            with gzip.open(file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
|
||
|
||
def remote_dataset(dataset, outjson):
    """
    Download the reference data for ``dataset`` and write the data manager JSON.

    ``outjson`` is read first: ``output_data[0]['extra_files_path']`` in it
    names the directory the files are downloaded to. It is then overwritten
    with the ``data_tables`` entries: always one for ``dada2_taxonomy`` and,
    when FILE2SPECIESURL has a URL for the data set, one for ``dada2_species``.

    :param dataset: key into FILE2TAXURL / FILE2NAME, e.g. "silva_132"
    :param outjson: path of the JSON file to read the parameters from and to
        write the result to
    :raises KeyError: if ``dataset`` is not a known data set
    """
    with open(outjson) as jf:
        params = json.load(jf)

    workdir = params['output_data'][0]['extra_files_path']
    # tolerate a pre-existing directory and missing parents
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    taxonomy_file = dataset + ".taxonomy"
    url_download(FILE2TAXURL[dataset], taxonomy_file, workdir)

    data_manager_json = {"data_tables": {}}
    data_manager_json["data_tables"]["dada2_taxonomy"] = {
        'value': dataset,
        'name': FILE2NAME[dataset],
        'path': taxonomy_file,
        # only data sets listed in FILE2TAXLEVELS deviate from the default ranks
        'taxlevels': FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS),
    }

    # a species assignment file exists only for some of the databases
    if FILE2SPECIESURL.get(dataset, False):
        species_file = dataset + ".species"
        url_download(FILE2SPECIESURL[dataset], species_file, workdir)
        data_manager_json["data_tables"]["dada2_species"] = {
            'value': dataset,
            'name': FILE2NAME[dataset],
            'path': species_file,
        }

    # open() instead of the Python 2-only file() builtin, so this also runs
    # on Python 3
    with open(outjson, 'w') as jf:
        jf.write(json.dumps(data_manager_json))
|
||
|
||
if __name__ == '__main__':
    # Command line entry point: parse the two options and hand them over to
    # remote_dataset(), which does the download and writes the JSON.
    cli = argparse.ArgumentParser(description='Create data manager json.')
    for flag, text in (('--out', 'JSON filename'),
                       ('--dataset', 'Download data set name')):
        cli.add_argument(flag, action='store', help=text)
    options = cli.parse_args()

    remote_dataset(options.dataset, options.out)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<?xml version="1.0"?> | ||
<data_managers> | ||
<data_manager tool_file="data_manager/dada2_fetcher.xml" id="dada2_fetcher"> | ||
<data_table name="dada2_taxonomy"> | ||
<output> | ||
<column name="value" /> | ||
<column name="name" /> | ||
<column name="path" output_ref="out_file"> | ||
<move type="file" relativize_symlinks="True"> | ||
<source>${path}</source> | ||
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target> | ||
</move> | ||
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation> | ||
<value_translation type="function">abspath</value_translation> | ||
</column> | ||
<column name="taxlevels" /> | ||
</output> | ||
</data_table> | ||
<data_table name="dada2_species"> | ||
<output> | ||
<column name="value" /> | ||
<column name="name" /> | ||
<column name="path" output_ref="out_file"> | ||
<move type="file" relativize_symlinks="True"> | ||
<source>${path}</source> | ||
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target> | ||
</move> | ||
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation> | ||
<value_translation type="function">abspath</value_translation> | ||
</column> | ||
</output> | ||
</data_table> | ||
</data_manager> | ||
</data_managers> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"data_tables": {"dada2_taxonomy": {"path": "PR2_4.11.1.taxonomy", "name": "Protist Ribosomal Reference database (PR2) 4.11.1", "value": "PR2_4.11.1", "taxlevels": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"data_tables": {"dada2_taxonomy": {"path": "RefSeq_RDP_2018_05.taxonomy", "name": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "value": "RefSeq_RDP_2018_05", "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species"}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# This is a sample file distributed with Galaxy that is used to define a | ||
# list of dada2 reference data sets for species assignment, using three | ||
# tab separated columns: | ||
# | ||
# <unique_build_id> <display_name> <fasta_file_path> | ||
# | ||
# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html | ||
Oops, something went wrong.