diff --git a/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan-ue.smk b/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan-ue.smk new file mode 100644 index 0000000..f077e98 --- /dev/null +++ b/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan-ue.smk @@ -0,0 +1,505 @@ +import os + +localrules: + ReadCounts, SplitFasta, MergeSam, UpdateNCBI, TaxonomyFiltered, TaxonomyUnfiltered + +configfile: "config.yaml" + +SAMPLES = config['samplenames'] +CHUNKS = [str(i) for i in list(range(0,config['diamond']['chunks']))] +CWD = os.getcwd() + +rule all: + input: + expand(os.path.join(CWD, "5-r2c", "{sample}.{type}.reads.txt"), sample = SAMPLES, + type = ["EC", "EGGNOG", "INTERPRO2GO", "SEED", "KEGG"]), + + expand(os.path.join(CWD, "6-c2c", "{sample}.{type}.counts.txt"), sample = SAMPLES, + type = ["EC", "EGGNOG", "INTERPRO2GO", "SEED", "KEGG"]), + + expand(os.path.join(CWD, "5-r2c", "{sample}.{type}.reads.{filt}.txt"), sample = SAMPLES, + type = ["NCBI", "GTDB"], filt = ["unfiltered", "filtered"]), + + expand(os.path.join(CWD, "6-c2c", "{sample}.{type}.counts.{filt}.txt"), sample = SAMPLES, + type = ["NCBI", "GTDB"], filt = ["unfiltered", "filtered"]), + + expand(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.filtered.txt"), sample = SAMPLES), + expand(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.filtered.txt"), sample = SAMPLES), + expand(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.unfiltered.txt"), sample = SAMPLES), + expand(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.unfiltered.txt"), sample = SAMPLES) + + +rule ReadCounts: + input: + os.path.join(CWD, "inputs", "{sample}.fasta") + output: + os.path.join(CWD, "4-rma", "{sample}.readcounts.txt") + threads: 2 + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.ReadCounts.tsv") + shell: + "grep -c '>' {input} > {output}" + + +################################################## +# Diamond prep and run + +rule SplitFasta: + input: + os.path.join(CWD, "inputs", "{sample}.fasta") + output: + temp(expand(os.path.join(CWD, "1-chunks", "{{sample}}.fasta_chunk_000000{piece}"), piece = CHUNKS)) + conda: + "envs/exonerate.yml" + threads: 2 + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.SplitFasta.tsv") + params: + chunks = config['diamond']['chunks'], + outdir = os.path.join(CWD, "1-chunks", "") + shell: + "fastasplit -c {params.chunks} -f {input} -o {params.outdir}" + +rule RunDiamond: + input: + os.path.join(CWD, "1-chunks", "{sample}.fasta_chunk_000000{piece}") + output: + temp(os.path.join(CWD, "2-diamond", "{sample}.{piece}.sam")) + conda: + "envs/diamond.yml" + threads: + config['diamond']['threads'] + params: + db = config['diamond']['db'], + block = config['diamond']['block_size'], + hits = config['diamond']['hit_limit'] + log: + os.path.join(CWD, "logs", "{sample}.{piece}.RunDiamond.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.{piece}.RunDiamond.tsv") + shell: + "diamond blastx -d {params.db} -q {input} -o {output} -f 101 -F 5000 " + "--range-culling {params.hits} -b {params.block} -p {threads} 2> {log}" + +################################################## +# MEGAN RMA prep and run + +rule MergeSam: + input: + expand(os.path.join(CWD, "2-diamond", "{{sample}}.{piece}.sam"), piece = CHUNKS) + output: + os.path.join(CWD, "3-merged", "{sample}.merged.sam") + conda: + "envs/python.yml" + threads: 1 + log: + os.path.join(CWD, "logs", "{sample}.MergeSam.log") + benchmark: + os.path.join(CWD, 
"benchmarks", "{sample}.MergeSam.tsv") + shell: + "python scripts/sam-merger-screen-cigar.py -i {input} -o {output} -l {log}" + +rule MakeRMAfiltered: + input: + sam = os.path.join(CWD, "3-merged", "{sample}.merged.sam"), + reads = os.path.join(CWD, "inputs", "{sample}.fasta") + output: + expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + threads: + config['sam2rma']['threads'] + params: + sam2rma = config['sam2rma']['path'], + db = config['sam2rma']['db'], + ram = config['sam2rma']['readassignmentmode'], + ms = config['sam2rma']['minSupportPercent'] + log: + os.path.join(CWD, "logs", f"{{sample}}.MakeRMA.filtered.{config['sam2rma']['readassignmentmode']}.log") + benchmark: + os.path.join(CWD, "benchmarks", f"{{sample}}.MakeRMA.filtered.{config['sam2rma']['readassignmentmode']}.tsv") + shell: + "{params.sam2rma} -i {input.sam} -r {input.reads} -o {output} -lg -alg longReads " + "-t {threads} -mdb {params.db} -ram {params.ram} --minSupportPercent {params.ms} " + "-v 2> {log}" + +rule MakeRMAunfiltered: + input: + sam = os.path.join(CWD, "3-merged", "{sample}.merged.sam"), + reads = os.path.join(CWD, "inputs", "{sample}.fasta") + output: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + threads: + config['sam2rma']['threads'] + params: + sam2rma = config['sam2rma']['path'], + db = config['sam2rma']['db'], + ram = config['sam2rma']['readassignmentmode'] + log: + os.path.join(CWD, "logs", f"{{sample}}.MakeRMA.unfiltered.{config['sam2rma']['readassignmentmode']}.log") + benchmark: + os.path.join(CWD, "benchmarks", f"{{sample}}.MakeRMA.unfiltered.{config['sam2rma']['readassignmentmode']}.tsv") + shell: + "{params.sam2rma} -i {input.sam} -r {input.reads} -o {output} -lg -alg longReads " + "-t {threads} -mdb {params.db} -ram {params.ram} --minSupportPercent 0 " + "-v 2> {log}" + +################################################## +# MEGAN RMA summaries - Read classification + +rule RunR2CforEC: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.EC.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforEC.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforEC.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c EC &> {log}" + +rule RunR2CforEGGNOG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.EGGNOG.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforEGGNOG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforEGGNOG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c EGGNOG &> {log}" + +rule RunR2CforINTERPRO2GO: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.INTERPRO2GO.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforINTERPRO2GO.log") + benchmark: + 
os.path.join(CWD, "benchmarks", "{sample}.RunR2CforINTERPRO2GO.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c INTERPRO2GO &> {log}" + +rule RunR2CforSEED: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.SEED.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforSEED.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforSEED.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c SEED &> {log}" + +rule RunR2CforKEGG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.KEGG.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforKEGG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforKEGG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c KEGG &> {log}" + +rule RunR2CforNCBIfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.NCBI.reads.filtered.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforNCBIfiltered.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforNCBIfiltered.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c Taxonomy -n &> {log}" + +rule RunR2CforNCBIunfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.NCBI.reads.unfiltered.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforNCBIunfiltered.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforNCBIunfiltered.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c Taxonomy -n &> {log}" + +rule RunR2CforGTDBfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.GTDB.reads.filtered.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforGTDBfiltered.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforGTDBfiltered.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c GTDB -n &> {log}" + +rule RunR2CforGTDBunfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.GTDB.reads.unfiltered.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforGTDBunfiltered.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforGTDBunfiltered.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c GTDB -n &> {log}" + + + 
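+# The RunR2Cfor* rules above export per-read class assignments (rma2info -r2c);
+# the RunC2Cfor* rules below export aggregated per-class read counts (rma2info -c2c)
+# from the same unfiltered/filtered RMA files.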
+################################################## +# MEGAN RMA summaries - Class counts + +rule RunC2CforEC: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.EC.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforEC.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforEC.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c EC -p &> {log}" + +rule RunC2CforEGGNOG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.EGGNOG.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforEGGNOG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforEGGNOG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c EGGNOG -p &> {log}" + +rule RunC2CforINTERPRO2GO: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.INTERPRO2GO.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforINTERPRO2GO.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforINTERPRO2GO.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c INTERPRO2GO -p &> {log}" + +rule RunC2CforSEED: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.SEED.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforSEED.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforSEED.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c SEED -p &> {log}" + +rule RunC2CforKEGG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.KEGG.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforKEGG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforKEGG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c KEGG -p &> {log}" + +rule RunC2CforGTDBfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.GTDB.counts.filtered.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforGTDBfiltered.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforGTDBfiltered.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c GTDB -n &> {log}" + +rule RunC2CforGTDBunfiltered: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = 
config['sam2rma']['readassignmentmode'])
+    output:
+        os.path.join(CWD, "6-c2c", "{sample}.GTDB.counts.unfiltered.txt")
+    threads:
+        config['rma2info']['threads']
+    params:
+        rma2info = config['rma2info']['path']
+    log:
+        os.path.join(CWD, "logs", "{sample}.RunC2CforGTDBunfiltered.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.RunC2CforGTDBunfiltered.tsv")
+    shell:
+        "{params.rma2info} -i {input} -o {output} -c2c GTDB -n &> {log}"
+
+rule RunC2CforNCBIfiltered:
+    input:
+        expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
+    output:
+        os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.filtered.txt")
+    threads:
+        config['rma2info']['threads']
+    params:
+        rma2info = config['rma2info']['path']
+    log:
+        os.path.join(CWD, "logs", "{sample}.RunC2CforNCBIfiltered.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.RunC2CforNCBIfiltered.tsv")
+    shell:
+        "{params.rma2info} -i {input} -o {output} -c2c Taxonomy -n -r &> {log}"
+
+rule RunC2CforNCBIunfiltered:
+    input:
+        expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
+    output:
+        os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.unfiltered.txt")
+    threads:
+        config['rma2info']['threads']
+    params:
+        rma2info = config['rma2info']['path']
+    log:
+        os.path.join(CWD, "logs", "{sample}.RunC2CforNCBIunfiltered.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.RunC2CforNCBIunfiltered.tsv")
+    shell:
+        "{params.rma2info} -i {input} -o {output} -c2c Taxonomy -n -r &> {log}"
+
+##################################################
+# Make taxonomic reports
+
+rule UpdateNCBI:
+    input:
+        expand(os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.unfiltered.txt"), sample = SAMPLES)
+    output:
+        dummy = os.path.join(CWD, "7-kraken-mpa-reports", "NCBI-updated.txt"),
+        taxdump = temp(os.path.join(CWD, "taxdump.tar.gz"))
+    conda:
+        "envs/python.yml"
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "UpdateNCBI.prot.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "UpdateNCBI.prot.tsv")
+    shell:
+        "python scripts/Run_ete3_NCBI_update.py -i {input} -o {output.dummy} &> {log}"
+
+rule TaxonomyUnfiltered:
+    input:
+        c2c = os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.unfiltered.txt"),
+        ncbiready = os.path.join(CWD, "7-kraken-mpa-reports", "NCBI-updated.txt"),
+        taxdump = os.path.join(CWD, "taxdump.tar.gz"),
+        readcount = os.path.join(CWD, "4-rma", "{sample}.readcounts.txt")
+    output:
+        inter1 = temp(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.unfiltered.taxnames.txt")),
+        inter2 = temp(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.unfiltered.taxids.txt")),
+        kreport = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.unfiltered.txt"),
+        mpa = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.unfiltered.txt")
+    conda:
+        "envs/python.yml"
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "{sample}.TaxonomyUnfiltered.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.TaxonomyUnfiltered.tsv")
+    shell:
+        "python scripts/Convert_MEGAN_RMA_NCBI_c2c-snake.py -i {input.c2c} -o1 {output.inter1} "
+        "-o2 {output.inter2} -m {output.mpa} -k {output.kreport} -r {input.readcount} &> {log}"
+
+rule TaxonomyFiltered:
+    input:
+        c2c = os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.filtered.txt"),
+        ncbiready = os.path.join(CWD, "7-kraken-mpa-reports", "NCBI-updated.txt"),
+        taxdump = os.path.join(CWD, "taxdump.tar.gz"),
+        readcount = os.path.join(CWD, "4-rma", "{sample}.readcounts.txt")
+    output:
+        inter1 = temp(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.filtered.taxnames.txt")),
+        inter2 = temp(os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.filtered.taxids.txt")),
+        kreport = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.filtered.txt"),
+        mpa = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.filtered.txt")
+    conda:
+        "envs/python.yml"
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "{sample}.TaxonomyFiltered.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.TaxonomyFiltered.tsv")
+    shell:
+        "python scripts/Convert_MEGAN_RMA_NCBI_c2c-snake.py -i {input.c2c} -o1 {output.inter1} "
+        "-o2 {output.inter2} -m {output.mpa} -k {output.kreport} -r {input.readcount} &> {log}"
diff --git a/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan b/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan.smk
similarity index 93%
rename from Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan
rename to Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan.smk
index 8f56fec..e42d209 100644
--- a/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan
+++ b/Taxonomic-Profiling-Diamond-Megan/Snakefile-diamond-megan.smk
@@ -34,8 +34,6 @@ rule ReadCounts:
         os.path.join(CWD, "inputs", "{sample}.fasta")
     output:
         os.path.join(CWD, "4-rma", "{sample}.readcounts.txt")
-    conda:
-        "envs/general.yml"
     threads: 2
     benchmark:
         os.path.join(CWD, "benchmarks", "{sample}.ReadCounts.tsv")
@@ -52,7 +50,7 @@ rule SplitFasta:
     output:
         temp(expand(os.path.join(CWD, "1-chunks", "{{sample}}.fasta_chunk_000000{piece}"), piece = CHUNKS))
     conda:
-        "envs/general.yml"
+        "envs/exonerate.yml"
     threads: 2
     benchmark:
         os.path.join(CWD, "benchmarks", "{sample}.SplitFasta.tsv")
@@ -68,7 +66,7 @@ rule RunDiamond:
     output:
         temp(os.path.join(CWD, "2-diamond", "{sample}.{piece}.sam"))
     conda:
-        "envs/general.yml"
+        "envs/diamond.yml"
     threads:
         config['diamond']['threads']
     params:
@@ -92,7 +90,7 @@ rule MergeSam:
     output:
         os.path.join(CWD, "3-merged", "{sample}.merged.sam")
     conda:
-        "envs/general.yml"
+        "envs/python.yml"
     threads: 1
     log:
         os.path.join(CWD, "logs", "{sample}.MergeSam.log")
@@ -107,8 +105,6 @@ rule MakeRMAfiltered:
         reads = os.path.join(CWD, "inputs", "{sample}.fasta")
     output:
         expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
-    conda:
-        "envs/general.yml"
     threads:
         config['sam2rma']['threads']
     params:
@@ -131,8 +127,6 @@ rule MakeRMAunfiltered:
         reads = os.path.join(CWD, "inputs", "{sample}.fasta")
     output:
         expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
-    conda:
-        "envs/general.yml"
     threads:
         config['sam2rma']['threads']
     params:
@@ -156,8 +150,6 @@ rule RunR2CforEC:
         expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
     output:
         os.path.join(CWD, "5-r2c", "{sample}.EC.reads.txt")
-    conda:
-        "envs/general.yml"
     threads:
         config['rma2info']['threads']
     params:
@@ -174,8 +166,6 @@ rule RunR2CforEGGNOG:
         expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode'])
     output:
         os.path.join(CWD, "5-r2c", "{sample}.EGGNOG.reads.txt")
-    conda:
-        "envs/general.yml"
     threads:
         config['rma2info']['threads']
     params:
@@ -192,8 +182,6 @@ rule 
RunR2CforINTERPRO2GO: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.INTERPRO2GO.reads.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -210,8 +198,6 @@ rule RunR2CforSEED: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.SEED.reads.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -223,13 +209,27 @@ rule RunR2CforSEED: shell: "{params.rma2info} -i {input} -o {output} -r2c SEED &> {log}" +rule RunR2CforKEGG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "5-r2c", "{sample}.KEGG.reads.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunR2CforKEGG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunR2CforKEGG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -r2c KEGG &> {log}" + rule RunR2CforNCBIfiltered: input: expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.NCBI.reads.filtered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -246,8 +246,6 @@ rule RunR2CforNCBIunfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.NCBI.reads.unfiltered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -264,8 +262,6 @@ rule RunR2CforGTDBfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.GTDB.reads.filtered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -282,8 +278,6 @@ rule RunR2CforGTDBunfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "5-r2c", "{sample}.GTDB.reads.unfiltered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -300,15 +294,11 @@ rule RunR2CforGTDBunfiltered: ################################################## # MEGAN RMA summaries - Class counts - - rule RunC2CforEC: input: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.EC.counts.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -325,8 +315,6 @@ rule RunC2CforEGGNOG: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.EGGNOG.counts.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -343,8 +331,6 @@ rule RunC2CforINTERPRO2GO: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.INTERPRO2GO.counts.txt") - conda: - "envs/general.yml" 
threads: config['rma2info']['threads'] params: @@ -361,8 +347,6 @@ rule RunC2CforSEED: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.SEED.counts.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -374,13 +358,27 @@ rule RunC2CforSEED: shell: "{params.rma2info} -i {input} -o {output} -c2c SEED -p &> {log}" +rule RunC2CforKEGG: + input: + expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) + output: + os.path.join(CWD, "6-c2c", "{sample}.KEGG.counts.txt") + threads: + config['rma2info']['threads'] + params: + rma2info = config['rma2info']['path'] + log: + os.path.join(CWD, "logs", "{sample}.RunC2CforKEGG.log") + benchmark: + os.path.join(CWD, "benchmarks", "{sample}.RunC2CforKEGG.tsv") + shell: + "{params.rma2info} -i {input} -o {output} -c2c KEGG -p &> {log}" + rule RunC2CforGTDBfiltered: input: expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.GTDB.counts.filtered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -397,8 +395,6 @@ rule RunC2CforGTDBunfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.GTDB.counts.unfiltered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -415,8 +411,6 @@ rule RunC2CforNCBIfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_filtered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.filtered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -433,8 +427,6 @@ rule RunC2CforNCBIunfiltered: expand(os.path.join(CWD, "4-rma", "{{sample}}_unfiltered.protein.{mode}.rma"), mode = config['sam2rma']['readassignmentmode']) output: os.path.join(CWD, "6-c2c", "{sample}.NCBI.counts.unfiltered.txt") - conda: - "envs/general.yml" threads: config['rma2info']['threads'] params: @@ -456,7 +448,7 @@ rule UpdateNCBI: dummy = os.path.join(CWD, "7-kraken-mpa-reports", "NCBI-updated.txt"), taxdump = temp(os.path.join(CWD, "taxdump.tar.gz")) conda: - "envs/general.yml" + "envs/python.yml" threads: 1 log: @@ -478,7 +470,7 @@ rule TaxonomyUnfiltered: kreport = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.unfiltered.txt"), mpa = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.unfiltered.txt") conda: - "envs/general.yml" + "envs/python.yml" threads: 1 log: @@ -501,7 +493,7 @@ rule TaxonomyFiltered: kreport = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.kreport.filtered.txt"), mpa = os.path.join(CWD, "7-kraken-mpa-reports", "{sample}.diamond_megan.mpa.filtered.txt") conda: - "envs/general.yml" + "envs/python.yml" threads: 1 log: diff --git a/Taxonomic-Profiling-Diamond-Megan/envs/diamond.yml b/Taxonomic-Profiling-Diamond-Megan/envs/diamond.yml new file mode 100644 index 0000000..c58139d --- /dev/null +++ b/Taxonomic-Profiling-Diamond-Megan/envs/diamond.yml @@ -0,0 +1,7 @@ +name: diamond_env +channels: +- bioconda +- conda-forge +- defaults +dependencies: +- diamond >= 2.0.8 \ No newline at end of file diff --git 
a/Taxonomic-Profiling-Diamond-Megan/envs/exonerate.yml b/Taxonomic-Profiling-Diamond-Megan/envs/exonerate.yml new file mode 100644 index 0000000..143f01a --- /dev/null +++ b/Taxonomic-Profiling-Diamond-Megan/envs/exonerate.yml @@ -0,0 +1,7 @@ +name: exonerate_env +channels: +- bioconda +- conda-forge +- defaults +dependencies: +- exonerate == 2.4.0 \ No newline at end of file diff --git a/Taxonomic-Profiling-Diamond-Megan/envs/general.yml b/Taxonomic-Profiling-Diamond-Megan/envs/python.yml similarity index 67% rename from Taxonomic-Profiling-Diamond-Megan/envs/general.yml rename to Taxonomic-Profiling-Diamond-Megan/envs/python.yml index e74a4c7..2f00afd 100644 --- a/Taxonomic-Profiling-Diamond-Megan/envs/general.yml +++ b/Taxonomic-Profiling-Diamond-Megan/envs/python.yml @@ -1,11 +1,9 @@ -name: general_env +name: python_env channels: - bioconda - conda-forge - defaults dependencies: -- exonerate == 2.4.0 -- diamond >= 2.0.8 - python == 3.7 - pandas - numpy diff --git a/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md b/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md index 91cdcfa..8c1b881 100644 --- a/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md +++ b/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md @@ -18,7 +18,7 @@ The purpose of this snakemake workflow is to perform translation alignment of HiFi reads to a protein database, and summarize the resulting alignments using MEGAN-LR. This allows interactive taxonomic and functional analyses to be performed across samples. -This workflow will output absolute read counts and read-based classifications of the EC, EGGNOG, INTERPRO2GO, and SEED functional classes across all samples, along with read counts and read-based classifications of the NCBI and GTDB taxonomy classes. The NCBI taxonomic counts are now also provided in kraken report (kreport) and metaphlan (mpa) output formats. +This workflow will output absolute read counts and read-based classifications of the EC, EGGNOG, INTERPRO2GO, and SEED functional classes across all samples, along with read counts and read-based classifications of the NCBI and GTDB taxonomy classes. The NCBI taxonomic counts are now also provided in kraken report (kreport) and metaphlan (mpa) output formats. The KEGG annotations can be obtained by using MEGAN-UE. The general steps of the Taxonomic-Profiling-Diamond-Megan pipeline are shown below: @@ -39,8 +39,8 @@ This pipeline performed favorably in a recent benchmarking study evaluating seve This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html) and [Snakemake](https://snakemake.readthedocs.io/en/stable/) to be installed, and will require ~60GB memory and 40-200GB disk space per sample (see [Requirements section](#RFR)). All dependencies in the workflow are installed using conda and the environments are activated by snakemake for relevant steps. Snakemake v5+ is required, and the workflows have been tested using v5.19.3. - Clone the Taxonomic-Profiling-Diamond-Megan directory. -- Download MEGAN6 community edition from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html) to obtain `sam2rma`and `rma2info`. **Ensure you have at least version 6.19.+** -- Download and unpack the newest MEGAN mapping file for NCBI-nr accessions from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). 
+- Download MEGAN6 community edition (or ultimate edition) from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html) to obtain `sam2rma` and `rma2info`. **Ensure you have at least version 6.19.+**
+- Download and unpack the newest MEGAN mapping file for NCBI-nr accessions from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). The mapping file should match the MEGAN edition (community or ultimate) that you downloaded.
 - Download the NCBI-nr database from: ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz*
 - Index the NCBI-nr database with DIAMOND using `diamond makedb --in nr.gz --db diamond_nr_db --threads 24`. This will result in a `diamond_nr_db.dmnd` file that is ~150GB.
 - Include all input HiFi fasta files (`SAMPLE.fasta`) in the `inputs/` folder. These can be files or symlinks.
@@ -48,7 +48,7 @@ This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](h
 - Specify the full path to `sam2rma`, full path to `rma2info`, full path to the MEGAN database file , and full path to the indexed NCBI-nr database in `config.yaml`. Ensure the `hit_limit` argument (under `diamond`), `readassignmentmode` (under `sam2rma`), and `minSupportPercent` (under `sam2rma`) arguments are set correctly for your analysis.
 - Execute snakemake using the general commands below:
 ```
-snakemake --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml --use-conda [additional arguments for local/HPC execution]
+snakemake --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml --use-conda [additional arguments for local/HPC execution]
 ```
 
 The choice of additional arguments to include depends on where and how you choose to run snakemake. Please refer to the [Executing Snakemake](#EXS) section for more details.
@@ -77,11 +77,12 @@ Taxonomic-Profiling-Diamond-Megan
 │   ├── Run_ete3_NCBI_update.py
 │   └── sam-merger-screen-cigar.py
 │
-├── Snakefile-diamond-megan
+├── Snakefile-diamond-megan.smk
+├── Snakefile-diamond-megan-ue.smk
 │
 └── config.yaml
 ```
-The `Snakefile-diamond-megan` file is the Snakefile for this snakemake workflow. It contains all of the rules of the workflow.
+The `Snakefile-diamond-megan.smk` and `Snakefile-diamond-megan-ue.smk` files are the Snakefiles for this snakemake workflow. They contain all of the rules of the workflow. One is for the MEGAN community edition, and the other for the MEGAN ultimate edition.
 
 The `config.yaml` file is the main configuration file used in the snakemake workflow. It contains the main settings that will be used for various programs.
 
@@ -118,9 +119,9 @@ If you intend to generate a graphic for the snakemake workflow graph, you will a
 
 ## Download MEGAN6 and MEGAN database file
 
-MEGAN6 community edition should be downloaded from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). The `sam2rma` and `rma2info` binaries are required for this pipeline. Both binaries should be present in the `tools` bin distributed with MEGAN. **The full path to `sam2rma` and `rma2info` must be specified in `config.yaml`.**
+MEGAN6 community edition or ultimate edition should be downloaded from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). The `sam2rma` and `rma2info` binaries are required for this pipeline. Both binaries should be present in the `tools` bin distributed with MEGAN. 
**The full path to `sam2rma` and `rma2info` must be specified in `config.yaml`.** The binaries will be slightly different between the community and ultimate editions. If you want to access KEGG annotations, use the ultimate edition. -The newest MEGAN mapping file for NCBI-nr accessions should also be downloaded from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). It must be unpacked to use it. The current unpacked file is ~7GB. **The full path to the unpacked database file (ending with `.db`) must be specified in `config.yaml`.** +The newest MEGAN mapping file for NCBI-nr accessions should also be downloaded from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html). It must be unpacked to use it. The current unpacked file is ~7GB. **The full path to the unpacked database file (ending with `.db`) must be specified in `config.yaml`.** Note that there is one file for the community edition, and one for the ultimate edition. If you want access to KEGG annotations, use the ultimate edition. ## Download and index the NCBI-nr database @@ -180,13 +181,13 @@ The workflow must be executed from within the directory containing all the snake ### Test workflow It is a good idea to test the workflow for errors before running it. This can be done with the following command: ``` -snakemake -np --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml +snakemake -np --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml ``` Let's unpack this command: - `snakemake` calls snakemake. - `-np` performs a 'dry-run' where the rule compatibilities are tested but they are not executed. -- `--snakefile Snakefile-diamond-megan` tells snakemake to run this particular snakefile. +- `--snakefile Snakefile-diamond-megan.smk` tells snakemake to run this particular snakefile. - `--configfile configs/Sample-Config.yaml` tells snakemake to include the samples listed in the sample configuration file. The dry run command should result in a sequence of jobs being displayed on screen. @@ -194,14 +195,14 @@ The dry run command should result in a sequence of jobs being displayed on scree ### Create workflow figure If there are no errors, you may wish to generate a figure of the directed acyclic graph (the workflow steps). You can do this using the following command: ``` -snakemake --dag --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml | dot -Tsvg > taxfunc_analysis.svg +snakemake --dag --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml | dot -Tsvg > taxfunc_analysis.svg ``` Here the `--dag` flag creates an output that is piped to `dot`, and an svg file is created. This will show the workflow visually. ### Execute workflow Finally, you can execute the workflow using: ``` -snakemake --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml -j 48 --use-conda +snakemake --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml -j 48 --use-conda ``` There are a couple important arguments that were added here: @@ -223,12 +224,12 @@ One easy way to run snakemake is to start an interactive session, and execute sn The same general commands are used as with "local" execution, but with some additional arguments to support cluster configuration. 
Below is an example of cluster configuration using SLURM: ``` -snakemake --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml --use-conda --cluster "sbatch --partition=compute --cpus-per-task={threads}" -j 5 --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 60 +snakemake --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml --use-conda --cluster "sbatch --partition=compute --cpus-per-task={threads}" -j 5 --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 60 ``` Let's unpack this command: - `snakemake` calls snakemake. -- `--snakefile Snakefile-diamond-megan` tells snakemake to run this particular snakefile. +- `--snakefile Snakefile-diamond-megan.smk` tells snakemake to run this particular snakefile. - `--configfile configs/Sample-Config.yaml` tells snakemake to use this sample configuration file in the `configs/` directory. This file can have any name, as long as that name is provided here. - `--use-conda` this allows conda to install the programs and environments required for each step. It is essential. - `--cluster "sbatch --partition=compute --cpus-per-task={threads}"` are the settings for execution with SLURM, where 'compute' is the name of the machine. The threads argument will be automatically filled based on threads assigned to each rule. Note that the entire section in quotes can be replaced with an SGE equivalent (see below). @@ -239,7 +240,7 @@ Let's unpack this command: And here is an example using SGE instead: ``` -snakemake --snakefile Snakefile-diamond-megan --configfile configs/Sample-Config.yaml --use-conda --cluster "qsub -q default -pe smp {threads} -V -cwd -S /bin/bash" -j 5 --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 60 +snakemake --snakefile Snakefile-diamond-megan.smk --configfile configs/Sample-Config.yaml --use-conda --cluster "qsub -q default -pe smp {threads} -V -cwd -S /bin/bash" -j 5 --jobname "{rule}.{wildcards}.{jobid}" --latency-wait 60 ``` - `--cluster "qsub -q default -pe smp {threads} -V -cwd -S /bin/bash"` are the settings for execution with SGE, where 'default' is the name of the machine. The threads argument will be automatically filled based on threads assigned to each rule. @@ -267,7 +268,8 @@ Taxonomic-Profiling-Diamond-Megan ├── envs/ ├── inputs/ ├── scripts/ -├── Snakefile-diamond-megan +├── Snakefile-diamond-megan.smk +├── Snakefile-diamond-megan-ue.smk ├── config.yaml │ ├── benchmarks/