From aa1aebf67c30c91105435ea69693e20453930484 Mon Sep 17 00:00:00 2001 From: Kelsey Florek Date: Fri, 16 Apr 2021 09:46:28 -0500 Subject: [PATCH] Monroe ONT Updates (#47) * added wdl fastq parser * monroe ont redesign, dropped nanopolish updated to use medaka models --- staphb_toolkit/core/docker_config.json | 8 +- staphb_toolkit/core/wdl_fastq_input.py | 42 ++++++ staphb_toolkit/toolkit_workflows.py | 31 ++--- .../workflows/monroe/configs/docker.config | 5 +- .../monroe/configs/docker_containers.config | 3 +- .../monroe/configs/ont_user_config.config | 16 +-- .../monroe/configs/singularity.config | 5 +- staphb_toolkit/workflows/monroe/monroe.config | 3 +- .../workflows/monroe/monroe_ont_assembly.nf | 127 +++++++----------- 9 files changed, 110 insertions(+), 130 deletions(-) create mode 100644 staphb_toolkit/core/wdl_fastq_input.py diff --git a/staphb_toolkit/core/docker_config.json b/staphb_toolkit/core/docker_config.json index 974cd35d..f285d2ac 100644 --- a/staphb_toolkit/core/docker_config.json +++ b/staphb_toolkit/core/docker_config.json @@ -25,12 +25,8 @@ "image": "docker://staphb/ariba", "tag": "latest" }, - "artic-ncov2019-medaka": { - "image": "docker://staphb/artic-ncov2019-medaka", - "tag": "latest" - }, - "artic-ncov2019-nanopolish": { - "image": "docker://staphb/artic-ncov2019-nanopolish", + "artic-ncov2019": { + "image": "docker://staphb/artic-ncov2019", "tag": "latest" }, "augur": { diff --git a/staphb_toolkit/core/wdl_fastq_input.py b/staphb_toolkit/core/wdl_fastq_input.py new file mode 100644 index 00000000..bb74dbca --- /dev/null +++ b/staphb_toolkit/core/wdl_fastq_input.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +import re +import os,sys + +#search a directory and return a dictonary of paired fastq files named by fastq name +def pe_search(path): + if not os.path.isdir(path): + raise ValueError(path + " " + "not found.") + path = os.path.abspath(path) + fastqs = [] + fastq_dict = {} + for root,dirs,files in os.walk(path): + for file in files: + if '.fastq' in file or '.fastq.gz' in file or '.fq.gz' in file or '.fq' in file: + fastqs.append(file) + fastqs.sort() + if not len(fastqs)%2 == 0: + print("There is an uneven number of Fastq files in: '"+path+"'") + sys.exit(1) + + for readA, readB in zip(fastqs[0::2], fastqs[1::2]): + samplename = re.split("(_R1)|(_1)",readA)[0] + fastq_dict[samplename] = {"read_1":readA,"read_2":readB} + return(fastq_dict) + +#search a directory and return a dictonary of fastq files named by fastq name +def se_search(path): + if not os.path.isdir(path): + raise ValueError(path + " " + "not found.") + path = os.path.abspath(path) + fastqs = [] + fastq_dict = {} + for root,dirs,files in os.walk(path): + for file in files: + if '.fastq' in file or '.fastq.gz' in file or '.fq.gz' in file or '.fq' in file: + fastqs.append(file) + fastqs.sort() + for read in fastqs: + samplename = re.split("(.fastq)|(.fq)",read)[0] + fastq_dict[samplename] = read + return(fastq_dict) diff --git a/staphb_toolkit/toolkit_workflows.py b/staphb_toolkit/toolkit_workflows.py index 994e2221..acb2705d 100644 --- a/staphb_toolkit/toolkit_workflows.py +++ b/staphb_toolkit/toolkit_workflows.py @@ -73,13 +73,12 @@ def error(self, message): ##monroe_ont_assembly---------------------------- ##medaka nanopolish subparser_monroe_ont_assembly = monroe_subparsers.add_parser('ont_assembly',help='Assembly SARS-CoV-2 genomes from ONT read data generated from ARTIC amplicons', add_help=False) - subparser_monroe_ont_assembly.add_argument('fast5_path', type=str,help="path to the location of the reads in a fast5 format") - subparser_monroe_ont_assembly.add_argument('--fastq_path', type=str,help="path to the location of the reads in a fastq format, needed if not performing bascalling") - subparser_monroe_ont_assembly.add_argument('--polish_method',type=str,choices=["medaka","nanopolish"],help="polishing method, default: medaka",default="medaka") - subparser_monroe_ont_assembly.add_argument('--summary', type=str,help="path to the location of the sequencing summary, only needed when using nanopolish from fastq data") + subparser_monroe_ont_assembly.add_argument('--fast5_path', type=str,help="path to the location of the reads in a fast5 format") + subparser_monroe_ont_assembly.add_argument('--fastq_path', type=str,help="path to the location of the reads in a fastq format") + subparser_monroe_ont_assembly.add_argument('--demultiplexed', default="false",action="store_const",const="true",help="flag if fastq files have already been demultiplexed") + subparser_monroe_ont_assembly.add_argument('--medaka_model',default="r941_min_high_g360",help="medaka model to use for polishing, default: r941_min_fast_g303") subparser_monroe_ont_assembly.add_argument('--run_prefix', type=str,help="desired run prefix. Default \"artic_ncov19\"",default="artic_ncov19") - subparser_monroe_ont_assembly.add_argument('--ont_basecalling', default=False, action="store_true",help="perform high accuracy basecalling using GPU (only use if you have setup a GPU compatable device)") - subparser_monroe_ont_assembly.add_argument('--primers', type=str,choices=["V1", "V2", "V3"], help="indicate which ARTIC primers were used (V1, V2, or V3)",required=True) + subparser_monroe_ont_assembly.add_argument('--primers', type=str,choices=["V1", "V2", "V3"], help="indicate which ARTIC primers were used (V1, V2, or V3), default V3",default="V3") subparser_monroe_ont_assembly.add_argument('--config','-c', type=str,help="Nextflow custom configuration.") subparser_monroe_ont_assembly.add_argument('--output','-o',metavar="",type=str,help="Path to ouput directory, default \"monroe_results\".",default="monroe_results") subparser_monroe_ont_assembly.add_argument('--resume', default="", action="store_const",const="-resume",help="resume a previous run") @@ -359,24 +358,16 @@ def error(self, message): if args.monroe_command == 'ont_assembly': #build command - if args.ont_basecalling: - read_paths = f"--basecalling --fast5_dir {args.fast5_path}" - elif args.fast5_path and args.fastq_path: - read_paths = f"--fast5_dir {args.fast5_path} --fastq_dir {args.fastq_path}" + if args.fast5_path: + read_paths = f"--fast5_dir {args.fast5_path}" + elif args.fastq_path: + read_paths = f"--fastq_dir {args.fastq_path}" else: subparser_monroe_ont_assembly.print_help() - print("Please provide path to both fastq and fast5 or perform basecalling.") + print("Please provide path to fast5 files to perform basecalling or fastq files, note: basecalling requires GPU") sys.exit(1) - if args.polish_method == "nanopolish" and args.summary: - seq_summary = f"--sequencing_summary {args.summary}" - elif args.polish_method == "nanopolish" and not args.summary and not args.ont_basecalling: - print("Nanopolish requires a sequencing summary file generated during basecalling.") - sys.exit(1) - else: - seq_summary = "" - - command = nextflow_path + f" {config} run {monroe_path}/monroe_ont_assembly.nf {profile} {args.resume} {read_paths} --pipe ont {seq_summary} --polishing {args.polish_method} --primers {args.primers} --outdir {args.output} --run_prefix {args.run_prefix} -with-trace {args.output}/logs/{exec_time}Monroe_trace.txt -with-report {args.output}/logs/{exec_time}Monroe_execution_report.html {work}" + command = nextflow_path + f" {config} run {monroe_path}/monroe_ont_assembly.nf {profile} {args.resume} {read_paths} --pipe ont --primers {args.primers} --outdir {args.output} --run_prefix {args.run_prefix} --medaka_model {args.medaka_model} --demultiplexed {args.demultiplexed} -with-trace {args.output}/logs/{exec_time}Monroe_trace.txt -with-report {args.output}/logs/{exec_time}Monroe_execution_report.html {work}" #run command using nextflow in a subprocess print("Starting the Monroe ONT assembly:") child = pexpect.spawn(command) diff --git a/staphb_toolkit/workflows/monroe/configs/docker.config b/staphb_toolkit/workflows/monroe/configs/docker.config index b5995741..26f8c185 100644 --- a/staphb_toolkit/workflows/monroe/configs/docker.config +++ b/staphb_toolkit/workflows/monroe/configs/docker.config @@ -17,12 +17,9 @@ process { stageInMode = 'copy' } withName:artic_guppyplex{ - container = artic_nanopolish_container + container = artic_medaka_container stageInMode = 'copy' } - withName:artic_nanopolish_pipeline{ - container = artic_nanopolish_container - } withName:artic_medaka_pipeline{ container = artic_medaka_container } diff --git a/staphb_toolkit/workflows/monroe/configs/docker_containers.config b/staphb_toolkit/workflows/monroe/configs/docker_containers.config index e7d6ed34..87c77337 100644 --- a/staphb_toolkit/workflows/monroe/configs/docker_containers.config +++ b/staphb_toolkit/workflows/monroe/configs/docker_containers.config @@ -10,7 +10,6 @@ trimmomatic_container = 'staphb/trimmomatic:0.39' bbtools_container = 'staphb/bbtools:38.76' guppy_gpu_container = "genomicpariscentre/guppy-gpu" guppy_cpu_container = "genomicpariscentre/guppy" -artic_nanopolish_container = "staphb/artic-ncov2019-nanopolish:1.1.0" -artic_medaka_container = "staphb/artic-ncov2019-medaka:1.1.0" +artic_medaka_container = "staphb/artic-ncov2019:1.3.0" pangolin_container = "staphb/pangolin:2.3.0-pangolearn-2021-02-18" theiagen_artic_medaka_container = "theiagen/artic-ncov2019:1.1.3" diff --git a/staphb_toolkit/workflows/monroe/configs/ont_user_config.config b/staphb_toolkit/workflows/monroe/configs/ont_user_config.config index 295402fb..ff40c1c3 100644 --- a/staphb_toolkit/workflows/monroe/configs/ont_user_config.config +++ b/staphb_toolkit/workflows/monroe/configs/ont_user_config.config @@ -43,10 +43,9 @@ guppy_gpu_container = "genomicpariscentre/guppy-gpu" guppy_cpu_container = "genomicpariscentre/guppy" -artic_nanopolish_container = "staphb/artic-ncov2019-nanopolish:1.1.0" -artic_medaka_container = "staphb/artic-ncov2019-medaka:1.1.0" +artic_medaka_container = "staphb/artic-ncov2019:1.3.0" pangolin_container = "staphb/pangolin:2.3.0-pangolearn-2021-02-18" -samtools_container = 'staphb/samtools:1.10' +samtools_container = "staphb/samtools:1.10" //##################### //###Pipeline Params### @@ -68,9 +67,9 @@ params.chunk_size = 500 params.min_length = 400 params.max_length = 700 -//ARTIC Nanopolish/Medaka Pipeline Parameters -params.polishing = "medaka" // polishing mode "nanopolish" or "medaka" +//ARTIC Medaka Pipeline Parameters params.normalise = 200 +params.medaka_model = "r941_min_high_g360" process { @@ -91,14 +90,9 @@ process { withName:artic_guppyplex{ cpus = 8 memory = '16 GB' - container = artic_nanopolish_container + container = artic_medaka_container stageInMode = 'copy' } - withName:artic_nanopolish_pipeline{ - cpus = 8 - memory = '16 GB' - container = artic_nanopolish_container - } withName:artic_medaka_pipeline{ cpus = 8 memory = '16 GB' diff --git a/staphb_toolkit/workflows/monroe/configs/singularity.config b/staphb_toolkit/workflows/monroe/configs/singularity.config index 6aebbb5d..109c9679 100644 --- a/staphb_toolkit/workflows/monroe/configs/singularity.config +++ b/staphb_toolkit/workflows/monroe/configs/singularity.config @@ -12,10 +12,7 @@ process { container = guppy_cpu_container } withName:artic_guppyplex{ - container = artic_nanopolish_container - } - withName:artic_nanopolish_pipeline{ - container = artic_nanopolish_container + container = artic_medaka_container } withName:artic_medaka_pipeline{ container = artic_medaka_container diff --git a/staphb_toolkit/workflows/monroe/monroe.config b/staphb_toolkit/workflows/monroe/monroe.config index 8ec1a8cd..10b5917a 100644 --- a/staphb_toolkit/workflows/monroe/monroe.config +++ b/staphb_toolkit/workflows/monroe/monroe.config @@ -39,8 +39,7 @@ params.chunk_size = 500 params.min_length = 400 params.max_length = 700 -//ARTIC Nanopolish/Medaka Pipeline Parameters -params.polishing = "medaka" // polishing mode "nanopolish" or "medaka" +//ARTIC Medaka Pipeline Parameters params.normalise = 200 //####################### diff --git a/staphb_toolkit/workflows/monroe/monroe_ont_assembly.nf b/staphb_toolkit/workflows/monroe/monroe_ont_assembly.nf index 034c13f8..71b354b9 100644 --- a/staphb_toolkit/workflows/monroe/monroe_ont_assembly.nf +++ b/staphb_toolkit/workflows/monroe/monroe_ont_assembly.nf @@ -8,20 +8,15 @@ //starting parameters params.fast5_dir = "" params.fastq_dir = "" -params.sequencing_summary = "" params.outdir = "" -params.primers = "" +params.primers = "V3" params.run_prefix = "artic_ncov19" params.pipe = "" - -// Input channels -Channel - .value( "${params.primers}") - .ifEmpty { exit 1, "Primers used must be included." } - .set { polish_primers } +params.demultiplexed = false +params.medaka_model = "r941_min_high_g360" // If we have fast5 files then start with basecalling -if(params.basecalling){ +if(params.fast5_dir){ Channel .fromPath( "${params.fast5_dir}") .ifEmpty { exit 1, "Cannot find any fast5 files in: ${params.fast5_dir} Path must not end with /" } @@ -50,38 +45,36 @@ if(params.basecalling){ // If we have already basecalled get fastqs and fast5s for polishing else { - Channel - .fromPath( "${params.fastq_dir}/*.fastq*") - .ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" } - .set { fastq_reads } - - Channel - .fromPath( "${params.fast5_dir}") - .ifEmpty { exit 1, "Cannot find any fast5 files in: ${params.fast5_dir} Path must not end with /" } - .set { polish_fast5 } - - if(params.polishing == "nanopolish"){ + if(params.demultiplexed){ + Channel + .fromPath( "${params.fastq_dir}/barcode*",type:'dir') + .ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" } + .set { demultiplexed_reads } + } + else{ Channel - .fromPath( "${params.sequencing_summary}") - .ifEmpty { exit 1, "Cannot find sequencing summary in: ${params.sequencing_summary}" } - .set { sequencing_summary } - } + .fromPath( "${params.fastq_dir}/*.fastq*") + .ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" } + .set { fastq_reads } + } } -// Demultiplex fastqs -process guppy_demultiplexing { - publishDir "${params.outdir}/demultiplexing", mode: 'copy' +if(! params.demultiplexed){ + // Demultiplex fastqs + process guppy_demultiplexing { + publishDir "${params.outdir}/demultiplexing", mode: 'copy' - input: - file(fastqs) from fastq_reads.collect() + input: + file(fastqs) from fastq_reads.collect() - output: - path("barcode*",type:'dir') into demultiplexed_reads + output: + path("barcode*",type:'dir') into demultiplexed_reads - script: - """ - guppy_barcoder -t ${task.cpus} --require_barcodes_both_ends -i . -s . ${params.demultiplexing_params} -q 0 -r - """ + script: + """ + guppy_barcoder -t ${task.cpus} --require_barcodes_both_ends -i . -s . ${params.demultiplexing_params} -q 0 -r + """ + } } // Run artic gupplyplex @@ -105,58 +98,30 @@ process artic_guppyplex { """ } -// Run artic pipeline using nanopolish -if(params.polishing=="nanopolish"){ - process artic_nanopolish_pipeline { - publishDir "${params.outdir}/pipeline_nanopolish", mode: 'copy' - errorStrategy 'ignore' - - input: - val primers from polish_primers - tuple file(fastq), path(fast5path), file(sequencing_summary) from polish_files .combine(polish_fast5) .combine(sequencing_summary) - - output: - file *.primrtrimmed.bam into alignment_file - file "*{.primertrimmed.bam,.vcf,.variants.tab}" - file "*.consensus.fasta" into consensus_fasta - - script: - """ - # get samplename by dropping file extension - filename=${fastq} - samplename=\${filename%.*} - - artic minion --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /artic-ncov2019/primer_schemes --fast5-directory ${fast5path} --sequencing-summary ${sequencing_summary} --read-file ${fastq} nCoV-2019/${primers} \$samplename - """ - } -} - // Run artic pipeline using medaka -else { - process artic_medaka_pipeline { - publishDir "${params.outdir}/pipeline_medaka", mode: 'copy' - publishDir "${params.outdir}/assemblies", mode: 'copy', pattern: '*.fasta' - errorStrategy 'ignore' +process artic_medaka_pipeline { + publishDir "${params.outdir}/pipeline_medaka", mode: 'copy' + publishDir "${params.outdir}/assemblies", mode: 'copy', pattern: '*.fasta' + errorStrategy 'ignore' - input: - val primers from polish_primers - file(fastq) from polish_files + input: + file(fastq) from polish_files - output: - file "*.primertrimmed.rg.sorted.bam" into alignment_file - file "*{.primertrimmed.rg,.primers.vcf,.vcf.gz,.trimmed.rg,.fail.vcf}*" - file "*.consensus.fasta" into consensus_fasta + output: + file "*.primertrimmed.rg.sorted.bam" into alignment_file + file "*{.primertrimmed.rg,.primers.vcf,.vcf.gz,.trimmed.rg,.fail.vcf}*" + file "*.consensus.fasta" into consensus_fasta - script: - """ - # get samplename by dropping file extension - filename=${fastq} - samplename=\${filename%.*} + script: + """ + # get samplename by dropping file extension + filename=${fastq} + samplename=\${filename%.*} - artic minion --medaka --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /artic-ncov2019/primer_schemes --read-file ${fastq} nCoV-2019/${primers} \$samplename - """ - } + artic minion --medaka --medaka-model ${params.medaka_model} --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /fieldbioinformatics/test-data/primer-schemes --read-file ${fastq} nCoV-2019/${params.primers} \$samplename + """ } + //QC of read data process samtools { tag "$name"