diff --git a/README.md b/README.md index 36675cd..2b32ac3 100644 --- a/README.md +++ b/README.md @@ -7,32 +7,36 @@ to obtain consensus sequences, HA and NA subtypes, clade calls, and amino acid m ## Analyses -- Read trimming & QC: `fastp` -- Primer removal with `cutadapt` -- FASTQ quality reporting with `FastQC` -- Aggregate the reports with `multiqc` -- Sequence analysis with `FluViewer` +- Read trimming & QC: [fastp](https://github.com/OpenGene/fastp) +- Primer removal with [cutadapt](https://github.com/marcelm/cutadapt) +- FASTQ quality reporting with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- Aggregate the reports with [MultiQC](https://multiqc.info/) +- Sequence analysis with [BCCDC-PHL/FluViewer](https://github.com/BCCDC-PHL/FluViewer) - Extract HPAI motif (applies to H5 sequences only) -- Clade calls for H1, H3 and H5 influenza using Nextclade (H1 and H3, custom Nextclade for H5 used at BCCDC-PHL) +- Clade calls for H1, H3 and H5 influenza using [Nextclade](https://github.com/nextstrain/nextclade) - Amino acid SNP calls against a specified reference -- Genotype calls using GenoFLU against curated database +- Genotype calls using GenoFLU against curated database ```mermaid flowchart TD - fastq(FASTQ Input) --> fastq_trimmed{Read Trimming: fastp} - fastq_trimmed --> primer_trimmed{Remove Primers: cutadapt} - fastq_trimmed --> MultiQC{MultiQC} - primer_trimmed --> FASTQC{FASTQC} - FASTQC --> MultiQC - primer_trimmed --> MultiQC --> MultiQC_report[MultiQC Report] - primer_trimmed --> FluViewer{FluViewer} - FluViewer --> HPAI{HPAI script} --> HPAI_result[HPAI Results] - FluViewer --> Segcov[Segment Coverage Plots] - FluViewer --> Consensus[Consensus Sequence] - Consensus --> SNPCaller{SNP Calling: snp-calling.py} --> SNPCalls[AA SNP Outputs] - Consensus --> CladeCaller{Clade Calling: Nextclade} --> CladeCalls[Clade Call Outputs] - FluViewer --> VCF[Variant Call Outputs] - Consensus --> GenoFLU[GenoFLU Genotype Calling] + 
fastq_input[FASTQ Input] --> fastp(fastp) + fastp -- trimmed_reads --> cutadapt(cutadapt) + cutadapt -- qc_stats --> multiqc(multiqc) + cutadapt -- primer_trimmed_reads --> fastqc(fastqc) + fastqc -- qc_stats --> multiqc + multiqc --> multiqc_report[MultiQC Report] + cutadapt -- primer_trimmed_reads --> normalize_reads(normalize_reads) + normalize_reads -- normalized_reads --> fluviewer(fluviewer) + fluviewer_db[FluViewer DB] --> fluviewer + fluviewer -- ha_consensus --> clade_calling(clade_calling) + clade_calling --> clade_calls[clade-calls] + fluviewer -- consensus_main --> snp_calling(snp_calling) + snp_calling --> snp_calls[snp-calls] + fluviewer -- consensus_main --> genoflu(genoflu) + fluviewer --> segment_coverage_plots[Segment Coverage Plots] + fluviewer --> consensus_sequence[Consensus Sequence] + fluviewer --> variants[Variants VCF] + genoflu --> genoflu_tsv[GenoFLU tsv] ``` @@ -54,8 +58,8 @@ Short read Illumina sequences, files ending in '.fastq.gz', '.fq.gz', '.fastq', For a full list of optional arguments, see: https://github.com/BCCDC-PHL/FluViewer | Argument | Description | Default Value | -|----------------------------|--------------------------------------------------------------------------------------------------|----------------:| -| `--target_depth` | Depth to normalize coverage to, where sufficient depth is available in inputs. | 200 | +|-----------------------|--------------------------------------------------------------------------------------------------|----------------:| +| `--target_depth` | Depth to normalize coverage to, where sufficient depth is available in inputs. | 200 | | `--min_depth` | Minimum read depth for base calling. | 20 | | `--min_q` | Minimum PHRED score for base quality and mapping quality. | 20 | | `--min_cov` | Minimum coverage of database reference sequence by contig, percentage. 
| 25 | @@ -64,7 +68,7 @@ For a full list of optional arguments, see: https://github.com/BCCDC-PHL/FluView **Example command:** ``` nextflow run BCCDC-PHL/fluviewer-nf \ - -r v0.2.2 \ + -r v0.3.0 \ -profile conda \ --cache ~/.conda/envs \ --fastq_input /path/to/your_fastqs \ @@ -77,23 +81,83 @@ nextflow run BCCDC-PHL/fluviewer-nf \ Outputs are written to the directory specified with the `--outdir` parameter. Below that are individual folders for each sample, containing the results of FluViewer, SNP calling, and clade calling. ``` -|-Run number -|-----FluViewer_version_output - |----sample_number - | |-----FluViewer outputs - | |-----SNP Call outputs - | |-----Clade Call outputs - | |-----GenoFLU outputs - |----sample_number - | |-----FluViewer outputs - | |-----SNP Call outputs - | |-----Clade Call outputs - | |-----GenoFLU outputs - | ... - |----Provenance_files - |----sample_number_multiqc_report.html - |----sample_number_report.html - |----sample_number_timeline.html + +├── _fluviewer-nf_multiqc_report.html +├── _fluviewer-nf_nextflow_report.html +├── _fluviewer-nf_nextflow_timeline.html +└── + ├── .fastp.html + ├── .fastp.json + ├── __provenance.yml + ├── _HA_consensus.fa + ├── _HPAI.tsv + ├── _NA_consensus.fa + ├── _alignment.bam + ├── _alignment.bam.bai + ├── _consensus_seqs.fa + ├── _contigs_blast.tsv + ├── _depth_of_cov.png + ├── _fluviewer_provenance.yml + ├── _genoflu.tsv + ├── _mapping_refs.fa + ├── _report.tsv + ├── _variants.vcf + ├── clade-calls + │ ├── _nextclade.aligned.fasta.gz + │ ├── _nextclade.csv + │ ├── _nextclade.json + │ ├── _nextclade.ndjson + │ ├── _nextclade.tsv + │ ├── _nextclade_HA1.translation.fasta.gz + │ ├── _nextclade_HA2.translation.fasta.gz + │ └── _nextclade_SigPep.translation.fasta.gz + ├── fluviewer_logs + │ ├── 01_assemble_contigs + │ │ ├── spades_stderr.txt + │ │ └── spades_stdout.txt + │ ├── 02_blast_contigs + │ │ ├── blastn_contigs_stderr.txt + │ │ └── blastn_contigs_stdout.txt + │ ├── 03_scaffolding + │ │ ├── 
blastn_scaffolds_stderr.txt + │ │ └── blastn_scaffolds_stdout.txt + │ ├── 04_read_mapping + │ │ ├── bwa_index_stderr.txt + │ │ ├── bwa_index_stdout.txt + │ │ ├── bwa_mem_stderr.txt + │ │ ├── bwa_mem_stdout.txt + │ │ ├── samtools_index_stderr.txt + │ │ ├── samtools_index_stdout.txt + │ │ ├── samtools_view_stderr.txt + │ │ └── samtools_view_stdout.txt + │ ├── 05_variant_calling + │ │ ├── freebayes_stderr.txt + │ │ └── freebayes_stdout.txt + │ ├── 06_consensus_calling + │ │ ├── bcftools_consensus_stderr.txt + │ │ ├── bcftools_consensus_stdout.txt + │ │ ├── bcftools_index_stderr.txt + │ │ ├── bcftools_index_stdout.txt + │ │ ├── bcftools_view_stderr.txt + │ │ └── bcftools_view_stdout.txt + │ ├── 07_summary_reporting + │ │ ├── samtools_depth_stderr.txt + │ │ ├── samtools_depth_stdout.txt + │ │ ├── samtools_idxstats_stderr.txt + │ │ └── samtools_idxstats_stdout.txt + │ ├── fluviewer.log + │ ├── makeblastdb_contigs_stderr.txt + │ └── makeblastdb_contigs_stdout.txt + └── snp-calls + ├── _HA_mutations.tsv + ├── _M_mutations.tsv + ├── _NA_mutations.tsv + ├── _NP_mutations.tsv + ├── _NS_mutations.tsv + ├── _PA_mutations.tsv + ├── _PB1_mutations.tsv + ├── _PB2_mutations.tsv + └── pairwise ``` Output for each sample includes: @@ -121,35 +185,50 @@ Output for each run includes: For each pipeline invocation, each sample will produce a `provenance.yml` file with the following contents. Note the below is a contrived example. 
```yml -- pipeline_name: BCCDC-PHL/FluViewer-nf - pipeline_version: 0.2.2 - timestamp_analysis_start: 2023-11-21T05:43:25.541743 -- input_filename: {Sample}_R1.fastq.gz - input_path: /home/{USER{}}/Flu/test_data/test_production_run/{Sample}_R1.fastq.gz - sha256: 47380e49f10374660a2061d3571efe5339401484e646c2b47896fa701dbcf0a8 -- input_filename: {Sample}_R2.fastq.gz - input_path: /home/{USER}/Flu/test_data/test_production_run/{Sample}.fastq.gz - sha256: 39c95fd26af111ee9a6caeb840a7aced444b657550efea3ab7f74add0b30f69d +- pipeline_name: BCCDC-PHL/fluviewer-nf + pipeline_version: 0.3.0 + nextflow_session_id: 59fdd919-fc28-4af5-99e0-60355b11807c + nextflow_run_name: hopeful_bhaskara + timestamp_analysis_start: 2024-07-15T16:30:11.150887-07:00 +- input_filename: sample-01_R1.fastq.gz + file_type: fastq_input + sha256: 2c1d5b310a5ca11cc2b1665c094a064fc3aa597e06f392985dac639bd2ab4d81 +- input_filename: sample-01_R2.fastq.gz + file_type: fastq_input + sha256: 73745eed4badc3594cdd8554e90c75ae4b9b4599ca255064415ded433e598749 - process_name: fastp tools: - tool_name: fastp - tool_version: 0.23.1 + tool_version: 0.23.2 - process_name: cutadapt tools: - tool_name: cutadapt - tool_version: 4.1 + tool_version: 4.4 +- process_name: normalize_depth + tools: + - tool_name: bbnorm + tool_version: 39.01 - process_name: fluviewer tools: - - tool_name: FluViewer - tool_version: FluViewer v0.0.2 + - tool_name: fluviewer + tool_version: 0.1.11-6 + databases: + - database_name: FluViewer_db.fa + database_path: /path/to/FluViewer_db.fa + database_sha256: c9ba1af0a637671d86a14aceac3cbfde309ae9a5bd613d75c87ba2ff390b4c48 +- process_name: nextclade + tools: + - tool_name: nextclade + tool_version: 3.8.1 + subcommand: run +- process_name: snp_calling + tools: + - tool_name: blastx + tool_version: 2.15.0+ databases: - - database_name: FluViewer_db_full_20220915.fasta - database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta - database_sha256: 
55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d - -- process_name: nextclade - tool_name: nextclade - tool_version: 2.9.1 - Dataset location: /scratch/{USER}/flu/nextclade_datasets/nextclade_flu_h5nx_ha/ - Dataset version: "tag": "2023-04-02T12:00:00Z" + - database_name: blastx_subtype_db.fasta +- process_name: genoflu + tools: + - tool_name: genoflu + tool_version: 1.03 ``` diff --git a/modules/fluviewer.nf b/modules/fluviewer.nf index e442326..d0c705f 100644 --- a/modules/fluviewer.nf +++ b/modules/fluviewer.nf @@ -14,7 +14,7 @@ process normalize_depth { printf -- "- process_name: normalize_depth\\n" >> ${sample_id}_normalize_depth_provenance.yml printf -- " tools:\\n" >> ${sample_id}_normalize_depth_provenance.yml printf -- " - tool_name: bbnorm\\n" >> ${sample_id}_normalize_depth_provenance.yml - printf -- " tool_version: \$(bbnorm.sh --version)\\n" >> ${sample_id}_normalize_depth_provenance.yml + printf -- " tool_version: \$(bbnorm.sh --version 2>&1 | head -n 2 | tail -n 1 | cut -d ' ' -f 3)\\n" >> ${sample_id}_normalize_depth_provenance.yml bbnorm.sh \ -Xmx${max_memory_gb}g \