BCCDC-PHL · dfornika · Jul 16, 2024 · Jul 16, 2024 · Jul 16, 2024 · Jul 16, 2024
diff --git a/README.md b/README.md
@@ -7,32 +7,36 @@ to obtain consensus sequences, HA and NA subtypes, clade calls, and amino acid m
 
 ## Analyses
 
-- Read trimming & QC: `fastp`
-- Primer removal with `cutadapt`
-- FASTQ quality reporting with `FastQC`
-- Aggregate the reports with `multiqc`
-- Sequence analysis with `FluViewer` 
+- Read trimming & QC: [fastp](https://github.com/OpenGene/fastp)
+- Primer removal with [cutadapt](https://github.com/marcelm/cutadapt)
+- FASTQ quality reporting with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- Aggregate the reports with [MultiQC](https://multiqc.info/)
+- Sequence analysis with [BCCDC-PHL/FluViewer](https://github.com/BCCDC-PHL/FluViewer)
 - Extract HPAI motif (applies to H5 sequences only)
-- Clade calls for H1, H3 and H5 influenza using Nextclade (H1 and H3, custom Nextclade for H5 used at BCCDC-PHL)
+- Clade calls for H1, H3 and H5 influenza using [Nextclade](https://github.com/nextstrain/nextclade)
 - Amino acid SNP calls against a specified reference
-- Genotype calls using GenoFLU against curated database  
+- Genotype calls using GenoFLU against curated database
 
 ```mermaid
 flowchart TD
-  fastq(FASTQ Input) --> fastq_trimmed{Read Trimming: fastp}
-  fastq_trimmed --> primer_trimmed{Remove Primers: cutadapt}
-  fastq_trimmed --> MultiQC{MultiQC}
-  primer_trimmed --> FASTQC{FASTQC}
-  FASTQC --> MultiQC
-  primer_trimmed --> MultiQC --> MultiQC_report[MultiQC Report]
-  primer_trimmed --> FluViewer{FluViewer} 
-  FluViewer  --> HPAI{HPAI script} --> HPAI_result[HPAI Results]
-  FluViewer -->  Segcov[Segment Coverage Plots]
-  FluViewer --> Consensus[Consensus Sequence]
-  Consensus -->  SNPCaller{SNP Calling: snp-calling.py} --> SNPCalls[AA SNP Outputs]
-  Consensus -->  CladeCaller{Clade Calling: Nextclade} --> CladeCalls[Clade Call Outputs]
-  FluViewer --> VCF[Variant Call Outputs]
-  Consensus --> GenoFLU[GenoFLU Genotype Calling]
+  fastq_input[FASTQ Input] --> fastp(fastp)
+  fastp -- trimmed_reads --> cutadapt(cutadapt)
+  cutadapt -- qc_stats --> multiqc(multiqc)
+  cutadapt -- primer_trimmed_reads --> fastqc(fastqc)
+  fastqc -- qc_stats --> multiqc
+  multiqc --> multiqc_report[MultiQC Report]
+  cutadapt -- primer_trimmed_reads --> normalize_depth(normalize_depth)
+  normalize_depth -- normalized_reads --> fluviewer(fluviewer)
+  fluviewer_db[FluViewer DB] --> fluviewer
+  fluviewer  -- ha_consensus --> clade_calling(clade_calling)
+  clade_calling --> clade_calls[clade-calls]
+  fluviewer -- consensus_main --> snp_calling(snp_calling)
+  snp_calling --> snp_calls[snp-calls]
+  fluviewer -- consensus_main --> genoflu(genoflu)
+  fluviewer -->  segment_coverage_plots[Segment Coverage Plots]
+  fluviewer --> consensus_sequence[Consensus Sequence]
+  fluviewer --> variants[Variants VCF]
+  genoflu --> genoflu_tsv[GenoFLU tsv]
 ```
 
 
@@ -54,8 +58,8 @@ Short read Illumina sequences, files ending in '.fastq.gz', '.fq.gz', '.fastq',
 For a full list of optional arguments, see: https://github.com/BCCDC-PHL/FluViewer
 
 | Argument              | Description                                                                                      | Default Value   |
-|----------------------------|--------------------------------------------------------------------------------------------------|----------------:|
-| `--target_depth`       | Depth to normalize coverage to, where sufficient depth is available in inputs.                   |             200 |
+|-----------------------|--------------------------------------------------------------------------------------------------|----------------:|
+| `--target_depth`      | Depth to normalize coverage to, where sufficient depth is available in inputs.                   |             200 |
 | `--min_depth`         | Minimum read depth for base calling.                                                             |              20 |
 | `--min_q`             | Minimum PHRED score for base quality and mapping quality.                                        |              20 |
 | `--min_cov`           | Minimum coverage of database reference sequence by contig, percentage.                           |              25 |
@@ -64,7 +68,7 @@ For a full list of optional arguments, see: https://github.com/BCCDC-PHL/FluView
 **Example command:**
 ```
 nextflow run BCCDC-PHL/fluviewer-nf \
-  -r v0.2.2 \
+  -r v0.3.0 \
   -profile conda \
   --cache ~/.conda/envs \
   --fastq_input /path/to/your_fastqs \
@@ -77,23 +81,83 @@ nextflow run BCCDC-PHL/fluviewer-nf \
 Outputs are written to the directory specified with the `--outdir` parameter. Below that are individual folders for each sample, containing the results of FluViewer, SNP calling, and clade calling.
 
 ```
-|-Run number
-|-----FluViewer_version_output
-          |----sample_number
-          |       |-----FluViewer outputs
-          |       |-----SNP Call outputs
-          |       |-----Clade Call outputs
-          |       |-----GenoFLU outputs
-          |----sample_number
-          |       |-----FluViewer outputs
-          |       |-----SNP Call outputs
-          |       |-----Clade Call outputs
-          |       |-----GenoFLU outputs
-          | ...
-          |----Provenance_files
-          |----sample_number_multiqc_report.html
-          |----sample_number_report.html
-          |----sample_number_timeline.html
+<outdir>
+├── <run_name>_fluviewer-nf_multiqc_report.html
+├── <run_name>_fluviewer-nf_nextflow_report.html
+├── <run_name>_fluviewer-nf_nextflow_timeline.html
+└── <sample-id>
+    ├── <sample-id>.fastp.html
+    ├── <sample-id>.fastp.json
+    ├── <sample-id>_<date-time>_provenance.yml
+    ├── <sample-id>_HA_consensus.fa
+    ├── <sample-id>_HPAI.tsv
+    ├── <sample-id>_NA_consensus.fa
+    ├── <sample-id>_alignment.bam
+    ├── <sample-id>_alignment.bam.bai
+    ├── <sample-id>_consensus_seqs.fa
+    ├── <sample-id>_contigs_blast.tsv
+    ├── <sample-id>_depth_of_cov.png
+    ├── <sample-id>_fluviewer_provenance.yml
+    ├── <sample-id>_genoflu.tsv
+    ├── <sample-id>_mapping_refs.fa
+    ├── <sample-id>_report.tsv
+    ├── <sample-id>_variants.vcf
+    ├── clade-calls
+    │   ├── <sample-id>_nextclade.aligned.fasta.gz
+    │   ├── <sample-id>_nextclade.csv
+    │   ├── <sample-id>_nextclade.json
+    │   ├── <sample-id>_nextclade.ndjson
+    │   ├── <sample-id>_nextclade.tsv
+    │   ├── <sample-id>_nextclade_HA1.translation.fasta.gz
+    │   ├── <sample-id>_nextclade_HA2.translation.fasta.gz
+    │   └── <sample-id>_nextclade_SigPep.translation.fasta.gz
+    ├── fluviewer_logs
+    │   ├── 01_assemble_contigs
+    │   │   ├── spades_stderr.txt
+    │   │   └── spades_stdout.txt
+    │   ├── 02_blast_contigs
+    │   │   ├── blastn_contigs_stderr.txt
+    │   │   └── blastn_contigs_stdout.txt
+    │   ├── 03_scaffolding
+    │   │   ├── blastn_scaffolds_stderr.txt
+    │   │   └── blastn_scaffolds_stdout.txt
+    │   ├── 04_read_mapping
+    │   │   ├── bwa_index_stderr.txt
+    │   │   ├── bwa_index_stdout.txt
+    │   │   ├── bwa_mem_stderr.txt
+    │   │   ├── bwa_mem_stdout.txt
+    │   │   ├── samtools_index_stderr.txt
+    │   │   ├── samtools_index_stdout.txt
+    │   │   ├── samtools_view_stderr.txt
+    │   │   └── samtools_view_stdout.txt
+    │   ├── 05_variant_calling
+    │   │   ├── freebayes_stderr.txt
+    │   │   └── freebayes_stdout.txt
+    │   ├── 06_consensus_calling
+    │   │   ├── bcftools_consensus_stderr.txt
+    │   │   ├── bcftools_consensus_stdout.txt
+    │   │   ├── bcftools_index_stderr.txt
+    │   │   ├── bcftools_index_stdout.txt
+    │   │   ├── bcftools_view_stderr.txt
+    │   │   └── bcftools_view_stdout.txt
+    │   ├── 07_summary_reporting
+    │   │   ├── samtools_depth_stderr.txt
+    │   │   ├── samtools_depth_stdout.txt
+    │   │   ├── samtools_idxstats_stderr.txt
+    │   │   └── samtools_idxstats_stdout.txt
+    │   ├── fluviewer.log
+    │   ├── makeblastdb_contigs_stderr.txt
+    │   └── makeblastdb_contigs_stdout.txt
+    └── snp-calls
+        ├── <sample-id>_HA_mutations.tsv
+        ├── <sample-id>_M_mutations.tsv
+        ├── <sample-id>_NA_mutations.tsv
+        ├── <sample-id>_NP_mutations.tsv
+        ├── <sample-id>_NS_mutations.tsv
+        ├── <sample-id>_PA_mutations.tsv
+        ├── <sample-id>_PB1_mutations.tsv
+        ├── <sample-id>_PB2_mutations.tsv
+        └── pairwise
 ```
 
 Output for each sample includes:
@@ -121,35 +185,50 @@ Output for each run includes:
 For each pipeline invocation, each sample will produce a `provenance.yml` file with the following contents.  Note the below is a contrived example.  
 
 ```yml
-- pipeline_name: BCCDC-PHL/FluViewer-nf
-  pipeline_version: 0.2.2
-  timestamp_analysis_start: 2023-11-21T05:43:25.541743
-- input_filename: {Sample}_R1.fastq.gz
-  input_path: /home/{USER{}}/Flu/test_data/test_production_run/{Sample}_R1.fastq.gz
-  sha256: 47380e49f10374660a2061d3571efe5339401484e646c2b47896fa701dbcf0a8
-- input_filename: {Sample}_R2.fastq.gz
-  input_path: /home/{USER}/Flu/test_data/test_production_run/{Sample}.fastq.gz
-  sha256: 39c95fd26af111ee9a6caeb840a7aced444b657550efea3ab7f74add0b30f69d
+- pipeline_name: BCCDC-PHL/fluviewer-nf
+  pipeline_version: 0.3.0
+  nextflow_session_id: 59fdd919-fc28-4af5-99e0-60355b11807c
+  nextflow_run_name: hopeful_bhaskara
+  timestamp_analysis_start: 2024-07-15T16:30:11.150887-07:00
+- input_filename: sample-01_R1.fastq.gz
+  file_type: fastq_input
+  sha256: 2c1d5b310a5ca11cc2b1665c094a064fc3aa597e06f392985dac639bd2ab4d81
+- input_filename: sample-01_R2.fastq.gz
+  file_type: fastq_input
+  sha256: 73745eed4badc3594cdd8554e90c75ae4b9b4599ca255064415ded433e598749
 - process_name: fastp
   tools:
     - tool_name: fastp
-      tool_version: 0.23.1
+      tool_version: 0.23.2
 - process_name: cutadapt
   tools:
     - tool_name: cutadapt
-      tool_version: 4.1
+      tool_version: 4.4
+- process_name: normalize_depth
+  tools:
+    - tool_name: bbnorm
+      tool_version: 39.01
 - process_name: fluviewer
   tools:
-    - tool_name: FluViewer
-      tool_version: FluViewer v0.0.2
+    - tool_name: fluviewer
+      tool_version: 0.1.11-6
+  databases:
+    - database_name: FluViewer_db.fa
+      database_path: /path/to/FluViewer_db.fa
+      database_sha256: c9ba1af0a637671d86a14aceac3cbfde309ae9a5bd613d75c87ba2ff390b4c48
+- process_name: nextclade
+  tools:
+    - tool_name: nextclade
+      tool_version: 3.8.1
+      subcommand: run
+- process_name: snp_calling
+  tools:
+    - tool_name: blastx
+      tool_version: 2.15.0+
   databases:
-    - database_name: FluViewer_db_full_20220915.fasta
-      database_path: /home/{USER}/Flu/ref/FluViewer_db_full_20220915.fasta
-      database_sha256: 55b33afa21ad44ed1e6db896cf420fae6b1524c0ad205775a1ce9dd11595905d
-
-- process_name: nextclade
-  tool_name: nextclade
-  tool_version: 2.9.1
-  Dataset location: /scratch/{USER}/flu/nextclade_datasets/nextclade_flu_h5nx_ha/
-  Dataset version:   "tag": "2023-04-02T12:00:00Z"
+    - database_name: blastx_subtype_db.fasta
+- process_name: genoflu
+  tools:
+    - tool_name: genoflu
+      tool_version: 1.03
 ```
diff --git a/modules/fluviewer.nf b/modules/fluviewer.nf
@@ -14,7 +14,7 @@ process normalize_depth {
     printf -- "- process_name: normalize_depth\\n"                >> ${sample_id}_normalize_depth_provenance.yml
     printf -- "  tools:\\n"                                       >> ${sample_id}_normalize_depth_provenance.yml
     printf -- "    - tool_name: bbnorm\\n"                        >> ${sample_id}_normalize_depth_provenance.yml
-    printf -- "      tool_version: \$(bbnorm.sh --version)\\n"    >> ${sample_id}_normalize_depth_provenance.yml
+    printf -- "      tool_version: \$(bbnorm.sh --version 2>&1 | head -n 2 | tail -n 1 | cut -d ' ' -f 3)\\n" >> ${sample_id}_normalize_depth_provenance.yml
 
     bbnorm.sh \
 	-Xmx${max_memory_gb}g \