From f478d472163e3daf65f0970c4a7739143ba62050 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Fri, 18 Dec 2020 13:46:56 -0800
Subject: [PATCH] Add Reference Alignment + UTR Trimming, Dehosting and iVar update (#16)

* Index reference before samtools and ivar access the reference in callVariants step

* Try indexing the reference before callVariants

* Qc update (#56)

* Update qc.py to reflect changed standards

* Filter out Illumina Undetermined_S0

* Make sure to negate the matcher

* Add minReadsPerBarcode parameter for Nanopore workflow (#57)

The barcode directories of de-plexed Fastq input are filtered to exclude any
containing fewer than 5 files (guppy makes all of the barcode directories,
whether you used the barcodes or not). This patch changes the filter to scan
the files and count the records within to 1) avoid excluding barcodes where
many reads are in fewer than 5 files and 2) avoid including barcodes where
there are very few reads in more than 5 files.

This threshold can be changed with the new optional parameter
minReadsPerBarcode, which defaults to 100 reads.

Co-authored-by: Keith James

* Remove unnecessary samtools view -bS (#48)

Co-authored-by: ac55-sanger

* Added libxcb, default Qt library backend for graphics rendering, to dependencies (#60)

* use ivar default of 30 for illuminaKeepLen, and comment (#61)

Co-authored-by: David K. Jackson

* add basic host filter step

* remove hardcoded paths

* move host filter to within workflow

* synchronize host removal with SIGNAL

* limit host removal cpus

* Move performHostFilter to illuminaNcov workflow

* Add pysam dependency for filter_non_human_reads.py

* update ivar to 1.2.3

* increase minimum read length after trimming

* add step for filtering residual adapters missed by trimming

* fix samtools/pysam version in environment

* lower required match length for adapter check

* restrict short adapter matches to the suffix of the read

* update to ivar 1.3, add option for amplicon filtering

* remove CLIMB-specific modules/calls

* Align to ref (#15)

* Add alignment step using MAFFT and UTR-trimming step

* Remove extra = char

* Tidy up illumina environment

* Remove local executor definition

* Remove addition of whitespace

* Remove deletion of whitespace

* Remove addition of whitespace

* Publish dehosted fastqs

* Fix process name for alignment step in illumina workflow

* Set default viral_contig_name

* Add sampleId tag to performHostFilter process

* Enable testing against development branch

* Remove testing scripts not in use

* index bam files and publish indexes (#23)

* index bam files and publish indexes

* Move hostFilter to sequenceAnalysis, publish dehosted reads and consensus to nml_upload (#25)

Co-authored-by: Michael Laszloffy
Co-authored-by: Michael Laszloffy
Co-authored-by: Matt Bull
Co-authored-by: Keith James <47220353+kjsanger@users.noreply.github.com>
Co-authored-by: Keith James
Co-authored-by: Ashwini Chhipa <58439277+ac55-sanger@users.noreply.github.com>
Co-authored-by: ac55-sanger
Co-authored-by: nodrogluap
Co-authored-by: dkj
Co-authored-by: David K. Jackson
Co-authored-by: Jared Simpson
Co-authored-by: Lawrence Heisler
---
 .github/workflows/pull_request.yml    |   1 +
 bin/filter_non_human_reads.py         |  67 ++++++++++++++++
 bin/filter_residual_adapters.py       |  85 ++++++++++++++++++++
 conf/base.config                      |  21 ++---
 conf/illumina.config                  |   2 +-
 environments/extras.yml               |   2 +-
 environments/illumina/environment.yml |   6 +-
 modules/illumina.nf                   | 108 +++++++++++++++++++++++---
 modules/qc.nf                         |   2 +-
 modules/upload.nf                     |  24 +-----
 modules/utils.nf                      |  20 +++++
 workflows/articNcovNanopore.nf        |  14 ----
 workflows/illuminaNcov.nf             |  37 ++++-----
 workflows/upload.nf                   |  14 ----
 14 files changed, 307 insertions(+), 96 deletions(-)
 create mode 100755 bin/filter_non_human_reads.py
 create mode 100755 bin/filter_residual_adapters.py
 create mode 100644 modules/utils.nf
 delete mode 100644 workflows/upload.nf

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 09d2f1e4..d76b441d 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -2,6 +2,7 @@ on:
   pull_request:
     branches:
       - master
+      - development
 name: master Pull Request
 jobs:
   test:
diff --git a/bin/filter_non_human_reads.py b/bin/filter_non_human_reads.py
new file mode 100755
index 00000000..7eb1c16f
--- /dev/null
+++ b/bin/filter_non_human_reads.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+import pysam
+import sys
+import argparse
+
+def filter_reads(contig_name, input_sam_fp, output_bam_fp):
+
+    # use streams if args are None
+    if input_sam_fp:
+        input_sam = pysam.AlignmentFile(input_sam_fp, 'r')
+    else:
+        input_sam = pysam.AlignmentFile('-', 'r')
+
+    if output_bam_fp:
+        output_bam = pysam.AlignmentFile(output_bam_fp, 'wb',
+                                         template=input_sam)
+    else:
+        output_bam = pysam.AlignmentFile('-', 'wb', template=input_sam)
+
+    # keep a read if it isn't mapped or is mapped to the viral reference contig name
+    viral_reads = 0
+    human_reads = 0
+    unmapped_reads = 0
+
+    # iterate over input from BWA
+    for read in input_sam:
+        # only look at primary alignments
+        if not read.is_supplementary and not read.is_secondary:
+            if read.reference_name == contig_name:
+                output_bam.write(read)
+                viral_reads += 1
+            elif read.is_unmapped:
+                output_bam.write(read)
+                unmapped_reads += 1
+            else:
+                human_reads += 1
+
+    total_reads = viral_reads + human_reads + unmapped_reads
+
+    print(f"viral read count = {viral_reads} "
+          f"({viral_reads/total_reads * 100:.2f}%)", file=sys.stderr)
+    print(f"human read count = {human_reads} "
+          f"({human_reads/total_reads * 100:.2f}%)", file=sys.stderr)
+    print(f"unmapped read count = {unmapped_reads} "
+          f"({unmapped_reads/total_reads * 100:.2f}%)", file=sys.stderr)
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Get all reads that are "
+                                                 "unmapped or map to a "
+                                                 "specific reference "
+                                                 "contig")
+    parser.add_argument('-i', '--input', required=False, default=False,
+                        help="Input SAM formatted file (stdin used if "
+                             "not specified)")
+
+    parser.add_argument('-o', '--output', required=False, default=False,
+                        help="Output BAM formatted file (stdout used if not "
+                             "specified)")
+
+    parser.add_argument('-c', '--contig_name', required=False,
+                        default="MN908947.3",
+                        help="Contig name to retain e.g. viral")
+
+    args = parser.parse_args()
+
+    filter_reads(args.contig_name, args.input, args.output)
diff --git a/bin/filter_residual_adapters.py b/bin/filter_residual_adapters.py
new file mode 100755
index 00000000..9d124292
--- /dev/null
+++ b/bin/filter_residual_adapters.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+import gzip
+import pysam
+import sys
+import argparse
+
+# require that filename ends with the specified extension
+def require_extension(filename, ext):
+    assert(filename[-len(ext):] == ext)
+
+def contains_adapter(read_sequence, adapter_sequence, min_match_length):
+    # discard all reads that contain the full adapter
+    if adapter_sequence in read_sequence:
+        return True
+
+    # discard all reads where the suffix of length min_match_length is contained within the adapter
+    # this handles the case where there is a fairly good adapter match that is truncated
+    # by the end of the read
+    rl = len(read_sequence)
+    if rl >= min_match_length:
+        read_suffix = read_sequence[(rl - min_match_length):]
+        if read_suffix in adapter_sequence:
+            return True
+    return False
+
+def filter_reads(filter_sequences, min_match_length, input_R1_fp, input_R2_fp, output_R1_fp, output_R2_fp):
+
+    input_R1 = pysam.FastxFile(input_R1_fp)
+    input_R2 = pysam.FastxFile(input_R2_fp)
+
+    output_R1 = gzip.open(output_R1_fp, 'w')
+    output_R2 = gzip.open(output_R2_fp, 'w')
+
+    reads_filtered = 0
+    reads_kept = 0
+    for R1, R2 in zip(input_R1, input_R2):
+
+        discard = False
+        for f in filter_sequences:
+            if contains_adapter(R1.sequence, f, min_match_length) or contains_adapter(R2.sequence, f, min_match_length):
+                discard = True
+                break
+
+        if discard:
+            reads_filtered += 1
+            continue
+        else:
+            reads_kept += 1
+
+        fq1 = str(R1) + "\n"
+        fq2 = str(R2) + "\n"
+        output_R1.write(fq1.encode('utf-8'))
+        output_R2.write(fq2.encode('utf-8'))
+
+    print(f"reads kept: {reads_kept}, reads filtered: {reads_filtered}")
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Discard read pairs that contain sequencing adapters that are missed by a trimmer")
+    parser.add_argument('--input_R1', required=True, default=False,
+                        help="Input fasta/fastq file for first half of pair")
+
+    parser.add_argument('--input_R2', required=True, default=False,
+                        help="Input fasta/fastq file for second half of pair")
+
+    args = parser.parse_args()
+
+    # this is designed to only work in the nextflow pipeline so we put strict requirements on the input/output names
+    in_ext = ".fq.gz"
+    require_extension(args.input_R1, in_ext)
+    require_extension(args.input_R2, in_ext)
+
+    out_ext = "_posttrim_filter.fq.gz"
+    output_R1 = args.input_R1.replace(in_ext, out_ext)
+    output_R2 = args.input_R2.replace(in_ext, out_ext)
+
+    # Illumina adapters that are occasionally leftover in reads
+    S7 = "CCGAGCCCACGAGAC"
+    P7 = "ATCTCGTATGCCGTCTTCTGCTTG"
+
+    # require a minimum match between read and adapter
+    min_length = 10
+    filter_sequences = [ S7, P7 ]
+
+    filter_reads(filter_sequences, min_length, args.input_R1, args.input_R2, output_R1, output_R2)
diff --git a/conf/base.config b/conf/base.config
index 4ae74c3e..e6fdfda0 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -10,6 +10,10 @@ params{
   ref = false
   bed = false
   profile = false
+
+  // host filtering
+  composite_ref = false
+  viral_contig_name = 'MN908947.3'
 
   // Repo to download your primer scheme from
   schemeRepoURL = 'https://github.com/artic-network/artic-ncov2019.git'
@@ -21,7 +25,10 @@ params{
   scheme = 'nCoV-2019'
 
   // Scheme version
-  schemeVersion = 'V2'
+  schemeVersion = 'V3'
+
+  // For ivar's amplicon filter
+  primer_pairs_tsv = false
 
   // Run experimental medaka pipeline? Specify in the command using "--medaka"
   medaka = false
@@ -31,18 +38,6 @@ params{
 
   // Run nanopolish pipeline
   nanopolish = false
-
-  // Upload data to CLIMB?
-  upload = false
-
-  // CLIMB username
-  CLIMBUser = false
-
-  // CLIMB SSH pubkey
-  CLIMBkey = false
-
-  // CLIMB hostname
-  CLIMBHostname = false
 }
diff --git a/conf/illumina.config b/conf/illumina.config
index 4b39d48a..e5458111 100644
--- a/conf/illumina.config
+++ b/conf/illumina.config
@@ -28,7 +28,7 @@ params {
   allowNoprimer = true
 
   // Length of illumina reads to keep after primer trimming
-  illuminaKeepLen = 20
+  illuminaKeepLen = 50
 
   // Sliding window quality threshold for keeping reads after primer trimming (illumina)
   illuminaQualThreshold = 20
diff --git a/environments/extras.yml b/environments/extras.yml
index ba81b210..ced40cad 100644
--- a/environments/extras.yml
+++ b/environments/extras.yml
@@ -6,5 +6,5 @@ dependencies:
   - biopython
   - matplotlib
   - pandas
-  - samtools=1.10-0
+  - samtools
   - libxcb
diff --git a/environments/illumina/environment.yml b/environments/illumina/environment.yml
index 45eeae2c..311a53cf 100644
--- a/environments/illumina/environment.yml
+++ b/environments/illumina/environment.yml
@@ -8,6 +8,8 @@ dependencies:
   - biopython=1.74
   - pandas=0.23.0=py36_1
   - bwa=0.7.17=pl5.22.0_2
-  - samtools=1.9
+  - samtools=1.10
  - trim-galore=0.6.5
-  - ivar=1.2.2
+  - ivar=1.3
+  - pysam=0.16.0.1
+  - mafft=7.471
diff --git a/modules/illumina.nf b/modules/illumina.nf
index be08c1d3..131bef2f 100644
--- a/modules/illumina.nf
+++ b/modules/illumina.nf
@@ -27,6 +27,35 @@ process readTrimming {
     """
 }
 
+process filterResidualAdapters {
+    /**
+    * Discard reads that contain residual adapter sequences that indicate trimming may have failed
+    * @input tuple(sampleName, path(forward), path(reverse))
+    * @output untrim_filter_out tuple(sampleName, path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz"))
+    */
+
+    tag { sampleName }
+
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: '*{1,2}_posttrim_filter.fq.gz', mode: 'copy'
+
+    cpus 2
+
+    input:
+    tuple(sampleName, path(forward), path(reverse))
+
+    output:
+    tuple(sampleName, path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz")) optional true
+
+    script:
+    """
+    if [[ \$(gunzip -c ${forward} | head -n4 | wc -l) -eq 0 ]]; then
+      exit 0
+    else
+      filter_residual_adapters.py --input_R1 $forward --input_R2 $reverse
+    fi
+    """
+}
+
 process indexReference {
     /**
     * Indexes reference fasta file in the scheme repo using bwa.
@@ -59,18 +88,19 @@ process readMapping {
 
     label 'largecpu'
 
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.sorted.bam", mode: 'copy'
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.sorted{.bam,.bam.bai}", mode: 'copy'
 
     input:
     tuple sampleName, path(forward), path(reverse), path(ref), path("*")
 
     output:
-    tuple(sampleName, path("${sampleName}.sorted.bam"))
+    tuple(sampleName, path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai"))
 
     script:
     """
     bwa mem -t ${task.cpus} ${ref} ${forward} ${reverse} | \
     samtools sort -o ${sampleName}.sorted.bam
+    samtools index ${sampleName}.sorted.bam
     """
 }
 
@@ -78,15 +108,15 @@ process trimPrimerSequences {
 
     tag { sampleName }
 
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.mapped.bam", mode: 'copy'
-    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.mapped.primertrimmed.sorted.bam", mode: 'copy'
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.mapped{.bam,.bam.bai}", mode: 'copy'
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.mapped.primertrimmed.sorted{.bam,.bam.bai}", mode: 'copy'
 
     input:
-    tuple sampleName, path(bam), path(bedfile)
+    tuple sampleName, path(bam), path(bam_index), path(bedfile)
 
     output:
-    tuple sampleName, path("${sampleName}.mapped.bam"), emit: mapped
-    tuple sampleName, path("${sampleName}.mapped.primertrimmed.sorted.bam" ), emit: ptrim
+    tuple sampleName, path("${sampleName}.mapped.bam"), path("${sampleName}.mapped.bam.bai"), emit: mapped
+    tuple sampleName, path("${sampleName}.mapped.primertrimmed.sorted.bam"), path("${sampleName}.mapped.primertrimmed.sorted.bam.bai"), emit: ptrim
 
     script:
     if (params.allowNoprimer){
@@ -97,8 +127,9 @@ process trimPrimerSequences {
     """
     samtools view -F4 -o ${sampleName}.mapped.bam ${bam}
     samtools index ${sampleName}.mapped.bam
-    ${ivarCmd} -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.illuminaKeepLen} -q ${params.illuminaQualThreshold} -p ivar.out
+    ${ivarCmd} -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.illuminaKeepLen} -q ${params.illuminaQualThreshold} ${params.primer_pairs_tsv ? "-f ${params.primer_pairs_tsv}" : ""} -p ivar.out
     samtools sort -o ${sampleName}.mapped.primertrimmed.sorted.bam ivar.out.bam
+    samtools index ${sampleName}.mapped.primertrimmed.sorted.bam
     """
 }
 
@@ -109,13 +140,14 @@ process callVariants {
 
     publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.variants.tsv", mode: 'copy'
 
     input:
-    tuple(sampleName, path(bam), path(ref))
+    tuple(sampleName, path(bam), path(bam_index), path(ref))
 
     output:
     tuple sampleName, path("${sampleName}.variants.tsv")
 
     script:
     """
+    samtools faidx ${ref}
     samtools mpileup -A -d 0 --reference ${ref} -B -Q 0 ${bam} |\
     ivar variants -r ${ref} -m ${params.ivarMinDepth} -p ${sampleName}.variants -q ${params.ivarMinVariantQuality} -t ${params.ivarMinFreqThreshold}
     """
 }
 
@@ -128,7 +160,7 @@ process makeConsensus {
 
     publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.primertrimmed.consensus.fa", mode: 'copy'
 
     input:
-    tuple(sampleName, path(bam))
+    tuple(sampleName, path(bam), path(bam_index))
 
     output:
     tuple(sampleName, path("${sampleName}.primertrimmed.consensus.fa"))
 
@@ -163,3 +195,59 @@ process cramToFastq {
     """
 }
 
+process alignConsensusToReference {
+    /**
+    * Aligns consensus sequence against reference using mafft. Uses the --keeplength
+    * flag to guarantee that all alignments remain the same length as the reference.
+    */
+
+    tag { sampleName }
+
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.primertrimmed.consensus.aln.fa", mode: 'copy'
+
+    input:
+    tuple(sampleName, path(consensus), path(reference))
+
+    output:
+    tuple(sampleName, path("${sampleName}.primertrimmed.consensus.aln.fa"))
+
+    script:
+    // Convert multi-line fasta to single line
+    awk_string = '/^>/ {printf("\\n%s\\n", $0); next; } { printf("%s", $0); } END { printf("\\n"); }'
+    """
+    mafft \
+        --preservecase \
+        --keeplength \
+        --add \
+        ${consensus} \
+        ${reference} \
+        > ${sampleName}.with_ref.multi_line.alignment.fa
+    awk '${awk_string}' ${sampleName}.with_ref.multi_line.alignment.fa > ${sampleName}.with_ref.single_line.alignment.fa
+    tail -n 2 ${sampleName}.with_ref.single_line.alignment.fa > ${sampleName}.primertrimmed.consensus.aln.fa
+    """
+}
+
+process trimUTRFromAlignment {
+    /**
+    * Trim the aligned consensus to remove 5' and 3' UTR sequences.
+    */
+
+    tag { sampleName }
+
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.primertrimmed.consensus.aln.utr_trimmed.fa", mode: 'copy'
+
+    input:
+    tuple(sampleName, path(alignment))
+
+    output:
+    tuple(sampleName, path("${sampleName}.primertrimmed.consensus.aln.utr_trimmed.fa"))
+
+    script:
+    awk_string = '/^>/ { printf("%s\\n", $0); next; } { printf("%s", $0); } END { printf("\\n"); }'
+    """
+    echo -e "\$(head -n 1 ${alignment} | cut -c 2-):266-29674" > non_utr.txt
+    samtools faidx ${alignment}
+    samtools faidx -r non_utr.txt ${alignment} > ${sampleName}.primertrimmed.consensus.aln.utr_trimmed.multi_line.fa
+    awk '${awk_string}' ${sampleName}.primertrimmed.consensus.aln.utr_trimmed.multi_line.fa > ${sampleName}.primertrimmed.consensus.aln.utr_trimmed.fa
+    """
+}
diff --git a/modules/qc.nf b/modules/qc.nf
index a1db6fc8..b854db1c 100644
--- a/modules/qc.nf
+++ b/modules/qc.nf
@@ -6,7 +6,7 @@ process makeQCCSV {
     publishDir "${params.outdir}/qc_plots", pattern: "${sampleName}.depth.png", mode: 'copy'
 
     input:
-    tuple sampleName, path(bam), path(fasta), path(ref)
+    tuple sampleName, path(bam), path(bam_index), path(fasta), path(ref)
 
     output:
     path "${params.prefix}.${sampleName}.qc.csv", emit: csv
diff --git a/modules/upload.nf b/modules/upload.nf
index 6b9dceaf..da5f26cc 100644
--- a/modules/upload.nf
+++ b/modules/upload.nf
@@ -1,18 +1,17 @@
 process collateSamples {
     tag { sampleName }
 
-    publishDir "${params.outdir}/qc_pass_climb_upload/${params.prefix}", pattern: "${sampleName}", mode: 'copy'
+    publishDir "${params.outdir}/nml_upload/", pattern: "${params.prefix}/${sampleName}*", mode: 'copy'
 
     input:
-    tuple(sampleName, path(bam), path(fasta))
+    tuple(sampleName, path(fasta), path(fastq_r1), path(fastq_r2))
 
     output:
-    path("${sampleName}")
+    path("${params.prefix}/${sampleName}*")
 
     script:
     """
-    mkdir ${sampleName}
-    mv ${bam} ${fasta} ${sampleName}
+    mkdir ${params.prefix} && cp ${sampleName}* ${params.prefix}
     """
 }
 
@@ -31,18 +30,3 @@ process prepareUploadDirectory {
     """
 }
-
-process uploadToCLIMB {
-    tag { params.prefix }
-
-    input:
-    tuple(path(sshkey), path(uploadDir))
-
-    output:
-
-    script:
-    """
-    rsync -Lav -e "ssh -i ${sshkey} -l ${params.CLIMBUser}" ${uploadDir} ${params.CLIMBHostname}:upload/
-    """
-}
-
diff --git a/modules/utils.nf b/modules/utils.nf
new file mode 100644
index 00000000..881f0ed8
--- /dev/null
+++ b/modules/utils.nf
@@ -0,0 +1,20 @@
+process performHostFilter {
+    cpus 4
+
+    tag { sampleId }
+
+    publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleId}_hostfiltered_R*.fastq.gz", mode: 'copy'
+
+    input:
+    tuple(val(sampleId), path(forward), path(reverse))
+    output:
+    tuple sampleId, path("${sampleId}_hostfiltered_R1.fastq.gz"), path("${sampleId}_hostfiltered_R2.fastq.gz"), emit: fastqPairs
+
+    script:
+    """
+    bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \
+        filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleId}.viral_and_nonmapping_reads.bam
+    samtools sort -n ${sampleId}.viral_and_nonmapping_reads.bam | \
+        samtools fastq -1 ${sampleId}_hostfiltered_R1.fastq.gz -2 ${sampleId}_hostfiltered_R2.fastq.gz -s ${sampleId}_singletons.fastq.gz -
+    """
+}
diff --git a/workflows/articNcovNanopore.nf b/workflows/articNcovNanopore.nf
index 9c2e8f53..2eebf235 100644
--- a/workflows/articNcovNanopore.nf
+++ b/workflows/articNcovNanopore.nf
@@ -17,11 +17,6 @@ include {bamToCram} from '../modules/out.nf'
 
 include {collateSamples} from '../modules/upload.nf'
 
-
-// import subworkflows
-include {CLIMBrsync} from './upload.nf'
-
-
 // workflow component for artic pipeline
 workflow sequenceAnalysisNanopolish {
     take:
@@ -132,13 +127,4 @@ workflow articNcovNanopore {
     } else if ( params.medaka ) {
         sequenceAnalysisMedaka(ch_fastqDirs)
     }
-
-    if ( params.upload ) {
-
-      Channel.fromPath("${params.CLIMBkey}")
-            .set{ ch_CLIMBkey }
-
-      CLIMBrsync(sequenceAnalysis.out.qc_pass, ch_CLIMBkey )
-    }
 }
-
diff --git a/workflows/illuminaNcov.nf b/workflows/illuminaNcov.nf
index 0eeeb036..26a90ba3 100644
--- a/workflows/illuminaNcov.nf
+++ b/workflows/illuminaNcov.nf
@@ -6,12 +6,16 @@ nextflow.preview.dsl = 2
 // import modules
 include {articDownloadScheme } from '../modules/artic.nf'
 include {readTrimming} from '../modules/illumina.nf'
+include {filterResidualAdapters} from '../modules/illumina.nf'
 include {indexReference} from '../modules/illumina.nf'
 include {readMapping} from '../modules/illumina.nf'
 include {trimPrimerSequences} from '../modules/illumina.nf'
 include {callVariants} from '../modules/illumina.nf'
 include {makeConsensus} from '../modules/illumina.nf'
 include {cramToFastq} from '../modules/illumina.nf'
+include {alignConsensusToReference} from '../modules/illumina.nf'
+include {trimUTRFromAlignment} from '../modules/illumina.nf'
+include {performHostFilter} from '../modules/utils'
 
 include {makeQCCSV} from '../modules/qc.nf'
 include {writeQCSummaryCSV} from '../modules/qc.nf'
@@ -20,10 +24,6 @@ include {bamToCram} from '../modules/out.nf'
 
 include {collateSamples} from '../modules/upload.nf'
 
-// import subworkflows
-include {CLIMBrsync} from './upload.nf'
-
-
 workflow prepareReferenceFiles {
     // Get reference fasta
     if (params.ref) {
@@ -87,9 +87,14 @@ workflow sequenceAnalysis {
       ch_bedFile
 
     main:
-      readTrimming(ch_filePairs)
 
-      readMapping(readTrimming.out.combine(ch_preparedRef))
+      performHostFilter(ch_filePairs)
+
+      readTrimming(performHostFilter.out)
+
+      filterResidualAdapters(readTrimming.out)
+
+      readMapping(filterResidualAdapters.out.combine(ch_preparedRef))
 
       trimPrimerSequences(readMapping.out.combine(ch_bedFile))
 
@@ -97,6 +102,10 @@ workflow sequenceAnalysis {
 
       makeConsensus(trimPrimerSequences.out.ptrim)
 
+      alignConsensusToReference(makeConsensus.out.combine(ch_preparedRef.map{ it[0] }))
+
+      trimUTRFromAlignment(alignConsensusToReference.out)
+
       makeQCCSV(trimPrimerSequences.out.ptrim.join(makeConsensus.out, by: 0)
                                    .combine(ch_preparedRef.map{ it[0] }))
 
@@ -111,9 +120,7 @@ workflow sequenceAnalysis {
       writeQCSummaryCSV(qc.header.concat(qc.pass).concat(qc.fail).toList())
 
-      collateSamples(qc.pass.map{ it[0] }
-                     .join(makeConsensus.out, by: 0)
-                     .join(trimPrimerSequences.out.mapped))
+      collateSamples(makeConsensus.out.join(performHostFilter.out.fastqPairs))
 
       if (params.outCram) {
         bamToCram(trimPrimerSequences.out.mapped.map{it[0] }
@@ -132,19 +139,9 @@ workflow ncovIllumina {
     main:
       // Build or download fasta, index and bedfile as required
      prepareReferenceFiles()
-
+
       // Actually do analysis
       sequenceAnalysis(ch_filePairs, prepareReferenceFiles.out.bwaindex, prepareReferenceFiles.out.bedfile)
-
-    // Upload files to CLIMB
-    if ( params.upload ) {
-
-      Channel.fromPath("${params.CLIMBkey}")
-            .set{ ch_CLIMBkey }
-
-      CLIMBrsync(sequenceAnalysis.out.qc_pass, ch_CLIMBkey )
-    }
-
 }
 
 workflow ncovIlluminaCram {
diff --git a/workflows/upload.nf b/workflows/upload.nf
deleted file mode 100644
index f645c41d..00000000
--- a/workflows/upload.nf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-include {prepareUploadDirectory} from '../modules/upload.nf'
-include {uploadToCLIMB} from '../modules/upload.nf'
-
-workflow CLIMBrsync {
-    take:
-      ch_uploadDirectories
-      ch_CLIMBkey
-
-    main:
-      prepareUploadDirectory(ch_uploadDirectories.collect())
-      uploadToCLIMB(ch_CLIMBkey.combine(prepareUploadDirectory.out))
-}
-