From 8f5a4af08d482fe1907b24157602b151994becce Mon Sep 17 00:00:00 2001
From: Dan Fornika <dfornika@gmail.com>
Date: Mon, 16 Oct 2023 16:11:54 -0700
Subject: [PATCH] Add support for samplesheet input (#54)

* Add support for samplesheet input

* Update README
---
 README.md            | 20 +++++++++++++++++++-
 conf/illumina.config |  4 ++++
 main.nf              | 16 ++++++++++++----
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c2ed82c5..70d71723 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ This Nextflow pipeline automates the ARTIC network [nCoV-2019 novel coronavirus
 ##### Illumina
 
 ```
-nextflow run BCCDC-PHL/ncov2019-artic-nf [-profile conda,singularity,docker,slurm,lsf] \
+nextflow run BCCDC-PHL/ncov2019-artic-nf -profile conda \
   --illumina --prefix "output_file_prefix" \
   --primer_pairs_tsv /path/to/primer_pairs_tsv \
   --composite_ref /path/to/human_and_sars-cov-2_composite_ref \
@@ -27,6 +27,24 @@ For production use at large scale, where you will run the workflow many times, y
 
 Alternatively you can avoid just the cloning of the scheme repository to remain on a fixed revision of it over time by passing --schemeRepoURL /path/to/own/clone/of/github.com/artic-network/artic-ncov2019. This removes any internet access from the workflow except for the optional upload steps.
 
+###### SampleSheet Input
+
+Samples can also be provided to the pipeline via a `samplesheet.csv` file:
+
+```
+nextflow run BCCDC-PHL/ncov2019-artic-nf -profile conda \
+  --illumina --prefix "output_file_prefix" \
+  --primer_pairs_tsv /path/to/primer_pairs_tsv \
+  --composite_ref /path/to/human_and_sars-cov-2_composite_ref \
+  --samplesheet_input /path/to/samplesheet.csv
+```
+
+The `samplesheet.csv` file must include the headers:
+
+`ID,R1,R2`
+
+Each subsequent record must be a comma-separated line consisting of the sample ID, the path to the R1 fastq file for that sample, and the path to the R2 fastq file for that sample.
+
 ##### Nanopore
 ###### Nanopolish
 
diff --git a/conf/illumina.config b/conf/illumina.config
index f7826afe..5f304a31 100644
--- a/conf/illumina.config
+++ b/conf/illumina.config
@@ -20,6 +20,10 @@ params {
     fastq_exts = ['.fastq.gz', '.fq.gz']
 
     fastqSearchPath = makeFastqSearchPath( params.illuminaSuffixes, params.fastq_exts )
+
+    // Path to a samplesheet.csv (headers: ID,R1,R2) listing sample IDs and fastq paths.
+    // The default 'NO_FILE' is a sentinel meaning no samplesheet was supplied.
+    samplesheet_input = 'NO_FILE'
     
     // Use cram input instead of fastq files
     cram = false
diff --git a/main.nf b/main.nf
index c6d6b332..ffcacce6 100644
--- a/main.nf
+++ b/main.nf
@@ -22,8 +22,9 @@ if (params.profile){
 }
 
 if ( params.illumina ) {
-   if ( !params.directory ) {
+   if ( !params.directory && params.samplesheet_input == "NO_FILE" ) {
        println("Please supply a directory containing fastqs or CRAMs with --directory. Specify --cram if supplying a CRAMs directory")
+       println("Or provide a samplesheet (headers: ID,R1,R2) with --samplesheet_input")
        println("Use --help to print help")
        System.exit(1)
    }
@@ -78,9 +79,16 @@ workflow {
                   .set{ ch_cramFiles }
        }
        else {
-	   Channel.fromFilePairs( params.fastqSearchPath, flat: true)
-	          .filter{ !( it[0] =~ /Undetermined/ ) }
-	          .set{ ch_filePairs }
+           if ( params.samplesheet_input != "NO_FILE" ) {
+		// Samplesheet mode: emit [ID, R1, R2] per CSV record; file() turns the path strings into file objects, as fromFilePairs emits
+		Channel.fromPath(params.samplesheet_input)
+		    .splitCsv(header: true).map{ it -> [it['ID'], file(it['R1']), file(it['R2'])] }
+		    .set{ ch_filePairs }
+	    } else { // no samplesheet: discover fastq pairs via fastqSearchPath, dropping pairs whose ID matches Undetermined
+		Channel.fromFilePairs( params.fastqSearchPath, flat: true)
+	            .filter{ !( it[0] =~ /Undetermined/ ) }
+	            .set{ ch_filePairs }
+	    }
        }
    }
    else {