main.nf

#!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/predictorthologs
========================================================================================
 nf-core/predictorthologs Analysis Pipeline.
 #### Homepage / Documentation
 https://github.com/nf-core/predictorthologs
----------------------------------------------------------------------------------------
*/

/**
 * Print the pipeline banner followed by the command line usage text.
 *
 * Takes no arguments and returns nothing useful; output goes through `log.info`.
 * Every line of the usage text is indented by at least four spaces so that
 * `stripIndent()` removes the same margin from all of them.
 */
def helpMessage() {
    // TODO nf-core: Add to this help message with new command line parameters
    log.info nfcoreHeader()
    log.info"""

    Usage:

    The typical command for running the pipeline is as follows:

    nextflow run nf-core/predictorthologs --reads '*_R{1,2}.fastq.gz' -profile docker

    Mandatory arguments:
      -profile [str]                Configuration profile to use. Can use multiple (comma separated)
                                    Available: conda, docker, singularity, test, awsbatch, <institute> and more
    Input Options:
      Sequencing reads (FASTQ format):
        --reads [file]                Path to input data (must be surrounded with quotes)
        --csv                         Comma-separated variable file containing the columns "sample_id" and "fasta" at minimum
                                      For differential hash expression, the columns "sig" and "group" are also required
      Protein input:
        --input_is_protein            If provided, treat the input as protein fastas and ignore any reads or bams
        --protein_fastas              Path to protein fastas

      Bam + bed file for intersection:
        --bam                         Path to a single bam file whose reads to intersect with the bed
        --bai                         Path to the above bam's bai index file, required for intersection
        --bed                         Path to a bed file containing regions of interest in the bam file

    hash2kmer options:
      --hashes                        Path to file of hashes whose sequence to find in the protein fastas, default None
      --sourmash_ksize               K-mer size to use to find matching k-mers in sequence, default 21
      --sourmash_molecule            Molecule type to use to find matching k-mers in sequence, default "protein"

    Differential hash expression options:
      --diff_hash_expression          If provided, compute enriched hashes in groups using logistic regression, by default don't do it
                                      This requires the --csv option and additional columns of "group" and "sig" in the csv
      --csv_has_is_aligned            If provided, then the --csv provided has a column named "is_aligned" that can be used to
                                      partition the signatures and differential hashes into aligned/unaligned bins

    Options:
      --single_end [bool]             Specifies that the input is single-end reads
      --skip_remove_duplicates_bam    If provided, skip removal of duplicates from bam file

    BLAST-like protein search options                        If not specified in the configuration file or you wish to overwrite any of the references
      --refseq_release        Valid terms from ftp://ftp.ncbi.nlm.nih.gov/refseq/release/,
                                      e.g. "complete", "archaea", "plasmid", "protozoa", "viral".
                                      Default is "vertebrate_mammalian"
      --diamond_protein_fasta         Use all of manually curated, verified UniProt/SwissProt as the reference
                                      proteome for searching for orthologs
      --diamond_database              Pre-created database with DIAMOND
      --diamond_taxonmap_gz           Mapping of protein IDs to taxa
                                      Default is: "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz"
      --diamond_taxdmp_zip            Taxonomy dump file from NCBI
                                      Default is: "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"


    Other options:
      --outdir [file]                 The output directory where the results will be saved
      --email [email]                 Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --email_on_fail [email]         Same as --email, except only send mail if the workflow is not successful
      --max_multiqc_email_size [str]  Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      -name [str]                     Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic

    AWSBatch options:
      --awsqueue [str]                The AWSBatch JobQueue that needs to be set when running on AWSBatch
      --awsregion [str]               The AWS Region for your AWS Batch job to run on
      --awscli [str]                  Path to the AWS CLI tool
    """.stripIndent()
}

// Show help message and stop before any channel or process is set up
if (params.help) {
    helpMessage()
    exit 0
}

/*
 * SET UP CONFIGURATION VARIABLES
 */

// Check if genome exists in the config file
// (only enforced when both a genomes map and a --genome key are present)
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
    exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
}

// TODO nf-core: Add any reference files that are needed
// Configurable reference genomes
//
// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY
// If you want to use the channel below in a process, define the following:
//   input:
//   file fasta from ch_fasta
//
// Resolve the fasta for the selected genome key; false when no genome is
// selected or the selected entry has no fasta
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) }

// Has the run name been specified by the user?
//  this has the bonus effect of catching both -name and --name
// A workflow.runName that does not look like "word_word" is taken to be
// user-supplied and overrides params.name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
    custom_runName = workflow.runName
}

////////////////////////////////////////////////////
/* --                   AWS                    -- */
////////////////////////////////////////////////////
if (workflow.profile.contains('awsbatch')) {
    // AWSBatch sanity checking
    if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
    // Check outdir paths to be S3 buckets if running on AWSBatch
    // related: https://github.com/nextflow-io/nextflow/issues/813
    if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
    // Prevent trace files to be stored on S3 since S3 does not support rolling files.
    if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
}

// Stage config files
// The bundled MultiQC config and output docs must exist; the custom MultiQC
// config is optional and becomes an empty channel when not supplied
ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true)
ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)

////////////////////////////////////////////////////
/* --          Parse input reads               -- */
////////////////////////////////////////////////////

// Read the --hashes file, strip all whitespace from each chunk produced by
// splitText(), and pair every hash with the placeholder group name "hash",
// emitting ("hash", <hash>) tuples.
// NOTE(review): ch_hash_to_group_for_hash2kmer and ch_hashes_for_hash2kmer are
// assigned a second time by the later `if (params.hashes)` section of this
// script, which emits the tuple in the opposite order (<hash>, "hash").
// Confirm which definition the downstream processes are meant to consume.
if (params.hashes) {
  Channel.fromPath(params.hashes)
      .ifEmpty { exit 1, "params.hashes was empty - no input files supplied" }
      .splitText()
      .map{ row -> tuple("hash", row.replaceAll("\\s+", "") )}
      .transpose()
      .dump( tag: 'ch_hash_to_group' )
      .into { ch_hash_to_group_for_joining; ch_hash_to_group_for_hash2kmer }

  // Keep only the hash value (second tuple element) for the hash2kmer/hash2sig inputs
  ch_hash_to_group_for_hash2kmer
    .map{ it -> it[1] }
    .into{ ch_hashes_for_hash2kmer; ch_hashes_for_hash2sig }
}

// Select the input mode. The first matching branch wins:
//   1. bam + bai + bed (and no reads)      -> intersect bam with bed intervals
//   2. bam without bai, dedup not skipped  -> bam goes to duplicate removal
//   3. --input_is_protein                  -> protein fastas (path, csv or protein_fasta_paths)
//   4. otherwise                           -> sequencing reads (csv, readPaths or --reads glob)
if (params.bam && params.bed && params.bai && !(params.reads || params.readPaths )) {
    // params needed for intersection
    log.info "supplied bam, not looking at any supplied --reads"
    Channel.fromPath(params.bai)
        .ifEmpty { exit 1, "params.bai was empty - no input files supplied" }
        .set { ch_bai }
    Channel.fromPath(params.bam)
        .ifEmpty { exit 1, "params.bam was empty - no input files supplied" }
        .combine(ch_bai)
        .set { ch_bam_bai }
    Channel.fromPath(params.bed)
        .ifEmpty { exit 1, "params.bed was empty - no input files supplied" }
        .splitText()
        .map {row -> row.split()}
        .map { row -> [ row[3], row[0], row[1], row[2] ] } // get interval name, chrm, start and stop
        .combine(ch_bam_bai)
        .dump ( tag: 'ch_bam_bai' )
        .set {ch_bed_bam_bai}
} else if (params.bam && !params.skip_remove_duplicates_bam && !params.bai) {
    // deciding if sambamba steps are needed
    log.info "supplied bam and no skip_remove_duplicates flag specified"
    Channel.fromPath(params.bam)
        .ifEmpty { exit 1, "params.bam was empty, no input file supplied" }
        .into { ch_bam_for_dedup }
} else if (params.input_is_protein) {
  log.info 'Using protein fastas as input -- ignoring reads and bams'
  ////////////////////////////////////////////////////
  /* --          Parse protein fastas            -- */
  ////////////////////////////////////////////////////
  if (params.protein_fastas){
    Channel.fromPath(params.protein_fastas)
        .ifEmpty { exit 1, "params.protein_fastas was empty - no input files supplied" }
        .dump ( tag: 'ch_protein_fastas' )
        .set { ch_protein_fastas }
  } else if (params.csv) {
    // Provided a csv file mapping sample_id to protein fasta path
    // (params.input_is_protein is already known to be true in this branch)
    // NOTE(review): this branch emits (sample_id, (fasta)) tuples while the other
    // two emit bare paths, and the diamond mapping below calls file(it) on each
    // item -- confirm that csv input without hashes is a supported combination.
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .map{ row -> tuple(row.sample_id, tuple(file(row.fasta)))}
      .ifEmpty { exit 1, "params.csv (${params.csv}) was empty - no input files supplied" }
      .dump( tag: 'ch_protein_fastas__from_csv' )
      .set { ch_protein_fastas }
  } else if (params.protein_fasta_paths){
    Channel
      .from(params.protein_fasta_paths)
      .map { row -> file(row[1][0], checkIfExists: true) }
      .ifEmpty { exit 1, "params.protein_fasta_paths was empty - no input files supplied" }
      .dump(tag: "protein_fasta_paths")
      .set { ch_protein_fastas }
  }
  if (!(params.diff_hash_expression || params.hashes)) {
    // No hashes - just do a diamond blastp search for each peptide fasta
    // Not extracting the sequences containing hashes of interest
    ch_protein_fastas
      // add false for "hash" part
      .map { it -> tuple(false, 
                         file(it, checkIfExists: true).getBaseName(), 
                         file(it, checkIfExists: true))
            }
      // filter for non empty fasta files
      .filter { it -> it[2].size() > 0 }
      .dump ( tag: 'ch_protein_fastas__ch_protein_seq_for_diamond' )
      .set { ch_protein_seq_for_diamond }
  }

} else {
  // * Create a channel for input read files
  // Every branch below creates the same three channels so that downstream code
  // can rely on them regardless of how the reads were supplied.
  if (params.csv && params.csv_has_reads) {
    // Provided a csv file mapping sample_id to read(s) fastq path
    log.info "supplied csv, not looking at any supplied --reads or readPaths"
    if (params.single_end) {
      Channel
        .fromPath(params.csv)
        .splitCsv(header:true)
        .map{ row -> tuple(row.sample_id, tuple(file(row.read1)))}
        .ifEmpty { exit 1, "params.csv (${params.csv}) was empty - no input files supplied" }
        .dump(tag: "reads_single_end")
        .into { ch_read_files_fastqc; ch_read_files_trimming; ch_read_files_translate }
    } else {
      Channel
        .fromPath(params.csv)
        .splitCsv(header:true)
        .map{ row -> tuple(row.sample_id, tuple(file(row.read1), file(row.read2)))}
        .ifEmpty { exit 1, "params.csv (${params.csv}) was empty - no input files supplied" }
        .dump(tag: "reads_paired_end")
        .into { ch_read_files_fastqc; ch_read_files_trimming; ch_read_files_translate }
    }
  } else if (params.readPaths){
    log.info "supplied readPaths, not looking at any supplied --reads"
    if (params.single_end) {
      Channel
        .from(params.readPaths)
        .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] }
        .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" }
        .dump(tag: "reads_single_end")
        .into { ch_read_files_fastqc; ch_read_files_trimming; ch_read_files_translate }
    } else {
      Channel
        .from(params.readPaths)
        .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] }
        .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" }
        .dump(tag: "reads_paired_end")
        .into { ch_read_files_fastqc; ch_read_files_trimming; ch_read_files_translate }
    }
  } else {
    // Default: glob the --reads pattern. ch_read_files_translate is created here
    // too, matching the csv and readPaths branches above.
    Channel
      .fromFilePairs(params.reads, size: params.single_end ? 1 : 2)
      .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." }
      .dump(tag: "read_paths")
      .into { ch_read_files_fastqc; ch_read_files_trimming; ch_read_files_translate }
  }
}

// Read the --hashes file again, strip all whitespace from each chunk produced
// by splitText(), and emit (<hash>, "hash") tuples into four channels.
// NOTE(review): an earlier `if (params.hashes)` section of this script already
// assigns ch_hash_to_group_for_hash2kmer and ch_hashes_for_hash2kmer, with the
// tuple in the opposite order ("hash", <hash>). The assignments here come later
// in the script -- confirm that re-binding these names is intended.
if (params.hashes){
  Channel.fromPath(params.hashes)
      .ifEmpty { exit 1, "params.hashes was empty - no input files supplied" }
      .splitText()
      .map{ row -> tuple(row.replaceAll("\\s+", ""), "hash" )}
      .transpose()
      .into { ch_hash_to_group_for_joining_after_hash2kmer;
        ch_hash_to_group_for_joining_after_hash2sig;
        ch_hash_to_group_for_hash2kmer;
        ch_hash_to_group_for_hash2sig
       }

  // Keep only the hash value (first tuple element) as the hash2kmer input
  ch_hash_to_group_for_hash2kmer
    .map{ it -> it[0] }
    .set{ ch_hashes_for_hash2kmer }
}


// Utility functions for sanitizing output
/**
 * Turn a group label into a lower-case token without spaces or slashes.
 *
 * @param group  group label string
 * @return the label with ' ' replaced by '_', '/' replaced by '-slash-', lower-cased
 */
def groupCleaner(group) {
  def withoutSpaces = group.replaceAll(' ', '_')
  def withoutSlashes = withoutSpaces.replaceAll('/', '-slash-')
  withoutSlashes.toLowerCase()
}

/**
 * Remove every newline character from a hash string.
 *
 * @param hash  hash value as a string, possibly carrying line breaks
 * @return the same string with all newlines removed
 */
def hashCleaner(hash) {
  // '\\n' is the two-character regex \n, which matches a newline
  def newlinePattern = '\\n'
  hash.replaceAll(newlinePattern, '')
}

////////////////////////////////////////////////////
/* --   Parse aligned/unaligned csv column     -- */
////////////////////////////////////////////////////
// Only runs with --csv_has_is_aligned, and requires --csv.
// Builds channels from the csv's "is_aligned" column.
if (params.csv_has_is_aligned) {
  if (params.csv) {
    // Split csv rows into an "aligned" and an "unaligned" branch based on the
    // is_aligned column; rows with any other value match neither condition
    Channel
      .fromPath ( params.csv )
      .splitCsv ( header:true )
      .branch { row ->
        aligned: row.is_aligned == "aligned"
        unaligned: row.is_aligned == "unaligned"
      }
      .set { ch_csv_is_aligned }

    // Create channel of unaligned signatures per group: (group, [sig, sig, ...])
    // The first ifEmpty fires when no row has is_aligned == 'unaligned'
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .filter{ row -> row.is_aligned == 'unaligned' }
      .ifEmpty { exit 1, "is_aligned column can contain only aligned/unaligned values"}
      .dump( tag: 'csv_unaligned' )
      .map{ row -> tuple(row.group, file(row.sig, checkIfExists: true)) }
      .ifEmpty { exit 1, "params.csv (${params.csv}) 'group' or 'sig' column was empty" }
      .groupTuple()
      .dump( tag: 'ch_per_group_unaligned_sig' )
      .set{ ch_per_group_unaligned_sig }

    // One (group, sample_id, sig, fasta) tuple per unaligned csv row
    ch_csv_is_aligned.unaligned
      .dump( tag: 'ch_csv_is_aligned.unaligned' )
      .map{ row -> tuple(row.group, row.sample_id, row.sig, row.fasta) }
      .dump( tag: 'ch_unaligned_sig_fasta' )
      .into { ch_unaligned_sig_fasta }

  } else {
    exit 1, "Must provide --csv when doing filtering for aligned/unaligned hashes"
  }
}

////////////////////////////////////////////////////
/* --    Parse differential hash expression    -- */
////////////////////////////////////////////////////
// Only runs with --diff_hash_expression, and requires --csv.
// Every channel below is built by re-reading the same csv and using its
// "sig", "fasta" and "group" columns.
if (params.diff_hash_expression) {
  if (params.csv) {
    // Create metadata csv channel
    ch_csv = Channel.fromPath(params.csv)

    // Create channel of all signatures, but a list within a list
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .map{ row -> file(row.sig, checkIfExists: true) }
      .ifEmpty { exit 1, "params.csv (${params.csv}) 'sig' column was empty" }
      .collect()
      .map{ it -> [it] }   // Nest within a list so the combine() step keeps all the signatures together
      // [DUMP: ch_all_signatures_flat_list_for_diff_hash]
      //    [[MACA_24m_M_BM_60__unaligned__CCACCTAAGTCCAGGA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      MACA_24m_M_BM_60__unaligned__AGTTGGTCAAATCCGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      10X_P1_14__unaligned__ACGGCCAAGCGTTGCC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      MACA_24m_M_BM_58__unaligned__CTAGTGAGTCCAACTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      MACA_24m_M_SPLEEN_59__unaligned__GCGACCAGTCATCGGC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      10X_P4_2__unaligned__GACGTTACACCCATGG_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      MACA_24m_M_HEPATOCYTES_58__unaligned__GCAGCCAAGTAGCGGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      MACA_21m_F_NPC_54__unaligned__CCCAGTTTCGTAGATC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      10X_P4_2__unaligned__ATCGAGTCACCAGTTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //      10X_P5_0__unaligned__TCCACACCACATTTCT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig]]
      .dump( tag: "ch_all_signatures_flat_list_for_diff_hash" )
      .into{ ch_all_signatures_flat_list_for_diff_hash }

    // Create channel of all signatures, completely flattened
    // (a single collected list, without the extra nesting used above)
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .map{ row -> file(row.sig, checkIfExists: true) }
      .ifEmpty { exit 1, "params.csv (${params.csv}) 'sig' column was empty" }
      .collect()
      .into{ ch_all_signatures_flattened_for_finding_matches }

    // Create channel of fastas per group: (group, [fasta, fasta, ...])
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .map{ row -> tuple(row.group, file(row.fasta, checkIfExists: true)) }
      .ifEmpty { exit 1, "params.csv (${params.csv}) 'fasta' column was empty" }
      .groupTuple()
      .dump( tag: 'ch_group_to_fasta' )
      .set{ ch_group_to_fasta }


    // Pair every unique group with the complete list of signatures:
    // (group, [all signatures]) -- see the example dumps below
    Channel
      .fromPath(params.csv)
      .splitCsv(header:true)
      .map{ row -> tuple(row.group) }
      .unique()
      .ifEmpty { exit 1, "params.csv (${params.csv}) 'group' column was empty" }
      .dump(tag: 'csv_unique_groups')
      // [DUMP: csv_unique_groups] ['Mostly marrow unaligned']
      // [DUMP: csv_unique_groups] ['Liver unaligned']
      .combine( ch_all_signatures_flat_list_for_diff_hash )
      .dump(tag: 'ch_groups_with_all_signatures_for_diff_hash')
      // [DUMP: ch_groups_with_all_signatures_for_diff_hash]
      //    ['Mostly marrow unaligned',
      //      [MACA_24m_M_BM_60__unaligned__CCACCTAAGTCCAGGA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       MACA_24m_M_BM_60__unaligned__AGTTGGTCAAATCCGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       10X_P1_14__unaligned__ACGGCCAAGCGTTGCC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       MACA_24m_M_BM_58__unaligned__CTAGTGAGTCCAACTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       MACA_24m_M_SPLEEN_59__unaligned__GCGACCAGTCATCGGC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       10X_P4_2__unaligned__GACGTTACACCCATGG_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       MACA_24m_M_HEPATOCYTES_58__unaligned__GCAGCCAAGTAGCGGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       MACA_21m_F_NPC_54__unaligned__CCCAGTTTCGTAGATC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       10X_P4_2__unaligned__ATCGAGTCACCAGTTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //       10X_P5_0__unaligned__TCCACACCACATTTCT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig]]
      // [DUMP: ch_groups_with_all_signatures_for_diff_hash]
      //  ['Liver unaligned',
      //    [MACA_24m_M_BM_60__unaligned__CCACCTAAGTCCAGGA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     MACA_24m_M_BM_60__unaligned__AGTTGGTCAAATCCGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     10X_P1_14__unaligned__ACGGCCAAGCGTTGCC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     MACA_24m_M_BM_58__unaligned__CTAGTGAGTCCAACTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     MACA_24m_M_SPLEEN_59__unaligned__GCGACCAGTCATCGGC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     10X_P4_2__unaligned__GACGTTACACCCATGG_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     MACA_24m_M_HEPATOCYTES_58__unaligned__GCAGCCAAGTAGCGGT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     MACA_21m_F_NPC_54__unaligned__CCCAGTTTCGTAGATC_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     10X_P4_2__unaligned__ATCGAGTCACCAGTTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //     10X_P5_0__unaligned__TCCACACCACATTTCT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig]]
      .into { ch_groups_with_all_signatures_for_diff_hash }
    // exit 1, "testing"
  } else {
    exit 1, "--csv is required for differential hash expression!"
  }
}

////////////////////////////////////////////////////
/* --        Parse reference proteomes         -- */
////////////////////////////////////////////////////

// --- Parse Translate parameters ---
save_translate_csv = params.save_translate_csv
save_translate_json = params.save_translate_json

// Each optional reference file below gets its own channel, created only when
// the corresponding parameter is set. A missing file aborts the run.

// Peptide fasta used by the translate step
if (params.proteome_translate_fasta) {
  Channel
    .fromPath(params.proteome_translate_fasta, checkIfExists: true)
    .ifEmpty { exit 1, "Peptide fasta file not found: ${params.proteome_translate_fasta}" }
    .set { ch_proteome_translate_fasta }
}

// Reference proteome, shared by the diamond and sourmash channels
if (params.proteome_search_fasta) {
  Channel
    .fromPath(params.proteome_search_fasta, checkIfExists: true)
    .ifEmpty { exit 1, "Reference proteome fasta file not found: ${params.proteome_search_fasta}" }
    .into { ch_diamond_reference_fasta; ch_sourmash_reference_fasta }
}

// Diamond taxonomy inputs
if (params.taxonmap_gz) {
  Channel
    .fromPath(params.taxonmap_gz, checkIfExists: true)
    .ifEmpty { exit 1, "Diamond Taxon map file not found: ${params.taxonmap_gz}" }
    .set { ch_diamond_taxonmap_gz }
}
if (params.taxdmp_zip) {
  Channel
    .fromPath(params.taxdmp_zip, checkIfExists: true)
    .ifEmpty { exit 1, "Diamond taxon dump file not found: ${params.taxdmp_zip}" }
    .set { ch_diamond_taxdmp_zip }
}

// Pre-built search databases
if (params.diamond_database) {
  Channel
    .fromPath(params.diamond_database, checkIfExists: true)
    .ifEmpty { exit 1, "Diamond database file not found: ${params.diamond_database}" }
    .set { ch_diamond_db }
}
if (params.sourmash_index) {
  Channel
    .fromPath(params.sourmash_index, checkIfExists: true)
    .ifEmpty { exit 1, "Sourmash SBT Index file not found: ${params.sourmash_index}" }
    .set { ch_sourmash_index }
}

// Non-coding search: the Infernal database goes to a different channel
// depending on whether its path has a 'gz' extension
if (params.search_noncoding && params.infernal_db) {
  def infernalDbIsGzipped = hasExtension(params.infernal_db, 'gz')
  def infernalDbSource = Channel
    .fromPath(params.infernal_db, checkIfExists: true)
    .ifEmpty { exit 1, "Infernal database file not found: ${params.infernal_db}" }
  if (infernalDbIsGzipped) {
    infernalDbSource.set { ch_infernal_db_gz }
  } else {
    infernalDbSource.set { ch_infernal_db }
  }
}

if (params.search_noncoding && params.rfam_clan_info) {
  Channel
    .fromPath(params.rfam_clan_info, checkIfExists: true)
    .ifEmpty { exit 1, "Rfam Clan Information file not found: ${params.rfam_clan_info}" }
    .set { ch_rfam_clan_info }
}

//////////////////////////////////////////////////////////////////
/* -     Parse translate and diamond parameters         -- */
//////////////////////////////////////////////////////////////////
// Both parameters are treated as comma-separated lists and turned into one
// channel item per value. The trailing .view() prints every value.
// NOTE(review): only the first call uses safe navigation, so an unset parameter
// would still reach .tokenize() on null -- confirm defaults are always provided.
ch_peptide_ksize = Channel.from(params.translate_peptide_ksize?.toString().tokenize(',')).view()
ch_peptide_molecule = Channel.from(params.translate_peptide_molecule?.toString().tokenize(',')).view()

// Make cartesian product of molecule and ksize
ch_peptide_molecule
  .combine(ch_peptide_ksize)
  .dump ( tag: 'ch_translate_molecule_ksize' )
  .set { ch_translate_molecule_ksize }

// Script-level copies of translate/refseq parameters
jaccard_threshold = params.translate_jaccard_threshold
refseq_release = params.refseq_release
tablesize = params.translate_tablesize

//////////////////////////////////////////////////////////////////
/* -        Parse sourmash/hash2kmer parameters              -- */
//////////////////////////////////////////////////////////////////
sourmash_ksize = params.sourmash_ksize
sourmash_molecule = params.sourmash_molecule
sourmash_log2_sketch_size = params.sourmash_log2_sketch_size

//////////////////////////////////////////////////////////////////
/* -        Summarize reference proteome parameters          -- */
//////////////////////////////////////////////////////////////////
// A reference is "provided" when either a search fasta or a refseq release is given
provided_reference_proteome = params.proteome_search_fasta || params.refseq_release
// A pre-built diamond database or sourmash index counts as an existing reference
existing_reference = params.diamond_database || params.sourmash_index
// Refseq is downloaded only when there is no existing reference and no search
// fasta, and a refseq release has been named
need_refseq_download = !existing_reference && !params.proteome_search_fasta && params.refseq_release


//////////////////////////////////////////////////////////////////
/* -   Parse differential hash expression parameters         -- */
//////////////////////////////////////////////////////////////////
diff_hash_with_abundance = params.diff_hash_with_abundance
diff_hash_inverse_regularization_strength = params.diff_hash_inverse_regularization_strength
diff_hash_solver = params.diff_hash_solver
diff_hash_penalty = params.diff_hash_penalty
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                       HEADER LOG INFO                               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Build the ordered run summary that is printed at start-up. Keys are added in
// display order; an entry appears only when its guarding parameter is set.
using_hashes = params.diff_hash_expression || params.hashes
log.info nfcoreHeader()
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name']         = custom_runName ?: workflow.runName
// Input is sequencing reads --> need to convert to protein
if (params.csv) summary['CSV of samples']                                   = params.csv
if (params.bam) summary['bam']                                              = params.bam
// Report the bai only when one was given; a bam without bai is a valid mode
if (params.bai) summary['bai']                                              = params.bai
if (params.bed) summary['bed']                                              = params.bed
if (params.reads) summary['Reads']                                          = params.reads
if (params.csv) summary['CSV of input reads']                               = params.csv
if (!params.input_is_protein) summary['sencha translate Ref']              = params.proteome_translate_fasta
// Input is protein -- have protein sequences and hashes
summary['Diff Hash']                                                        = params.diff_hash_expression
if (params.hashes) summary['Hashes']                                        = params.hashes
if (using_hashes) summary['sourmash ksize']                                 = params.sourmash_ksize
if (using_hashes) summary['sourmash molecule']                              = params.sourmash_molecule
if (params.diff_hash_expression) summary['Diff Hash abundance?']            = params.diff_hash_with_abundance
if (params.diff_hash_expression) summary['Diff Hash C']                     = params.diff_hash_inverse_regularization_strength
if (params.diff_hash_expression) summary['Diff Hash solver']                = params.diff_hash_solver
if (params.diff_hash_expression) summary['Diff Hash penalty']               = params.diff_hash_penalty
if (params.protein_fastas) summary['Input protein fastas']                  = params.protein_fastas
// How the DIAMOND search database is created
if (params.proteome_search_fasta) summary['Proteome search ref']            = params.proteome_search_fasta
summary['Protein searcher']                                                 = params.protein_searcher
// 'Hashes', 'sourmash ksize' and 'sourmash molecule' are already set above,
// so they are not assigned a second time here
if (need_refseq_download) summary['Refseq release']                         = params.refseq_release
if (params.diamond_database) summary['DIAMOND pre-build database']          = params.diamond_database
if (params.protein_searcher == 'diamond') summary['Map sequences to taxon'] = params.taxonmap_gz
if (params.protein_searcher == 'diamond') summary['Taxonomy database dump'] = params.taxdmp_zip
summary['Data Type']        = params.single_end ? 'Single-End' : 'Paired-End'
summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir']       = params.outdir
summary['Launch dir']       = workflow.launchDir
summary['Working dir']      = workflow.workDir
summary['Script dir']       = workflow.projectDir
summary['User']             = workflow.userName
if (workflow.profile.contains('awsbatch')) {
    summary['AWS Region']   = params.awsregion
    summary['AWS Queue']    = params.awsqueue
    summary['AWS CLI']      = params.awscli
}
summary['Config Profile'] = workflow.profile
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config URL']         = params.config_profile_url
if (params.email || params.email_on_fail) {
    summary['E-mail Address']    = params.email
    summary['E-mail on failure'] = params.email_on_fail
    summary['MultiQC maxsize']   = params.max_multiqc_email_size
}
// One "key: value" line per entry, keys padded to a common width
log.info summary.collect { k,v -> "${k.padRight(25)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles
checkHostname()

/**
 * Write the run summary to 'workflow_summary_mqc.yaml' in the work directory.
 *
 * The file holds a small YAML header and an HTML definition list with one
 * <dt>/<dd> pair per summary entry. Entries whose value is falsy (null, false,
 * empty string, 0) are rendered as a grey "N/A" placeholder.
 * Presumably consumed by MultiQC as custom content -- confirm against the
 * MultiQC configuration.
 *
 * @param summary  map of summary label to value
 * @return the path of the written YAML file
 */
def create_workflow_summary(summary) {
    def yaml_file = workDir.resolve('workflow_summary_mqc.yaml')
    yaml_file.text  = """
    id: 'nf-core-predictorthologs-summary'
    description: " - this information is collected when the pipeline is started."
    section_name: 'czbiohub/nf-predictorthologs Workflow Summary'
    section_href: 'https://github.com/czbiohub/predictorthologs'
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")}
        </dl>
    """.stripIndent()

   return yaml_file
}


/*
 * Parse software version numbers
 *
 * Writes one v_<tool>.txt file per tool, then runs scrape_software_versions.py
 * and captures its output as software_versions_mqc.yaml. Only outputs whose
 * name contains ".csv" are published to <outdir>/pipeline_info; the YAML is
 * passed on through ch_software_versions_yaml.
 * NOTE(review): software_versions.csv is declared as an output but not written
 * by any command shown here -- presumably scrape_software_versions.py creates it.
 */
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy',
        saveAs: { filename ->
                      if (filename.indexOf(".csv") > 0) filename
                      else null
                }

    output:
    file 'software_versions_mqc.yaml' into ch_software_versions_yaml
    file "software_versions.csv"

    script:
    // TODO nf-core: Get all tools to print their version number here
    // Example version output of the tools, for reference:
    // (base) root@aa580bfc0d2f:/# fastp --version
    // fastp 0.20.0
    // (base) root@aa580bfc0d2f:/# diamond version
    // diamond v0.9.30.131 (C) Max Planck Society for the Advancement of Science
    // Documentation, support and updates available at http://www.diamondsearch.org
    //
    // diamond version 0.9.30
    // (base) root@aa580bfc0d2f:/# samtools --version
    // samtools 1.10
    // Using htslib 1.10.2
    // Copyright (C) 2019 Genome Research Ltd.
    """
    echo $workflow.manifest.version > v_pipeline.txt
    echo $workflow.nextflow.version > v_nextflow.txt
    fastqc --version > v_fastqc.txt
    multiqc --version > v_multiqc.txt
    fastp --version > v_fastp.txt
    diamond version > v_diamond.txt
    samtools --version > v_samtools.txt
    sourmash -v &> v_sourmash.txt
    pip show sencha &> v_sencha.txt
    scrape_software_versions.py &> software_versions_mqc.yaml
    """
}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               PREPROCESSING SAMBAMBA DEDUPLICATION                  -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

if (params.bam && !(params.skip_remove_duplicates_bam || params.bai)){
    // Remove duplicate reads from the input bam with sambamba markdup.
    // Emits [prefix, deduplicated bam] to both the indexing and the fastq steps.
    process sambamba_dedup {
        label "process_high"
        tag "${prefix}"
        publishDir "${params.outdir}/sambamba_dedup", mode: 'copy'

        input:
        file(bam) from ch_bam_for_dedup

        output:
        tuple val(prefix), file(bam_dedup) into ch_dedup_bam_for_index, ch_dedup_bam_for_samtools_fastq

        script:
        // Output is named after the input bam, with a "_dedup" suffix
        prefix = "${bam.baseName}_dedup"
        bam_dedup = "${prefix}.bam"
        // The task's whole memory allocation (in megabytes) is used as the sort buffer size
        sort_buffer_mb = task.memory.toMega()
        """
        sambamba markdup --remove-duplicates --sort-buffer-size ${sort_buffer_mb} --nthreads ${task.cpus} ${bam} ${bam_dedup}
        """
    }
}

if (params.bam && !(params.skip_remove_duplicates_bam || params.bai)){
    // Build the bai index for the deduplicated bam produced by sambamba_dedup.
    process sambamba_index {
        label "process_medium"
        tag "${bam_name}"
        publishDir "${params.outdir}/sambamba_index", mode: 'copy'

        input:
        tuple val(bam_name), file(bam_dedup) from ch_dedup_bam_for_index

        output:
        file(bai_dedup) into ch_dedup_bai

        script:
        // Index is named "<bam_name>.bai", i.e. next to "<bam_name>.bam"
        bai_dedup = "${bam_name}.bai"
        """
        sambamba index  --nthreads ${task.cpus} ${bam_dedup} ${bai_dedup}
        """
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               SAMTOOLS VIEW GENOMIC REGION TO FASTQ                 -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 0 - samtools view
 */

if (params.bam && !params.bed && !params.bai && !params.skip_remove_duplicates_bam) {
    // No bed file and no bai: convert every read of the deduplicated bam to a
    // gzipped fastq.
    process samtools_fastq_no_intersect {
        label "process_low"
        tag "$bam_name"
        publishDir "${params.outdir}/intersect_fastqs", mode: 'copy'

        input:
        tuple val(bam_name), file(bam_dedup) from ch_dedup_bam_for_samtools_fastq

        output:
        tuple val(bam_name), file(fastq) into ch_intersected

        script:
        fastq = "${bam_name}.fastq.gz"
        """
          samtools fastq -N ${bam_dedup} \\
          | gzip -c > ${fastq}
        """
    }

    // gzipped files are 20 bytes when empty, so anything that small has no reads
    ch_intersected
        .filter { sample -> sample[1].size() > 20 }
        .into { ch_read_files_fastqc; ch_read_files_trimming }
} else if (params.bam && params.bed && params.bai) {
    // Bam + bai + bed: extract the reads of each bed interval to its own
    // gzipped fastq.
    process samtools_view_fastq {
        label "process_low"
        tag "$interval_name"
        publishDir "${params.outdir}/intersect_fastqs", mode: 'copy'

        input:
        tuple val(interval_name), val(chrom), val(chromStart), val(chromEnd), file(bam), file(bai) from ch_bed_bam_bai

        output:
        tuple val(interval_name), file(fastq) into ch_intersected

        script:
        fastq = "${interval_name}.fastq.gz"
        """
        samtools view -hu ${bam} '${chrom}:${chromStart}-${chromEnd}' \\
          | samtools fastq -N - \\
          | gzip -c > ${fastq}
        """
    }

    // gzipped files are 20 bytes when empty, so anything that small has no reads
    ch_intersected
        .filter { interval -> interval[1].size() > 20 }
        .into { ch_read_files_fastqc; ch_read_files_trimming }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        FASTQ QC                                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 1 - FastQC
 */
if (!(params.input_is_protein || params.skip_fastqc)) {
  // Run FastQC on each read set; reports are collected in ch_fastqc_results.
  process fastqc {
      label 'process_medium'
      tag "$name"
      // Zip archives are published under zips/, everything else at the top level
      publishDir "${params.outdir}/fastqc",
          mode: 'copy',
          saveAs: { filename ->
              if (filename.indexOf(".zip") > 0) "zips/$filename"
              else "$filename"
          }

      input:
      tuple val(name), file(reads) from ch_read_files_fastqc

      output:
      file "*_fastqc.{zip,html}" into ch_fastqc_results

      script:
      """
      fastqc --quiet --threads $task.cpus $reads
      """
  }
} else {
  // FastQC not run: give downstream consumers an empty channel instead
  ch_fastqc_results = Channel.empty()
}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        ADAPTER TRIMMING                             -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 2 - fastp for read trimming
 */

if (!params.skip_trimming && !(params.input_is_protein || params.protein_fastas || params.protein_fasta_paths) ){
  /*
   * Trim reads with fastp.
   *
   * Input:  [name, reads] where reads is one file (single end) or a pair.
   * Output: [name, trimmed fastq(s)] into ch_reads_trimmed, plus the fastp
   *         json and html reports.
   *
   * `reads[1] == null` is the single-end test: there is no second file.
   * Anything else is handled as paired end, using the first two files.
   */
  process fastp {
      label 'process_low'
      tag "$name"
      // Reports (anything that is not a .fastq.gz) are published under logs/;
      // trimmed reads are split into single_end/ and paired_end/.
      publishDir "${params.outdir}/fastp", mode: 'copy',
        saveAs: {filename ->
                    if (filename.indexOf(".fastq.gz") == -1) "logs/$filename"
                    else if (reads[1] == null) "single_end/$filename"
                    else "paired_end/$filename"
                }

      input:
      set val(name), file(reads) from ch_read_files_trimming

      output:
      set val(name), file("*trimmed.fastq.gz") into ch_reads_trimmed
      file "*fastp.json" into ch_fastp_results
      file "*fastp.html" into ch_fastp_html

      script:
      if (reads[1] == null) {
          // One set of reads --> single end
          """
          fastp \\
              --low_complexity_filter \\
              --trim_poly_x \\
              --in1 ${reads} \\
              --out1 ${name}_R1_trimmed.fastq.gz \\
              --json ${name}_fastp.json \\
              --html ${name}_fastp.html
          """
      } else {
          // More than one set of reads --> paired end
          """
          fastp \\
              --low_complexity_filter \\
              --trim_poly_x \\
              --in1 ${reads[0]} \\
              --in2 ${reads[1]} \\
              --out1 ${name}_R1_trimmed.fastq.gz \\
              --out2 ${name}_R2_trimmed.fastq.gz \\
              --json ${name}_fastp.json \\
              --html ${name}_fastp.html
          """
      }
  }
} else if (!params.input_is_protein) {
  // Trimming not run on nucleotide input: pass the untrimmed reads straight through
  ch_reads_trimmed = ch_read_files_trimming
  ch_fastp_results = Channel.empty()
} else {
  // Protein input: there are no reads to trim and no fastp reports
  ch_fastp_results = Channel.empty()
}


if (!(params.input_is_protein || params.protein_fastas || params.protein_fasta_paths) && params.protein_searcher == 'diamond'){
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --     PREPARE PEPTIDE DATABASE TO PREDICT PROTEIN-CODING READS        -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 2 - sencha index
   */
  // Build one sencha bloom filter of the reference peptides per
  // (molecule, ksize) combination.
  process make_protein_index {
    label "process_low"
    tag "${peptides}__${bloom_id}"

    publishDir "${params.outdir}/sencha/", mode: 'copy'

    input:
    file(peptides) from ch_proteome_translate_fasta.collect()
    tuple val(molecule), val(ksize) from ch_translate_molecule_ksize

    output:
    tuple val(bloom_id), val(molecule),  val(ksize), file(bloom_filter) into ch_sencha_bloom_filters

    script:
    // bloom_id identifies the parameter combination; it is also part of the filename
    bloom_id = "molecule-${molecule}_ksize-${ksize}"
    bloom_filter = "${peptides.simpleName}__${bloom_id}.bloomfilter"
    """
    sencha index \\
      --tablesize ${tablesize} \\
      --molecule ${molecule} \\
      --peptide-ksize ${ksize} \\
      --save-as ${bloom_filter} \\
      ${peptides}
    """
  }

  // Pair every bloom filter with every trimmed read set so that translate runs
  // on ALL (bloom filter, sample) combinations (approach suggested by Paolo)
  ch_sencha_bloom_filters
    .groupTuple(by: [0, 1, 2])
    .combine(ch_reads_trimmed)
    .dump(tag: 'ch_sencha_bloom_filters_grouptuple')
    // Example item:
    // [DUMP: ch_sencha_bloom_filters_grouptuple]
    //    [molecule-protein_ksize-12,
    //     'protein',
    //      '12',
    //      [ncbi_refseq_vertebrate_mammalian_ptprc_plus__np_only__molecule-protein_ksize-12.bloomfilter],
    //    'bonobo_liver_ptprc',
    //    bonobo_liver_ptprc_R1_trimmed.fastq.gz]
    .set { ch_sencha_bloom_filters_grouptuple }


  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                   PREDICT PROTEIN-CODING READS                      -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 3 - sencha translate
   */
  /*
   * Run `sencha translate` on one (bloom filter, sample) combination.
   *
   * Input:  [bloom_id, alphabet, ksize, bloom filter, sample_id, reads]
   * Output: one channel per result file, each keyed by
   *         sample_sketch_id = "<sample_id>__<bloom_id>":
   *           - noncoding nucleotide fasta
   *           - peptide fasta (sencha's stdout)
   *           - coding nucleotide fasta
   *           - coding scores csv
   *           - coding summary json
   *
   * Every file is always emitted into its channel. Publishing of the csv and
   * the json to the output directory is controlled by save_translate_csv and
   * save_translate_json respectively.
   */
  process translate {
    tag "${sample_sketch_id}"
    label "process_low"
    label "process_long"
    publishDir "${params.outdir}/translate/${bloom_id}", mode: 'copy',
      saveAs: { filename ->
          // Returning null tells publishDir to skip the file
          if (filename.indexOf(".csv") > 0) return save_translate_csv ? filename : null
          if (filename.indexOf(".json") > 0) return save_translate_json ? filename : null
          return filename
      }

    input:
    tuple \
        val(bloom_id), val(alphabet), val(ksize), file(bloom_filter),  \
        val(sample_id), file(reads) \
        from ch_sencha_bloom_filters_grouptuple

    output:
    // TODO also extract nucleotide sequence of coding reads and do sourmash compute using only DNA on that?
    set val(sample_sketch_id), file(noncoding_nucleotides) into ch_noncoding_nucleotides_potentially_empty
    // Set first value to "false" so it's not treated as a differential hash, and only the sample_id is considered
    set val(false), val(sample_sketch_id), file(peptides_fasta) into ch_translated_proteins_potentially_empty
    set val(sample_sketch_id), file(coding_nucleotides) into ch_coding_nucleotides
    set val(sample_sketch_id), file(coding_scores) into ch_coding_scores_csv
    set val(sample_sketch_id), file(summary_json) into ch_coding_scores_json

    script:
    sample_sketch_id = "${sample_id}__${bloom_id}"
    noncoding_nucleotides = "${sample_sketch_id}__noncoding_reads_nucleotides.fasta"
    coding_nucleotides = "${sample_sketch_id}__coding_reads_nucleotides.fasta"
    peptides_fasta = "${sample_sketch_id}__coding_reads_peptides.fasta"
    coding_scores = "${sample_sketch_id}__coding_scores.csv"
    summary_json = "${sample_sketch_id}__coding_summary.json"
    """
    sencha translate \\
      --molecule ${alphabet} \\
      --peptide-ksize ${ksize} \\
      --jaccard-threshold ${jaccard_threshold} \\
      --noncoding-nucleotide-fasta ${noncoding_nucleotides} \\
      --coding-nucleotide-fasta ${coding_nucleotides} \\
      --csv ${coding_scores} \\
      --json-summary ${summary_json} \\
      --peptides-are-bloom-filter \\
      ${bloom_filter} \\
      ${reads} > ${peptides_fasta}
    """
  }

  // Drop translations that produced no peptides (empty fasta)
  // item[0] = false (placeholder where a differential hash would otherwise go)
  // item[1] = sample sketch id, "<sample_id>__<bloom_id>"
  // item[2] = peptide fasta file
  ch_translated_proteins_potentially_empty
    .filter { item -> item[2].size() > 0 }
    .dump(tag: "ch_translated_proteins_potentially_empty")
    // Item layout: [false, <sample_sketch_id>, <sample_sketch_id>__coding_reads_peptides.fasta]
    .set { ch_protein_seq_for_diamond }

  // Drop empty noncoding nucleotide fastas
  // item[0] = sample sketch id
  // item[1] = noncoding nucleotide fasta file
  ch_noncoding_nucleotides_potentially_empty
    .filter { item -> item[1].size() > 0 }
    .set { ch_noncoding_nucleotides }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --             PERFORM DIFFERENTIAL HASH EXPRESSION                    -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 4 - differential hash expression: find the informative hashes of each group
 */
 if (params.input_is_protein && params.csv && params.diff_hash_expression){
  // For each group, run differential hash expression over all signatures to
  // find the hashes that are informative for that group
  // Run differential_hash_expression.py for one group against the signature
  // metadata, producing a log, hash coefficient csv(s) and the list(s) of
  // informative hashes for that group.
  process diff_hash {
    label "process_medium"
    tag "${group_cleaned}"

    // NOTE(review): the publish directory uses the raw `group` while the tag and
    // the log filename use `group_cleaned` - confirm this is intentional.
    publishDir "${params.outdir}/diff_hash/${group}", mode: 'copy'

    input:
    tuple val(group), file(all_signatures) from ch_groups_with_all_signatures_for_diff_hash
    file metadata from ch_csv.collect()

    output:
    file("${group_cleaned}.log")
    file("*__hash_coefficients.csv")
    tuple val(group), file("*__informative_hashes.txt") into ch_informative_hashes_files

    script:
    group_cleaned = groupCleaner(group)
    // Only pass the abundance option when it was requested
    with_abundance = diff_hash_with_abundance ? '--with-abundance' : ''
    """
    differential_hash_expression.py \\
        --ksize ${sourmash_ksize} \\
        --input-is-protein \\
        --n-jobs ${task.cpus} \\
        --group1 '${group}' \\
        --${sourmash_molecule} \\
        --no-dna \\
        --metadata-csv ${metadata} \\
        --use-sig-basename \\
        --penalty ${diff_hash_penalty} \\
        --solver ${diff_hash_solver} \\
        --max-group-size 100 \\
        ${with_abundance} \\
        --inverse-regularization-strength ${diff_hash_inverse_regularization_strength} \\
        > '${group_cleaned}.log'
    """
  }
  // Explode each group's informative-hashes file into one [hash, group] item
  // per line, then fan the result out to every consumer.
  ch_informative_hashes_files
      .dump(tag: 'ch_informative_hashes_files')
      // [group_name, text_file]
      .map { group, hashes_file -> tuple(hashes_file.splitText(), group) }
      // [['123\n', '456\n', '789\n'], group]
      .dump(tag: 'ch_informative_hashes_files_split')
      .transpose()
      // ['123\n', group]
      // ['456\n', group]
      // ['789\n', group]
      .dump(tag: 'ch_hash_to_group')
      .into {
        ch_hash_to_group_for_finding_matches;
        ch_hash_to_group_for_joining_after_hash2kmer;
        ch_hash_to_group_for_joining_after_hash2sig;
        ch_hash_to_group_for_hash2kmer;
        ch_hash_to_group_for_hash2sig }

  // Keep each distinct hash once, without its group
  ch_hash_to_group_for_finding_matches
    .map { hash, group -> hash }
    .unique()
    .into { ch_informative_hashes_flattened }


  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                 FIND SIGNATURES CONTAINING HASHES                   -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
  * STEP 7 - Find signatures containing hashes
  */
  /*
   * List the signature files that contain a given hash.
   *
   * Input:  one informative hash, plus the signature files to search.
   * Output: "hash-<hash>__matches.txt", the names of the files ripgrep found
   *         the hash in.
   *
   * The hash is matched as a whole word (--word-regexp) so that it cannot
   * match as a substring of a longer number.
   * NOTE(review): assumes hashes in the signature files are delimited by
   * non-word characters (e.g. commas or brackets) - confirm against the
   * signature format.
   */
  process sigs_with_hash {
    tag "${hash_id}"
    label "process_low"

    publishDir "${params.outdir}/diff_hash/sigs_with_hash", mode: 'copy'

    input:
    val(hash) from ch_informative_hashes_flattened
    file(sigs) from ch_all_signatures_flattened_for_finding_matches

    output:
    file("*__matches.txt")

    script:
    hash_cleaned = hashCleaner(hash)
    hash_id = "hash-${hash_cleaned}"
    matches = "${hash_id}__matches.txt"
    """
    rg --threads ${task.cpus} --files-with-matches --word-regexp ${hash_cleaned} ${sigs} \\
      > ${matches}
    """
  }
}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --              DOWNLOAD REFSEQ REFERENCE PROTEOME                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 6 - rsync to download RefSeq
 */
 if (need_refseq_download && !existing_reference){
  // No existing reference is available: download the protein fastas of the
  // requested RefSeq release and merge them into a single gzipped fasta.
  process download_refseq {
    label "process_low"
    tag "${refseq_release}"

    publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'

    output:
    // Enclose in parentheses to avoid "No such variable: process" error
    // Reference: https://github.com/nextflow-io/nextflow/issues/141
    file(combined_fasta) into (ch_diamond_reference_fasta, ch_sourmash_reference_fasta)

    script:
    combined_fasta = "${refseq_release}.fa.gz"
    """
    rsync \\
          --prune-empty-dirs \\
          --archive \\
          --verbose \\
          --recursive \\
          --include '*protein.faa.gz' \\
          --exclude '/*' \\
          rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_release}/ .
    zcat *.protein.faa.gz | gzip -c - > ${combined_fasta}
    """
  }
}

if (params.hashes) {
  // User-supplied hashes: pair every hash with the full list of protein fastas
  ch_protein_fastas
    .map { sample -> sample[1] }    // keep only the file, drop the sample id
    .collect()                      // gather everything into a single flat list
    .map { fastas -> [fastas] }     // nest in a list so combine() adds it as ONE element
    .set { ch_protein_fastas_flat_list }

  // Desired output:
  // [1, ["a", "b", "c"]]
  // [2, ["a", "b", "c"]]
  // [3, ["a", "b", "c"]]
  // 1, 2, 3 = hashes
  // "a", "b", "c" = protein fasta files
  ch_hashes_for_hash2kmer
    .combine(ch_protein_fastas_flat_list)
    .set { ch_hashes_with_fastas_for_hash2kmer }
} else if (params.diff_hash_expression) {
  // Hashes from differential hash expression: attach fastas via ch_group_to_fasta
  // NOTE(review): join() matches on the first element of each item, which is the
  // hash on the left-hand side - confirm ch_group_to_fasta is keyed compatibly.
  ch_hash_to_group_for_hash2kmer
    .join(ch_group_to_fasta)
    .dump(tag: 'group_to_hashes_for_hash2kmer__combine__ch_group_to_fasta')
    .into { ch_hashes_with_fastas_for_hash2kmer }

  // hash2sig only needs the hash itself, not its group
  ch_hash_to_group_for_hash2sig
    .map { hash, group -> hash }
    .into { ch_hashes_for_hash2sig }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --              EXTRACT SEQUENCES CONTAINING HASHES                    -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 4 - convert hashes to k-mers & sequences -- but only needed for diamond search
 */
 do_hash2kmer = params.diff_hash_expression || params.hashes || params.do_featurecounts_orthology
 if (do_hash2kmer) {
  // For each hash, run hash2kmer.py over the peptide fastas; it is asked to
  // write a k-mer file and a sequence fasta for that hash.
  process hash2kmer {
    label "process_low"
    tag "${hash_cleaned}"

    publishDir "${params.outdir}/hash2kmer/${hash_id}", mode: 'copy'

    input:
    tuple val(hash), file(peptide_fastas) from ch_hashes_with_fastas_for_hash2kmer

    output:
    file(kmers)
    tuple val(hash), file(sequences) into ch_seqs_from_hash2kmer, ch_seqs_from_hash2kmer_to_print, ch_seqs_from_hash2kmer_for_bam_of_hashes
    tuple val(hash), val(hash_id), file(sequences) into ch_seqs_with_hashes_for_filter_unaligned_reads, ch_seqs_with_hashes_for_bam_of_hashes

    script:
    hash_cleaned = hashCleaner(hash)
    hash_id = "hash-${hash_cleaned}"
    sequences = "${hash_id}__sequences.fasta"
    kmers = "${hash_id}__kmer.txt"
    // --first is dropped only when featurecounts orthology is requested
    first_flag = params.do_featurecounts_orthology ? '' : '--first'
    """
    echo ${hash_cleaned} >> hash.txt
    hash2kmer.py \\
        --ksize ${sourmash_ksize} \\
        --no-dna \\
        --input-is-protein \\
        --output-sequences ${sequences} \\
        --output-kmers ${kmers} \\
        --${sourmash_molecule} \\
        ${first_flag} \\
        hash.txt \\
        ${peptide_fastas}
    """
  }
  ch_seqs_from_hash2kmer_to_print.dump(tag: 'ch_seqs_from_hash2kmer_to_print')

  // Re-attach the group to each hash's sequences: [hash, group, sequences]
  ch_hash_to_group_for_joining_after_hash2kmer
    .join(ch_seqs_from_hash2kmer)
    .dump(tag: 'ch_hash_to_group_for_joining__ch_protein_seq_from_hash2kmer')
    .set { ch_protein_seq_for_diamond }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --       PREPARE PROTEIN SEQUENCES FOR SEARCHING WITH DIAMOND          -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
if (params.protein_searcher == 'diamond') {

  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                      PREPARE TAXA FOR DIAMOND                       -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 6 - unzip taxonomy information files for input to DIAMOND
   */
  if (!params.diamond_database ){
    // Unpack the taxonomy dump archive; nodes.dmp and names.dmp are passed on
    // for building the DIAMOND database.
    process diamond_prepare_taxa {
      label "process_low"
      tag "${taxondmp_zip.baseName}"

      publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy'

      input:
      file(taxondmp_zip) from ch_diamond_taxdmp_zip

      output:
      file("nodes.dmp") into ch_diamond_taxonnodes
      file("names.dmp") into ch_diamond_taxonnames

      script:
      """
      7z x ${taxondmp_zip}
      """
    }
  }


  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                  MAKE DIAMOND PEPTIDE DATABASE                      -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 7 - make peptide search database for DIAMOND
   */
  if (!params.diamond_database && (params.proteome_search_fasta || params.refseq_release)){
    // Build the DIAMOND database from the reference proteome plus the taxonomy
    // map, nodes and names files.
    process diamond_makedb {
      label "process_medium"
      tag "${reference_proteome.baseName}"
      // The database is only published when params.save_reference is set
      publishDir path: { params.save_reference ? "${params.outdir}/reference/diamond/" : params.outdir },
                 saveAs: { params.save_reference ? it : null }, mode: "${params.publish_dir_mode}"

      input:
      file(reference_proteome) from ch_diamond_reference_fasta.collect()
      file(taxonnodes) from ch_diamond_taxonnodes.collect()
      file(taxonnames) from ch_diamond_taxonnames.collect()
      file(taxonmap_gz) from ch_diamond_taxonmap_gz.collect()

      output:
      file("${db_name}.dmnd") into ch_diamond_db

      script:
      // The output expects "<db_name>.dmnd" to exist after makedb runs with -d <db_name>
      db_name = "${reference_proteome.simpleName}_db"
      """
      diamond makedb \\
          --threads ${task.cpus} \\
          -d ${db_name} \\
          --taxonmap ${taxonmap_gz} \\
          --taxonnodes ${taxonnodes} \\
          --taxonnames ${taxonnames} \\
          --in ${reference_proteome}
      """
    }
  }

  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                 SEARCH DIAMOND PEPTIDE DATABASE                     -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 8 - Search DIAMOND database for closest match to each query protein sequence
   */
  // Search the DIAMOND database with each set of query peptides and write the
  // hits as a tab-separated file.
  process diamond_blastp {
    label "process_low"
    tag "${group}"

    publishDir "${params.outdir}/blastp/${subdir}", mode: 'copy'

    input:
    file(diamond_db) from ch_diamond_db.collect()
    // Each item is [hash, group, peptide fasta]. `hash` is falsy when the
    // peptides do not come from a differential hash (see the else case below).
    tuple val(hash), val(group), file(coding_peptides) from ch_protein_seq_for_diamond

    output:
    file(tsv) into ch_diamond_blastp_output

    script:
    group_cleaned = groupCleaner(group)
    // With a hash: results are filed under the group and named after both.
    // Without one: results are named after the group and go to the top level.
    subdir = hash ? "${group_cleaned}" : ""
    sample_id = hash ? "${group_cleaned}__hash-${hashCleaner(hash)}" : "${group_cleaned}"
    tsv = "${sample_id}__diamond__${diamond_db.simpleName}.tsv"
    output_format = "--outfmt 6 qseqid sseqid pident evalue bitscore stitle staxids sscinames sskingdoms sphylums"
    """
    diamond blastp \\
        ${output_format} \\
        --threads ${task.cpus} \\
        --max-target-seqs 3 \\
        --db ${diamond_db} \\
        --evalue 0.00001  \\
        --query ${coding_peptides} \\
        > ${tsv}
    """
  }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --            PREPARE HASHES FOR SEARCHING WITH SOURMASH               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
if (params.protein_searcher == 'sourmash'){

  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                    CONVERT HASHES TO SIGNATURES                     -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 4 - convert hashes to signatures
   */
   if (params.protein_searcher == 'sourmash' && (params.hashes || params.diff_hash_expression)){
    // Write each hash of interest into its own signature file with hash2sig.py
    // so that it can be used as a query for sourmash
    // Turn a single hash value into a signature file (<hash_id>.sig) with
    // hash2sig.py. Emits the original hash, its id and the signature file on
    // two channels (one for printing, one for joining with the group).
    process hash2sig {
      tag "${hash_id}"
      label "process_low"

      publishDir "${params.outdir}/hash2sig/", mode: 'copy'

      input:
      val(hash) from ch_hashes_for_hash2sig

      output:
      set val(hash), val(hash_id), file("${sig}") into ch_hash_sigs_from_hash2sig_to_print, ch_hash_sigs_from_hash2sig_to_join

      script:
      // hashCleaner is defined elsewhere in this file; the dumped channel
      // examples show raw hashes with a trailing newline, which it presumably
      // removes - confirm against its definition
      hash_cleaned = hashCleaner(hash)
      hash_id = "hash-${hash_cleaned}"
      sig = "${hash_id}.sig"
      // sourmash_ksize and sourmash_molecule are set outside this process
      """
      echo ${hash_cleaned} >> hash.txt
      hash2sig.py \\
          --ksize ${sourmash_ksize} \\
          --no-dna \\
          --scaled 1 \\
          --input-is-protein \\
          --${sourmash_molecule} \\
          --output ${sig} \\
          hash.txt
      """
    }
    // Print the hash2sig output tuples when channel dumping is enabled
    ch_hash_sigs_from_hash2sig_to_print.dump(tag: 'ch_hash_sigs_from_hash2sig_to_print')

    // Attach the group to every hash signature (joined on the first element,
    // the hash), then reorder each tuple so that the group comes first:
    // (hash, group, hash_id, sig) -> (group, hash, hash_id, sig)
    ch_hash_to_group_for_joining_after_hash2sig
      .join( ch_hash_sigs_from_hash2sig_to_join )
      // [DUMP: ch_hash_to_group_for_joining_after_hash2sig__ch_hash_sigs_from_hash2sig_to_join]
      // ['4406535782145158631\n', 'monocyte', hash-4406535782145158631, hash-4406535782145158631.sig]
      .dump( tag: 'ch_hash_to_group_for_joining_after_hash2sig__ch_hash_sigs_from_hash2sig_to_join' )
      .map{ it -> tuple(it[1], it[0], it[2], it[3]) }
      .dump( tag: 'ch_group_to_hash_sig' )
      // ['monocyte', '4406535782145158631\n', hash-4406535782145158631, hash-4406535782145158631.sig]
      .set{ ch_group_to_hash_sig }
  }

  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /* --                                                                     -- */
  /* --                  MAKE SOURMASH INDEX                      -- */
  /* --                                                                     -- */
  ///////////////////////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////////////////////
  /*
   * STEP 7 - compute signatures of the reference proteome and index them for sourmash
   */
  // Compute sourmash signatures for a reference proteome fasta.
  //
  // Input:  one reference proteome fasta from ch_sourmash_reference_fasta
  // Output: the signature file (sent to ch_proteome_sig_for_sourmash_index)
  //         and the stderr log of `sourmash compute` (published only)
  //
  // sourmash_ksize and sourmash_molecule are set outside this process.
  process sourmash_db_compute {
    tag "${sample_id}"
    label "process_low"

    publishDir "${params.outdir}/sourmash/compute", mode: 'copy'

    input:
    file(reference_proteome) from ch_sourmash_reference_fasta

    output:
    file(output_log)
    file(sig) into ch_proteome_sig_for_sourmash_index

    script:
    // The sketch parameters are encoded in the output file names
    sketch_id = "molecule-${sourmash_molecule}__ksize-${sourmash_ksize}__scaled-1__track_abundance-true"
    sample_id = "${reference_proteome.simpleName}__${sketch_id}"
    sig = "${sample_id}.sig"
    output_log = "${sample_id}.log"
    // Every continued line ends in " \" so that arguments stay separated even
    // if the indentation of the following line is ever removed
    """
    sourmash compute \\
        --ksizes ${sourmash_ksize} \\
        --input-is-protein \\
        --track-abundance \\
        --singleton \\
        --scaled 1 \\
        --no-dna \\
        --${sourmash_molecule} \\
        --output ${sig} \\
        ${reference_proteome} \\
        2> ${output_log}
    """
  }

  // Build a sourmash index from the reference proteome signature(s).
  //
  // Input:  all signatures from ch_proteome_sig_for_sourmash_index, collected
  //         into a single task
  // Output: the hidden ".sbt*" entries and the "*.sbt.json" file, emitted as
  //         one tuple on ch_sourmash_index
  //
  // sourmash_ksize and sourmash_molecule are set outside this process.
  process sourmash_db_index {
    tag "${reference_proteome_sig.baseName}"
    label "process_low"

    publishDir "${params.outdir}/sourmash/index", mode: 'copy'

    input:
    file(reference_proteome_sig) from ch_proteome_sig_for_sourmash_index.collect()

    output:
    set file(".sbt*"), file("*.sbt.json") into ch_sourmash_index

    script:
    // NOTE(review): the index is named after reference_proteome_sig.simpleName,
    // which assumes a single collected signature - confirm the behaviour when
    // several reference proteomes are supplied.
    """
    sourmash index \\
        --ksize ${sourmash_ksize} \\
        --${sourmash_molecule} \\
        ${reference_proteome_sig.simpleName} \\
        ${reference_proteome_sig}
    """
  }

  if ( params.csv_has_is_aligned ) {
    // Pair each group's unaligned signatures with the hash signatures of the
    // same group (joined on the first element, the group).
    // NOTE(review): join pairs matching keys one-to-one - if a group can carry
    // several hashes, confirm that every hash is still emitted here
    // (combine(by: 0) may be what is intended).
    ch_per_group_unaligned_sig
      .join( ch_group_to_hash_sig )
      // [DUMP: ch_group_to_hash_sig]
      // ['monocyte',
      //  [10X_P1_14__unaligned__GACTAACAGCATGGCA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //   10X_P1_14__unaligned__AACTGGTAGGTTCCTA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //   10X_P1_14__unaligned__CTAATGGCAGCATACT_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //   10X_P1_14__unaligned__ACACCCTGTAGCGTGA_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig,
      //   MACA_18m_M_LUNG_53__unaligned__TAAGTGCAGTGTCCCG_molecule-dayhoff_ksize-45_log2sketchsize-14_trackabundance-true.sig],
      // '2852067181280790833\n',
      //  hash-2852067181280790833,
      //  hash-2852067181280790833.sig]
      .dump( tag: 'ch_group_to_hash_sig_with_group_unaligned_sigs' )
      .into{ ch_group_to_hash_sig_with_group_unaligned_sigs }

    ///////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////
    /* --                                                                     -- */
    /* --       SEARCH UNALIGNED HASHES FOR DIFFERENTIAL HASHES               -- */
    /* --                                                                     -- */
    ///////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////
    /*
    * STEP 7 - Filter hashes for only unaligned ones
    */
    // List which of a group's unaligned signature files contain a given hash.
    //
    // Input:  (group, unaligned signature files of the group, hash, hash_id,
    //         signature file of the hash)
    // Output: (group, hash, hash_id, query_sig, matches) where `matches` is a
    //         text file with the names of the signature files containing the
    //         hash; the file is empty when no signature contains it.
    process is_hash_in_unaligned {
      tag "${sample_id}"
      label "process_low"

      publishDir "${params.outdir}/is_hash_in_unaligned", mode: 'copy'

      input:
      set val(group), file(group_unaligned_sigs), val(hash), val(hash_id), file(query_sig) from ch_group_to_hash_sig_with_group_unaligned_sigs

      output:
      set val(group), val(hash), val(hash_id), file(query_sig), file(matches) into ch_hash_sigs_in_unaligned

      script:
      // groupCleaner / hashCleaner are helpers defined elsewhere in this file
      group_cleaned = groupCleaner(group)
      hash_cleaned = hashCleaner(hash)
      sample_id = "${group_cleaned}__${hash_id}"
      matches = "${sample_id}__matches.txt"
      // ripgrep exits with status 1 when nothing matches. That is an expected
      // outcome here (downstream branches on an empty matches file), so only
      // other non-zero statuses (real errors) may fail the task.
      """
      rg --files-with-matches ${hash_cleaned} ${group_unaligned_sigs} > ${matches} || [ \$? -eq 1 ]
      """
    }
    // Split the hashes on whether their matches file (it[4]) is empty:
    // empty -> "aligned", non-empty -> "unaligned"
    ch_hash_sigs_in_unaligned
      .dump( tag: 'ch_hash_sigs_in_unaligned' )
      // Check that matches are nonempty
      .branch{
        aligned: it[4].size() == 0
        unaligned: it[4].size() > 0
      }
      .set{ ch_hashes_sigs_branched }

      // Hashes found in at least one unaligned signature: drop the matches
      // file and keep (group, hash, hash_id, query_sig) as the search queries
      ch_hashes_sigs_branched
        .unaligned
        .map { it -> tuple(it[0], it[1], it[2], it[3]) }
        .dump ( tag: 'ch_hashes_in_group_unaligned_sigs' )
        .set { ch_group_hash_sigs_to_query }

      // Hashes found in no unaligned signature: keep (group, hash, hash_id)
      ch_hashes_sigs_branched
        .aligned
        .map { it -> tuple(it[0], it[1], it[2]) }
        .dump ( tag: 'ch_hashes_in_group_aligned' )
        .set { ch_hashes_in_group_aligned }
  } else {
     // params.csv_has_is_aligned is not set: skip the unaligned filter and
     // search all hashes
     ch_group_hash_sigs_to_query = ch_group_to_hash_sig
   }

  // Search one hash signature against the sourmash index of the reference
  // proteome and write the hits to a csv file.
  //
  // Input:  the collected index files from ch_sourmash_index and one
  //         (group, hash, hash_id, query signature) tuple per task
  // Output: "<cleaned group>__<hash_id>.csv" (published only)
  //
  // sourmash_ksize and sourmash_molecule are set outside this process.
  process sourmash_db_search {
    tag "${group_cleaned}"
    label "process_low"

    publishDir "${params.outdir}/sourmash/search", mode: 'copy'

    input:
    set file(sbt_hidden_files), file(reference_sbt_json) from ch_sourmash_index.collect()
    set val(group), val(hash), val(hash_id), file(query_sig) from ch_group_hash_sigs_to_query

    output:
    file("${csv_output}")

    script:
    // groupCleaner is a helper defined elsewhere in this file
    group_cleaned = groupCleaner(group)
    csv_output = "${group_cleaned}__${hash_id}.csv"
    // sbt_hidden_files is not named on the command line; it is staged next to
    // the json - presumably because the json refers to it (confirm).
    """
    sourmash search \\
        --containment \\
        --threshold 1e-100 \\
        --output ${csv_output} \\
        --ksize ${sourmash_ksize} \\
        --${sourmash_molecule} \\
        ${query_sig} \\
        ${reference_sbt_json}
    """
  }


}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --             SEARCH NONCODING RNAS WITH INFERNAL                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 10 - Download/prepare Rfam database
 */
if (params.search_noncoding && params.infernal_db) {
  /*
   * STEP 6 - unzip the Infernal covariance model file when it is gzipped
   */
  if (hasExtension(params.infernal_db, "gz") ){
    // Decompress the gzipped Infernal database file. The output keeps the
    // input name minus its last extension and is emitted on ch_infernal_cm.
    process gunzip_infernal_cm {
        tag "$gz"
        // Published only when params.save_reference is set: saveAs returns
        // null otherwise
        publishDir path: { params.save_reference ? "${params.outdir}/reference/infernal" : params.outdir },
                   saveAs: { params.save_reference ? it : null }, mode: "${params.publish_dir_mode}"

        input:
        file gz from ch_infernal_db_gz

        output:
        file "${gz.baseName}" into ch_infernal_cm

        script:
        // The decompressed data goes to stdout and is redirected to the output file
        """
        gunzip -c --verbose --stdout --force ${gz} > ${gz.baseName}
        """
    }
  }

  // Run `cmpress` on the Infernal covariance model file. Emits the model file
  // name together with every file whose name starts with it (the model itself
  // plus whatever cmpress writes next to it) on ch_infernal_db.
  process prepare_infernal_db {
      tag "${infernal_cm}"
      // Published only when params.save_reference is set: saveAs returns null
      // otherwise
      publishDir path: { params.save_reference ? "${params.outdir}/reference/infernal" : params.outdir },
                 saveAs: { params.save_reference ? it : null }, mode: "${params.publish_dir_mode}"

      input:
      file infernal_cm from ch_infernal_cm.collect()

      output:
      set val("${infernal_cm}"), file("${infernal_cm}*") into ch_infernal_db

      script:
      """
      cmpress ${infernal_cm}
      """
  }

  // Scan the noncoding nucleotide sequences of one sample against the prepared
  // Infernal database with `cmscan`. The tabular hits are written to
  // "<sample_id>.txt" and emitted on ch_infernal_results.
  process search_noncoding {
      tag "${sample_id}"
      label "process_high"
      label "process_long"
      publishDir "${params.outdir}/infernal", mode: "${params.publish_dir_mode}"

      input:
      // NOTE(review): collect() flattens nested lists by default - confirm that
      // all files emitted by prepare_infernal_db still arrive in db_index.
      set val(db_name), file(db_index) from ch_infernal_db.collect()
      file rfam_clan_info from ch_rfam_clan_info.collect()
      set val(sample_id), file (fasta) from ch_noncoding_nucleotides

      output:
      file txt into ch_infernal_results

      script:
      txt = "${sample_id}.txt"
      // The database is referenced by name (db_name); db_index is only staged
      // into the work directory and not named on the command line
      """
      cmscan  \\
          --cut_ga \\
          --nohmmonly \\
          --clanin ${rfam_clan_info} \\
          --fmt 2 \\
          --rfam \\
          --cpu ${task.cpus} \\
          --tblout ${txt} \\
          ${db_name} \\
          ${fasta}
      """
  }

}


///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                            MULTIQC                                  -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
 * STEP 10 - MultiQC
 */
// Aggregate the FastQC and fastp results, the software versions and the
// workflow summary into a MultiQC report (fastqc and fastp modules only).
process multiqc {
    publishDir "${params.outdir}/MultiQC", mode: 'copy'

    input:
    file (multiqc_config) from ch_multiqc_config
    file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([])
    // TODO nf-core: Add in log files from your new processes for MultiQC to find!
    // ifEmpty([]) lets the process run even when a channel emitted nothing
    file ('fastqc/*') from ch_fastqc_results.collect().ifEmpty([])
    file ('software_versions/*') from ch_software_versions_yaml.collect()
    file ("fastp/*") from ch_fastp_results.collect().ifEmpty([])
    file workflow_summary from create_workflow_summary(summary)

    output:
    file "*multiqc_report.html" into ch_multiqc_report
    file "*_data"
    file "multiqc_plots"

    script:
    // Title and report file name are only set when a custom run name was given;
    // in the file name every non-word character becomes "_" and runs of "_"
    // are collapsed
    rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
    rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
    custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : ''
    // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time
    // The trailing `touch` creates the declared outputs if they do not exist yet,
    // presumably so the task still succeeds when MultiQC wrote no report - confirm
    """
    multiqc -f $rtitle $rfilename $custom_config_file -m fastqc -m fastp .
    touch multiqc_report.html multiqc_plots _data
    """
}

/*
 * STEP 11 - Output Description HTML
 */
// Convert the output documentation file from ch_output_docs to HTML with
// markdown_to_html.py and publish it as pipeline_info/results_description.html.
process output_documentation {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy'

    input:
    file output_docs from ch_output_docs

    output:
    file "results_description.html"

    script:
    """
    markdown_to_html.py $output_docs -o results_description.html
    """
}

/*
 * Completion e-mail notification
 */
// Runs once when the workflow finishes (successfully or not). It
//   1. collects run metadata into `email_fields`,
//   2. tries to fetch the MultiQC report for attaching to the e-mail,
//   3. renders the txt / html / sendmail templates from assets/,
//   4. sends the summary e-mail (HTML via sendmail, falling back to plaintext
//      via mail) when an address is configured,
//   5. writes the rendered reports to <outdir>/pipeline_info/,
//   6. logs the final status line.
workflow.onComplete {

    // Set up the e-mail variables
    def subject = "[nf-core/predictorthologs] Successful: $workflow.runName"
    if (!workflow.success) {
        subject = "[nf-core/predictorthologs] FAILED: $workflow.runName"
    }
    def email_fields = [:]
    email_fields['version'] = workflow.manifest.version
    email_fields['runName'] = custom_runName ?: workflow.runName
    email_fields['success'] = workflow.success
    email_fields['dateComplete'] = workflow.complete
    email_fields['duration'] = workflow.duration
    email_fields['exitStatus'] = workflow.exitStatus
    email_fields['errorMessage'] = (workflow.errorMessage ?: 'None')
    email_fields['errorReport'] = (workflow.errorReport ?: 'None')
    email_fields['commandLine'] = workflow.commandLine
    email_fields['projectDir'] = workflow.projectDir
    // `summary` is the script-level summary map; the entries below are added to it in place
    email_fields['summary'] = summary
    email_fields['summary']['Date Started'] = workflow.start
    email_fields['summary']['Date Completed'] = workflow.complete
    email_fields['summary']['Pipeline script file path'] = workflow.scriptFile
    email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId
    if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository
    if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId
    if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision
    email_fields['summary']['Nextflow Version'] = workflow.nextflow.version
    email_fields['summary']['Nextflow Build'] = workflow.nextflow.build
    email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp

    // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size)
    // On success try attach the multiqc report
    def mqc_report = null
    try {
        if (workflow.success) {
            mqc_report = ch_multiqc_report.getVal()
            if (mqc_report.getClass() == ArrayList) {
                log.warn "[nf-core/predictorthologs] Found multiple reports from process 'multiqc', will use only one"
                mqc_report = mqc_report[0]
            }
        }
    } catch (all) {
        // Best effort: the e-mail is still sent, just without the attachment
        log.warn "[nf-core/predictorthologs] Could not attach MultiQC report to summary email"
    }

    // Check if we are only sending emails on failure
    def email_address = params.email
    if (!params.email && params.email_on_fail && !workflow.success) {
        email_address = params.email_on_fail
    }

    // Render the TXT template
    def engine = new groovy.text.GStringTemplateEngine()
    def tf = new File("$baseDir/assets/email_template.txt")
    def txt_template = engine.createTemplate(tf).make(email_fields)
    def email_txt = txt_template.toString()

    // Render the HTML template
    def hf = new File("$baseDir/assets/email_template.html")
    def html_template = engine.createTemplate(hf).make(email_fields)
    def email_html = html_template.toString()

    // Render the sendmail template
    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
    def sf = new File("$baseDir/assets/sendmail_template.txt")
    def sendmail_template = engine.createTemplate(sf).make(smail_fields)
    def sendmail_html = sendmail_template.toString()

    // Send the HTML e-mail
    if (email_address) {
        try {
            // Deliberately jump to the plaintext fallback in the catch block.
            // `new` is required: without it this line only "worked" because the
            // missing method call itself raised an exception.
            if (params.plaintext_email) { throw new GroovyException('Send plaintext e-mail, not HTML') }
            // Try to send HTML e-mail using sendmail
            [ 'sendmail', '-t' ].execute() << sendmail_html
            log.info "[nf-core/predictorthologs] Sent summary e-mail to $email_address (sendmail)"
        } catch (all) {
            // Catch failures and try with plaintext
            [ 'mail', '-s', subject, email_address ].execute() << email_txt
            log.info "[nf-core/predictorthologs] Sent summary e-mail to $email_address (mail)"
        }
    }

    // Write summary e-mail HTML to a file
    def output_d = new File("${params.outdir}/pipeline_info/")
    if (!output_d.exists()) {
        output_d.mkdirs()
    }
    def output_hf = new File(output_d, "pipeline_report.html")
    output_hf.withWriter { w -> w << email_html }
    def output_tf = new File(output_d, "pipeline_report.txt")
    output_tf.withWriter { w -> w << email_txt }

    // ANSI colour codes for the final status lines (empty with monochrome logs);
    // declared with `def` so they stay local to this handler
    def c_green = params.monochrome_logs ? '' : "\033[0;32m"
    def c_purple = params.monochrome_logs ? '' : "\033[0;35m"
    def c_red = params.monochrome_logs ? '' : "\033[0;31m"
    def c_reset = params.monochrome_logs ? '' : "\033[0m"

    if (workflow.stats.ignoredCount > 0 && workflow.success) {
        log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-"
        log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-"
        log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-"
    }

    if (workflow.success) {
        log.info "-${c_purple}[nf-core/predictorthologs]${c_green} Pipeline completed successfully${c_reset}-"
    } else {
        // On failure, also warn when the profile does not match the host
        checkHostname()
        log.info "-${c_purple}[nf-core/predictorthologs]${c_red} Pipeline completed with errors${c_reset}-"
    }

}

// Case-insensitive suffix test: true when the string form of `it` ends with
// `extension`. No "." is inserted, so the caller decides whether the dot is
// part of the suffix.
def hasExtension(it, extension) {
    def candidate = it.toString().toLowerCase()
    def suffix = extension.toLowerCase()
    return candidate.endsWith(suffix)
}

// Build the nf-core ASCII-art banner including the pipeline version.
// All colour codes collapse to empty strings when params.monochrome_logs is set.
def nfcoreHeader() {
    // Returns the given ANSI escape code, or '' for monochrome logs
    def ansi = { code -> params.monochrome_logs ? '' : code }

    // Log colors ANSI codes (assigned without `def`, exactly as before)
    c_black = ansi("\033[0;30m")
    c_blue = ansi("\033[0;34m")
    c_cyan = ansi("\033[0;36m")
    c_dim = ansi("\033[2m")
    c_green = ansi("\033[0;32m")
    c_purple = ansi("\033[0;35m")
    c_reset = ansi("\033[0m")
    c_white = ansi("\033[0;37m")
    c_yellow = ansi("\033[0;33m")

    def banner = """    -${c_dim}--------------------------------------------------${c_reset}-
                                            ${c_green},--.${c_black}/${c_green},-.${c_reset}
    ${c_blue}        ___     __   __   __   ___     ${c_green}/,-._.--~\'${c_reset}
    ${c_blue}  |\\ | |__  __ /  ` /  \\ |__) |__         ${c_yellow}}  {${c_reset}
    ${c_blue}  | \\| |       \\__, \\__/ |  \\ |___     ${c_green}\\`-._,-`-,${c_reset}
                                            ${c_green}`._,._,\'${c_reset}
    ${c_purple}  nf-core/predictorthologs v${workflow.manifest.version}${c_reset}
    -${c_dim}--------------------------------------------------${c_reset}-
    """
    return banner.stripIndent()
}

// Warn when the machine's hostname matches a pattern listed in
// params.hostnames for a profile that is not part of the active
// `-profile` string. Does nothing when params.hostnames is not set.
def checkHostname() {
    // ANSI escape codes, blanked out for monochrome logs
    def monochrome = params.monochrome_logs
    def reset = monochrome ? '' : "\033[0m"
    def white = monochrome ? '' : "\033[0;37m"
    def red = monochrome ? '' : "\033[1;91m"
    def yellowBold = monochrome ? '' : "\033[1;93m"
    if (params.hostnames) {
        // Ask the operating system for the hostname of this machine
        def hostname = "hostname".execute().text.trim()
        params.hostnames.each { prof, hnames ->
            for (hname in hnames) {
                // Nothing to report unless the host matches and the profile is missing
                if (!hostname.contains(hname) || workflow.profile.contains(prof)) {
                    continue
                }
                def message = [
                    "====================================================\n",
                    "  ${red}WARNING!${reset} You are running with `-profile $workflow.profile`\n",
                    "  but your machine hostname is ${white}'$hostname'${reset}\n",
                    "  ${yellowBold}It's highly recommended that you use `-profile $prof${reset}`\n",
                    "============================================================"
                ].join('')
                log.error message
            }
        }
    }
}