Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update dev #46

Merged
merged 26 commits into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0bd4469
update dev
priyanka-surana Nov 1, 2022
2680191
update input tol module
priyanka-surana Nov 1, 2022
4f2938f
update test config
priyanka-surana Nov 1, 2022
d07f240
Update bed_filter #47
priyanka-surana Nov 7, 2022
253fa85
Update bedtools_bamtobed config settings #48
priyanka-surana Nov 9, 2022
86d1170
From goat to ncbi and update table #49 #50
priyanka-surana Nov 24, 2022
7336e82
append merqury output to table #50
priyanka-surana Nov 24, 2022
ef0442b
add chrom and organelle info
priyanka-surana Nov 24, 2022
169ad1a
Make common name optional #49 #50
priyanka-surana Nov 24, 2022
ef12627
Remove bedtools_sort #51
priyanka-surana Nov 24, 2022
38b1db3
update samtools modules
priyanka-surana Nov 24, 2022
94e69c1
Reverting to original bed sort #51
priyanka-surana Nov 25, 2022
82e5607
Changing subworkflows from serial to parallel
priyanka-surana Nov 25, 2022
f369c52
remove extra modules and add full samplesheet
priyanka-surana Nov 28, 2022
f1efeec
break compound python statements
priyanka-surana Nov 28, 2022
e8dcf60
Reorder join statements
priyanka-surana Nov 28, 2022
abbb12c
modify join statements
priyanka-surana Nov 28, 2022
307ee71
break compound statements
priyanka-surana Nov 28, 2022
6e5aa0c
break compound statements
priyanka-surana Nov 28, 2022
68806e3
change file open
priyanka-surana Nov 28, 2022
38fbb9e
fixed typo
priyanka-surana Nov 29, 2022
f4e195e
create a single table
priyanka-surana Nov 29, 2022
20dd312
Fix typo
priyanka-surana Nov 30, 2022
a02332f
Update pacbio sample def
priyanka-surana Nov 30, 2022
78aa15b
version bump
priyanka-surana Nov 30, 2022
cb5c9bb
required flags
priyanka-surana Nov 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions bin/add_merqury.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3

import argparse
import os
import json
import sys
import csv

def parse_args(args=None):
    """Build and parse the command-line arguments for this script."""
    # The previous description was copy-pasted from summary_table.py and
    # referred to N50/BUSCO stats; this script appends MerquryFK stats.
    Description = "Append MerquryFK QV and completeness stats to an existing summary CSV table."

    parser = argparse.ArgumentParser(description=Description)
    parser.add_argument("FILE_IN", help="Input CSV file.")
    parser.add_argument("SAMPLE", help="PacBio sample ID used for MerquryFK.")
    parser.add_argument("QV", help="Input QV TSV file from MERQURYFK.")
    parser.add_argument("COMPLETENESS", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
    parser.add_argument("FILE_OUT", help="Output CSV file.")
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
    return parser.parse_args(args)

def make_dir(path):
    """Create directory *path* (and any parents) if a non-empty path was given."""
    # An empty string means the output goes to the current directory: nothing to create.
    if path:
        os.makedirs(path, exist_ok=True)

def extract_qv(file_in, writer):
    """Copy the QV value of every record in a MerquryFK TSV into the CSV writer."""
    with open(file_in, "r") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            writer.writerow(["QV", record["QV"]])

def extract_completeness(file_in, writer):
    """Copy the '% Covered' value of every record in a MerquryFK TSV into the CSV writer."""
    with open(file_in, "r") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            writer.writerow(["Completeness", record["% Covered"]])

def main(args=None):
    """Copy the input table to FILE_OUT, then append the MerquryFK section."""
    args = parse_args(args)

    make_dir(os.path.dirname(args.FILE_OUT))

    with open(args.FILE_OUT, "w") as fout:
        # Reproduce the existing table verbatim first ...
        with open(args.FILE_IN, "r") as fin:
            for line in fin:
                fout.write(line)

        # ... then append the MerquryFK stats. The trailing "_<suffix>" of
        # the sample ID is dropped to recover the specimen name.
        writer = csv.writer(fout)
        specimen = "_".join(args.SAMPLE.split("_")[:-1])
        writer.writerow(["##MerquryFK", specimen])
        extract_qv(args.QV, writer)
        extract_completeness(args.COMPLETENESS, writer)

if __name__ == "__main__":
    sys.exit(main())
2 changes: 1 addition & 1 deletion bin/bed_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
infile=$1
outfile=$2

paste -d '\t' - - < $infile | sed 's/-/_/g' | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else { print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '_+' '01' > $outfile
paste -d '\t' - - < $infile | sed 's/-/_/g' | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($1 > $7) {print substr($4,1,length($4)-2),$12,$7,$8,"16",$6,$1,$2,"8",$11,$5} else { print substr($4,1,length($4)-2),$6,$1,$2,"8",$12,$7,$8,"16",$5,$11} }' | tr '_+' '01' | awk 'NF==11' > $outfile
muffato marked this conversation as resolved.
Show resolved Hide resolved
68 changes: 0 additions & 68 deletions bin/create_table.py

This file was deleted.

82 changes: 82 additions & 0 deletions bin/summary_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import argparse
import os
import json
import sys
import csv

def parse_args(args=None):
    """Build and parse the command-line arguments for this script."""
    parser = argparse.ArgumentParser(
        description="Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."
    )
    # Four positional file arguments, in fixed order.
    for name, helptext in (
        ("GENOME", "Input NCBI genome summary JSON file."),
        ("SEQUENCE", "Input NCBI sequence summary JSON file."),
        ("BUSCO", "Input BUSCO short summary JSON file."),
        ("FILE_OUT", "Output CSV file."),
    ):
        parser.add_argument(name, help=helptext)
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
    return parser.parse_args(args)

def make_dir(path):
    """Ensure the directory *path* exists; an empty path is a no-op."""
    if path != "":
        os.makedirs(path, exist_ok=True)

def ncbi_stats(genome_in, seq_in, writer):
    """Write assembly information, assembly statistics and per-sequence rows
    from the NCBI datasets JSON reports into the CSV writer.

    Parameters:
        genome_in: path to the NCBI genome summary JSON file.
        seq_in: path to the NCBI sequence summary JSON file.
        writer: an open csv.writer that receives the output rows.
    """
    with open(genome_in, "r") as fin1:
        data = json.load(fin1)
    with open(seq_in, "r") as fin2:
        seq = json.load(fin2)

    data = data["reports"][0]
    info = data["assembly_info"]
    attr = info["biosample"]["attributes"]
    stats = data["assembly_stats"]
    seq = seq["reports"]

    writer.writerow(["##Assembly_Information"])
    writer.writerow(["Accession", data["accession"]])
    # Not every organism record carries a common name, so this row is optional.
    if "common_name" in data["organism"]:
        writer.writerow(["Common_Name", data["organism"]["common_name"]])
    writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
    # Biosample attributes are a list of {"name": ..., "value": ...} pairs;
    # join() collapses to "" when the attribute is absent.
    writer.writerow(["ToL_ID", "".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid")])
    writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
    writer.writerow(["Assembly_Name", info["assembly_name"]])
    writer.writerow(["Assembly_Level", info["assembly_level"]])
    writer.writerow(["Life_Stage", "".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage")])
    writer.writerow(["Tissue", "".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue")])
    writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
    writer.writerow(["##Assembly_Statistics"])
    writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
    writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
    writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
    writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
    writer.writerow(["Contigs", stats["number_of_contigs"]])
    writer.writerow(["Contig_N50", stats["contig_n50"]])
    writer.writerow(["GC_Percent", stats["gc_percent"]])
    # Sequence records without a gc_percent are skipped — presumably
    # unlocalised/unplaced fragments; confirm against the NCBI report schema.
    writer.writerow(["##Chromosome", "Length", "GC_Percent"])
    for mol in seq:
        if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
            writer.writerow([mol["chr_name"], mol["length"], mol["gc_percent"]])
    writer.writerow(["##Organelle", "Length", "GC_Percent"])
    for mol in seq:
        if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
            writer.writerow([mol["assigned_molecule_location_type"], mol["length"], mol["gc_percent"]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could undergo a small rewrite like the other for+for+if line


def extract_busco(file_in, writer):
    """Write the BUSCO lineage name and one-line summary to the CSV writer."""
    with open(file_in, "r") as handle:
        report = json.load(handle)

    writer.writerow(["##BUSCO"])
    writer.writerow(["Lineage", report["lineage_dataset"]["name"]])
    writer.writerow(["Summary", report["results"]["one_line_summary"]])

def main(args=None):
    """Assemble the genome summary CSV from the NCBI and BUSCO reports."""
    opts = parse_args(args)

    make_dir(os.path.dirname(opts.FILE_OUT))

    with open(opts.FILE_OUT, "w") as fout:
        writer = csv.writer(fout)
        ncbi_stats(opts.GENOME, opts.SEQUENCE, writer)
        extract_busco(opts.BUSCO, writer)

if __name__ == "__main__":
    sys.exit(main())
2 changes: 1 addition & 1 deletion bin/tol_input.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ assembly=$(echo $genome | cut -f12 -d'/')
gca=$(echo $genome | cut -f14 -d'/' | sed 's/.fasta.gz//')

# Currently this will import a masked file, but once the `insdcdownload` pipeline goes in production, it will be unmasked
gunzip -c $genome > ${gca}.fasta
ln -s $genome
muffato marked this conversation as resolved.
Show resolved Hide resolved

analysis=$data/$taxon/$organism/analysis/$assembly

Expand Down
18 changes: 9 additions & 9 deletions conf/base.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
nf-core/genomenote Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sanger-tol/genomenote Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
Expand All @@ -14,7 +14,7 @@ process {
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
errorStrategy = { task.exitStatus in [140,143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'

Expand All @@ -24,6 +24,11 @@ process {
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
Expand All @@ -42,11 +47,6 @@ process {
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
}
withLabel:process_samtools {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
memory = { check_max( 16.GB * task.attempt, 'memory' ) }
time = { check_max( 6.h * task.attempt, 'time' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
}
Expand Down
22 changes: 14 additions & 8 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,6 @@
*/

process {
/*
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
*/
withName: SAMPLESHEET_CHECK {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
Expand Down Expand Up @@ -55,7 +48,20 @@ process {
ext.args = '--mode genome --tar'
}

withName: CREATE_TABLE {
withName: SUMMARYSEQUENCE {
ext.prefix = { "${meta.id}_sequence" }
ext.args = "--report sequence"
}

withName: SUMMARYTABLE {
publishDir = [
path: { "${meta.outdir}/genome_statistics" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: ADDMERQURY {
publishDir = [
path: { "${meta.outdir}/genome_statistics" },
mode: params.publish_dir_mode,
Expand Down
6 changes: 3 additions & 3 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ params {
max_time = '6.h'

// Input data
input = "mMelMel3"
project = "/lustre/scratch123/tol/projects/.sandbox"
lineage_db = "/lustre/scratch123/tol/resources/nextflow/test-data/busco_2021_06_reduced"
input = "${projectDir}/assets/samplesheet.csv"
fasta = "/lustre/scratch123/tol/projects/.sandbox/data/mammals/Meles_meles/assembly/release/mMelMel3.2_paternal_haplotype/insdc/GCA_922984935.2.subset.fasta.gz"
lineage_db = "/lustre/scratch123/tol/resources/nextflow/busco_2021_06_reduced"
}
5 changes: 2 additions & 3 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
input = "gfLaeSulp1"
project = "/lustre/scratch123/tol/projects/.sandbox"
kmer = "/lustre/scratch123/tol/projects/.sandbox/data/fungi/Laetiporus_sulphureus/genomic_data/gfLaeSulp1/hic-arima2/kmer"
input = "${projectDir}/assets/samplesheet_full.csv"
muffato marked this conversation as resolved.
Show resolved Hide resolved
fasta = "/lustre/scratch124/tol/projects/darwin/data/insects/Ypsolopha_sequella/assembly/release/ilYpsSequ2.1/insdc/GCA_934047225.1.fasta.gz"
lineage_db = "/lustre/scratch123/tol/resources/busco/v5"
}
65 changes: 41 additions & 24 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,47 @@
"name": "nf-core/genomenote",
"homePage": "https://github.com/nf-core/genomenote",
"repos": {
"nf-core/modules": {
"bedtools/bamtobed": {
"git_sha": "90aef30f432332bdf0ce9f4b9004aa5d5c4960bb"
},
"busco": {
"git_sha": "89a84538bede7c6919f7c042fdb4c79e5e2d9d2a"
},
"cooler/cload": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"cooler/zoomify": {
"git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d"
},
"custom/dumpsoftwareversions": {
"git_sha": "82501fe6d0d12614db67751d30af98d16e63dc59"
},
"merquryfk/merquryfk": {
"git_sha": "96602d03faf2b3a22323a19032f6e7e97b532d70"
},
"samtools/faidx": {
"git_sha": "3eb99152cedbb7280258858e5df08478a4670696"
},
"samtools/view": {
"git_sha": "83b05a8dc05b9dd2f1979d1a9f08411fb1a82b11"
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"bedtools/bamtobed": {
"branch": "master",
"git_sha": "219bbdaf9d08043a66f2b531609c1e8e4ac89021",
"patch": "modules/nf-core/bedtools/bamtobed/bedtools-bamtobed.diff"
},
"busco": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"cooler/cload": {
"branch": "master",
"git_sha": "73890978012600b41dcc1b44136287a172b3fb71"
},
"cooler/zoomify": {
"branch": "master",
"git_sha": "73890978012600b41dcc1b44136287a172b3fb71"
},
"custom/dumpsoftwareversions": {
"branch": "master",
"git_sha": "8022c68e7403eecbd8ba9c49496f69f8c49d50f0"
},
"gunzip": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"merquryfk/merquryfk": {
"branch": "master",
"git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905"
},
"samtools/faidx": {
"branch": "master",
"git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01"
},
"samtools/view": {
"branch": "master",
"git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01"
}
}
}
}
}
Expand Down
Loading