Merge pull request #127 from tkchafin/1.2.1

Patch 1.2.1: Fix issue where pipeline fails on missing fields from NCBI datasets JSON
sanger-tol · Aug 1, 2024 · 028ddae · 028ddae
2 parents 91194d2 + e12d883
commit 028ddae
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 58 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[1.2.1](https://github.com/sanger-tol/genomenote/releases/tag/1.2.1)] [2024-07-12]
+
+### Enhancements & fixes
+
+- Bugfix: Now handles missing fields in `ncbi datasets` genome report
+
 ## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-01]
 
 ### Enhancements & fixes

diff --git a/CITATION.cff b/CITATION.cff
@@ -2,7 +2,7 @@
 # Visit https://bit.ly/cffinit to generate yours today!
 
 cff-version: 1.2.0
-title: sanger-tol/genomenote v1.2.0
+title: sanger-tol/genomenote v1.2.1
 message: >-
     If you use this software, please cite it using the
     metadata from this file.
@@ -34,5 +34,5 @@ identifiers:
 repository-code: "https://github.com/sanger-tol/genomenote"
 license: MIT
 commit: TODO
-version: 1.2.0
+version: 1.2.1
 date-released: "2022-10-07"
diff --git a/bin/create_table.py b/bin/create_table.py
@@ -6,120 +6,159 @@
 import sys
 import csv
 import re
+import math
 
 
 def parse_args(args=None):
     Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."
 
     parser = argparse.ArgumentParser(description=Description)
-    parser.add_argument("--genome", help="Input NCBI genome summary JSON file.", required=True)
-    parser.add_argument("--sequence", help="Input NCBI sequence summary JSON file.", required=True)
+    parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.")
+    parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.")
     parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
     parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
     parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
     parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
     parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
-    parser.add_argument("--outcsv", help="Output CSV file.", required=True)
+    parser.add_argument("--outcsv", required=True, help="Output CSV file.")
     parser.add_argument("--version", action="version", version="%(prog)s 3.1")
     return parser.parse_args(args)
 
 
 def make_dir(path):
+    """
+    Creates a directory if it doesn't exist.
+
+    Parameters:
+    path (str): Path of the directory to be created.
+    """
     if len(path) > 0:
         os.makedirs(path, exist_ok=True)
 
 
-# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to the sample names
+# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names
 # We usually don't want it in the final output
 def remove_sample_T_suffix(name):
+    """
+    Removes the suffix like "_T1", "_T2", etc., from sample names.
+
+    Parameters:
+    name (str): Sample name to be processed.
+
+    Returns:
+    str: Sample name with the suffix removed.
+    """
     return re.sub(r"_T\d+", "", name)
 
 
 def ncbi_stats(genome_in, seq_in, writer):
+    """
+    Extracts and writes assembly information and statistics from genome and
+    sequence JSON files to a CSV file.
+
+    Parameters:
+    genome_in (str): Path to the NCBI genome summary JSON file.
+    seq_in (str): Path to the NCBI sequence summary JSON file.
+    writer (csv.writer): CSV writer object to write the extracted data.
+    """
     with open(genome_in, "r") as fin1:
         data = json.load(fin1)
+    data = data.get("reports", [{}])[0]
+
     with open(seq_in, "r") as fin2:
-        seq = json.load(fin2)
+        seq = json.load(fin2).get("reports", [])
 
-    data = data["reports"][0]
-    info = data["assembly_info"]
-    attr = info["biosample"]["attributes"]
-    stats = data["assembly_stats"]
-    seq = seq["reports"]
+    info = data.get("assembly_info", {})
+    attr = info.get("biosample", {}).get("attributes", [])
+    stats = data.get("assembly_stats", {})
+    organism = data.get("organism", {})
 
+    # Write assembly information
     writer.writerow(["##Assembly_Information"])
-    writer.writerow(["Accession", data["accession"]])
-    if "common_name" in data["organism"]:
-        writer.writerow(["Common_Name", data["organism"]["common_name"]])
-    writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
-    writer.writerow(
-        [
-            "ToL_ID",
-            "".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid"),
-        ]
-    )
-    writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
-    writer.writerow(["Assembly_Name", info["assembly_name"]])
-    writer.writerow(["Assembly_Level", info["assembly_level"]])
-    writer.writerow(
-        [
-            "Life_Stage",
-            "".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage"),
-        ]
-    )
-    writer.writerow(
-        [
-            "Tissue",
-            "".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue"),
-        ]
-    )
-    writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
+    writer.writerow(["Accession", data.get("accession", math.nan)])
+    writer.writerow(["Common_Name", organism.get("common_name", math.nan)])
+    writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)])
+    tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid")
+    writer.writerow(["ToL_ID", tol_id if tol_id else math.nan])
+    writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)])
+    writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)])
+    writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)])
+    life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage")
+    writer.writerow(["Life_Stage", life_stage if life_stage else math.nan])
+    tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue")
+    writer.writerow(["Tissue", tissue if tissue else math.nan])
+    sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex")
+    writer.writerow(["Sex", sex if sex else math.nan])
+
+    # Write assembly statistics
     writer.writerow(["##Assembly_Statistics"])
-    writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
-    if "total_number_of_chromosomes" in stats:
-        writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
-    writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
-    writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
-    writer.writerow(["Contigs", stats["number_of_contigs"]])
-    writer.writerow(["Contig_N50", stats["contig_n50"]])
-    writer.writerow(["GC_Percent", stats["gc_percent"]])
+    writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)])
+    writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)])
+    writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)])
+    writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)])
+    writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)])
+    writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)])
+    writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)])
+
     chromosome_header = False
     for mol in seq:
-        if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
+        if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear":
             if not chromosome_header:
                 writer.writerow(["##Chromosome", "Length", "GC_Percent"])
                 chromosome_header = True
             writer.writerow(
                 [
-                    mol["chr_name"],
-                    round(mol["length"] / 1000000, 2),
-                    mol["gc_percent"],
+                    mol.get("chr_name", math.nan),
+                    round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
+                    mol.get("gc_percent", math.nan),
                 ]
             )
+
     organelle_header = False
     for mol in seq:
-        if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
+        if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear":
             if not organelle_header:
                 writer.writerow(["##Organelle", "Length", "GC_Percent"])
                 organelle_header = True
             writer.writerow(
                 [
-                    mol["assigned_molecule_location_type"],
-                    round(mol["length"] / 1000000, 2),
-                    mol["gc_percent"],
+                    mol.get("assigned_molecule_location_type", math.nan),
+                    round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
+                    mol.get("gc_percent", math.nan),
                 ]
             )
 
 
 def extract_busco(file_in, writer):
+    """
+    Extracts BUSCO information from a JSON file and writes it to a CSV file.
+
+    Parameters:
+    file_in (str): Path to the BUSCO summary JSON file.
+    writer (csv.writer): CSV writer object to write the extracted data.
+    """
     with open(file_in, "r") as fin:
         data = json.load(fin)
 
-    writer.writerow(["##BUSCO", data["lineage_dataset"]["name"]])
-    writer.writerow(["Summary", data["results"]["one_line_summary"]])
+    lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan)
+    results_summary = data.get("results", {}).get("one_line_summary", math.nan)
+
+    writer.writerow(["##BUSCO", lineage_dataset_name])
+    writer.writerow(["Summary", results_summary])
 
 
 def extract_pacbio(qv, completeness, writer):
+    """
+    Extracts QV and completeness information from TSV files and writes it to a
+    CSV file.
+
+    NOTE: completeness and qv files have to be from matching specimen names
+
+    Parameters:
+    qv (list): List of paths to one or more QV TSV files.
+    completeness (list): List of paths to completeness stats TSV files.
+    writer (csv.writer): CSV writer object to write the extracted data.
+    """
     qval = 0
     qv_name = None
     for f in qv:
@@ -153,6 +192,15 @@ def extract_pacbio(qv, completeness, writer):
 
 
 def extract_mapped(sample, file_in, writer):
+    """
+    Extracts mapping information from a flagstat file and writes it to a CSV
+    file.
+
+    Parameters:
+    sample (str): Sample ID used for the HiC contact maps.
+    file_in (str): Path to the HiC flagstat file created by Samtools.
+    writer (csv.writer): CSV writer object to write the extracted data.
+    """
     writer.writerow(["##HiC", remove_sample_T_suffix(sample)])
     with open(file_in, "r") as fin:
         for line in fin:

diff --git a/nextflow.config b/nextflow.config
@@ -125,7 +125,7 @@ profiles {
     arm {
         docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64'
     }
-    singularity {        
+    singularity {
         singularity.enabled    = true
         singularity.autoMounts = true
         conda.enabled          = false
@@ -222,7 +222,7 @@ manifest {
     description     = """Creating standarised genome assembly publications"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=22.10.1'
-    version         = '1.2.0'
+    version         = '1.2.1'
     doi             = '10.5281/zenodo.7949384'
 }