diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fd10221..8fa13bc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.2.1](https://github.com/sanger-tol/genomenote/releases/tag/1.2.1)] [2024-07-12] + +### Enhancements & fixes + +- Bugfix: Now handles missing fields in `ncbi datasets` genome report + ## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-01] ### Enhancements & fixes diff --git a/CITATION.cff b/CITATION.cff index 091ab90b..8a350dff 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,7 @@ # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 -title: sanger-tol/genomenote v1.2.0 +title: sanger-tol/genomenote v1.2.1 message: >- If you use this software, please cite it using the metadata from this file. @@ -34,5 +34,5 @@ identifiers: repository-code: "https://github.com/sanger-tol/genomenote" license: MIT commit: TODO -version: 1.2.0 +version: 1.2.1 date-released: "2022-10-07" diff --git a/bin/create_table.py b/bin/create_table.py index dea6b94e..31ab9663 100755 --- a/bin/create_table.py +++ b/bin/create_table.py @@ -6,120 +6,159 @@ import sys import csv import re +import math def parse_args(args=None): Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats." parser = argparse.ArgumentParser(description=Description) - parser.add_argument("--genome", help="Input NCBI genome summary JSON file.", required=True) - parser.add_argument("--sequence", help="Input NCBI sequence summary JSON file.", required=True) + parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.") + parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.") parser.add_argument("--busco", help="Input BUSCO short summary JSON file.") parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.") parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.") parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.") parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.") - parser.add_argument("--outcsv", help="Output CSV file.", required=True) + parser.add_argument("--outcsv", required=True, help="Output CSV file.") parser.add_argument("--version", action="version", version="%(prog)s 3.1") return parser.parse_args(args) def make_dir(path): + """ + Creates a directory if it doesn't exist. + + Parameters: + path (str): Path of the directory to be created. + """ if len(path) > 0: os.makedirs(path, exist_ok=True) -# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to the sample names +# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names # We usually don't want it in the final output def remove_sample_T_suffix(name): + """ + Removes the suffix like "_T1", "_T2", etc., from sample names. + + Parameters: + name (str): Sample name to be processed. + + Returns: + str: Sample name with the suffix removed. + """ return re.sub(r"_T\d+", "", name) def ncbi_stats(genome_in, seq_in, writer): + """ + Extracts and writes assembly information and statistics from genome and + sequence JSON files to a CSV file. + + Parameters: + genome_in (str): Path to the NCBI genome summary JSON file. + seq_in (str): Path to the NCBI sequence summary JSON file. + writer (csv.writer): CSV writer object to write the extracted data. + """ with open(genome_in, "r") as fin1: data = json.load(fin1) + data = data.get("reports", [{}])[0] + with open(seq_in, "r") as fin2: - seq = json.load(fin2) + seq = json.load(fin2).get("reports", []) - data = data["reports"][0] - info = data["assembly_info"] - attr = info["biosample"]["attributes"] - stats = data["assembly_stats"] - seq = seq["reports"] + info = data.get("assembly_info", {}) + attr = info.get("biosample", {}).get("attributes", []) + stats = data.get("assembly_stats", {}) + organism = data.get("organism", {}) + # Write assembly information writer.writerow(["##Assembly_Information"]) - writer.writerow(["Accession", data["accession"]]) - if "common_name" in data["organism"]: - writer.writerow(["Common_Name", data["organism"]["common_name"]]) - writer.writerow(["Organism_Name", data["organism"]["organism_name"]]) - writer.writerow( - [ - "ToL_ID", - "".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid"), - ] - ) - writer.writerow(["Taxon_ID", data["organism"]["tax_id"]]) - writer.writerow(["Assembly_Name", info["assembly_name"]]) - writer.writerow(["Assembly_Level", info["assembly_level"]]) - writer.writerow( - [ - "Life_Stage", - "".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage"), - ] - ) - writer.writerow( - [ - "Tissue", - "".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue"), - ] - ) - writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")]) + writer.writerow(["Accession", data.get("accession", math.nan)]) + writer.writerow(["Common_Name", organism.get("common_name", math.nan)]) + writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)]) + tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid") + writer.writerow(["ToL_ID", tol_id if tol_id else math.nan]) + writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)]) + writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)]) + writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)]) + life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage") + writer.writerow(["Life_Stage", life_stage if life_stage else math.nan]) + tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue") + writer.writerow(["Tissue", tissue if tissue else math.nan]) + sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex") + writer.writerow(["Sex", sex if sex else math.nan]) + + # Write assembly statistics writer.writerow(["##Assembly_Statistics"]) - writer.writerow(["Total_Sequence", stats["total_sequence_length"]]) - if "total_number_of_chromosomes" in stats: - writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]]) - writer.writerow(["Scaffolds", stats["number_of_scaffolds"]]) - writer.writerow(["Scaffold_N50", stats["scaffold_n50"]]) - writer.writerow(["Contigs", stats["number_of_contigs"]]) - writer.writerow(["Contig_N50", stats["contig_n50"]]) - writer.writerow(["GC_Percent", stats["gc_percent"]]) + writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)]) + writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)]) + writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)]) + writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)]) + writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)]) + writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)]) + writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)]) + chromosome_header = False for mol in seq: - if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear": + if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear": if not chromosome_header: writer.writerow(["##Chromosome", "Length", "GC_Percent"]) chromosome_header = True writer.writerow( [ - mol["chr_name"], - round(mol["length"] / 1000000, 2), - mol["gc_percent"], + mol.get("chr_name", math.nan), + round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan, + mol.get("gc_percent", math.nan), ] ) + organelle_header = False for mol in seq: - if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear": + if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear": if not organelle_header: writer.writerow(["##Organelle", "Length", "GC_Percent"]) organelle_header = True writer.writerow( [ - mol["assigned_molecule_location_type"], - round(mol["length"] / 1000000, 2), - mol["gc_percent"], + mol.get("assigned_molecule_location_type", math.nan), + round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan, + mol.get("gc_percent", math.nan), ] ) def extract_busco(file_in, writer): + """ + Extracts BUSCO information from a JSON file and writes it to a CSV file. + + Parameters: + file_in (str): Path to the BUSCO summary JSON file. + writer (csv.writer): CSV writer object to write the extracted data. + """ with open(file_in, "r") as fin: data = json.load(fin) - writer.writerow(["##BUSCO", data["lineage_dataset"]["name"]]) - writer.writerow(["Summary", data["results"]["one_line_summary"]]) + lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan) + results_summary = data.get("results", {}).get("one_line_summary", math.nan) + + writer.writerow(["##BUSCO", lineage_dataset_name]) + writer.writerow(["Summary", results_summary]) def extract_pacbio(qv, completeness, writer): + """ + Extracts QV and completeness information from TSV files and writes it to a + CSV file. + + NOTE: completeness and qv files have to be from matching specimen names + + Parameters: + qv (list): List of paths to one or more QV TSV files. + completeness (list): List of paths to completeness stats TSV files. + writer (csv.writer): CSV writer object to write the extracted data. + """ qval = 0 qv_name = None for f in qv: @@ -153,6 +192,15 @@ def extract_pacbio(qv, completeness, writer): def extract_mapped(sample, file_in, writer): + """ + Extracts mapping information from a flagstat file and writes it to a CSV + file. + + Parameters: + sample (str): Sample ID used for the HiC contact maps. + file_in (str): Path to the HiC flagstat file created by Samtools. + writer (csv.writer): CSV writer object to write the extracted data. + """ writer.writerow(["##HiC", remove_sample_T_suffix(sample)]) with open(file_in, "r") as fin: for line in fin: diff --git a/nextflow.config b/nextflow.config index 6dd519b5..9cfa62ec 100644 --- a/nextflow.config +++ b/nextflow.config @@ -125,7 +125,7 @@ profiles { arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } - singularity { + singularity { singularity.enabled = true singularity.autoMounts = true conda.enabled = false @@ -222,7 +222,7 @@ manifest { description = """Creating standarised genome assembly publications""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.2.0' + version = '1.2.1' doi = '10.5281/zenodo.7949384' }