Merge pull request #83 from sanger-tol/busco_tax_ids

Get the BUSCO ODB name from the BUSCO definition file rather than GoaT
sanger-tol · Oct 18, 2023 · bdc772c · bdc772c
2 parents c792b14 + 7a6f3ff
commit bdc772c
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@
 3. Filter BED ([`GNU sort`](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html), [`filter bed`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/filter_bed.sh))
 4. Contact maps ([`Cooler cload`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-cload-pairs), [`Cooler zoomify`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-zoomify), [`Cooler dump`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-dump))
 5. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/))
-6. Genome completeness ([`GoaT API`](https://goat.genomehubs.org/api-docs/), [`BUSCO`](https://busco.ezlab.org))
+6. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org))
 7. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK))
 8. Collated summary table ([`createtable`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/create_table.py))
 9. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))

diff --git a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
@@ -0,0 +1,67 @@
+422676	aconoidasida
+7898	actinopterygii
+5338	agaricales
+155619	agaricomycetes
+33630	alveolata
+5794	apicomplexa
+6854	arachnida
+6656	arthropoda
+4890	ascomycota
+8782	aves
+5204	basidiomycota
+68889	boletales
+3699	brassicales
+134362	capnodiales
+33554	carnivora
+91561	cetartiodactyla
+34395	chaetothyriales
+3041	chlorophyta
+5796	coccidia
+28738	cyprinodontiformes
+7147	diptera
+147541	dothideomycetes
+3193	embryophyta
+33392	endopterygota
+314146	euarchontoglires
+33682	euglenozoa
+2759	eukaryota
+5042	eurotiales
+147545	eurotiomycetes
+9347	eutheria
+72025	fabales
+4751	fungi
+314147	glires
+1028384	glomerellales
+5178	helotiales
+7524	hemiptera
+7399	hymenoptera
+5125	hypocreales
+50557	insecta
+314145	laurasiatheria
+147548	leotiomycetes
+7088	lepidoptera
+4447	liliopsida
+40674	mammalia
+33208	metazoa
+6029	microsporidia
+6447	mollusca
+4827	mucorales
+1913637	mucoromycota
+6231	nematoda
+33183	onygenales
+9126	passeriformes
+5820	plasmodium
+92860	pleosporales
+38820	poales
+5303	polyporales
+9443	primates
+4891	saccharomycetes
+8457	sauropsida
+4069	solanales
+147550	sordariomycetes
+33634	stramenopiles
+32523	tetrapoda
+155616	tremellomycetes
+7742	vertebrata
+33090	viridiplantae
+71240	eudicots
diff --git a/bin/get_odb.py b/bin/get_odb.py
@@ -8,11 +8,15 @@
 import re
 
 
+NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
+
+
 def parse_args(args=None):
-    Description = "Get ODB database value using GOAT API"
+    Description = "Get ODB database value using NCBI API and BUSCO configuration file"
 
     parser = argparse.ArgumentParser(description=Description)
-    parser.add_argument("ASSEMBLY", help="Assembly GCA value.")
+    parser.add_argument("NCBI_SUMMARY_JSON", help="NCBI entry for this assembly for this assembly (in JSON).")
+    parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
     parser.add_argument("FILE_OUT", help="Output CSV file.")
     parser.add_argument("--version", action="version", version="%(prog)s 1.0")
     return parser.parse_args(args)
@@ -23,17 +27,29 @@ def make_dir(path):
         os.makedirs(path, exist_ok=True)
 
 
-def get_odb(asm, file_out):
-    # Using API, get JSON file with all associated ODB10 databases
-    response = requests.get(
-        "https://goat.genomehubs.org/api/v2/search?query=tax_lineage%28queryA.taxon_id%29&queryA=assembly--assembly_id%3D"
-        + asm
-        + "&result=taxon&fields=odb10_lineage"
-    ).json()["results"]
-    # Extract ODB10 lineage values
-    odb_arr = [r["result"]["fields"]["odb10_lineage"]["value"] for r in response]
-    # The closest [0] OBD10 lineage is selected, unless one of the lineage values is "eutheria", then choose "eutheria"
-    odb_val = "eutheria_odb10" if "eutheria_odb10" in odb_arr else odb_arr[0]
+def get_odb(ncbi_summary, lineage_tax_ids, file_out):
+    # Read the mapping between the BUSCO lineages and their taxon_id
+    with open(lineage_tax_ids) as file_in:
+        lineage_tax_ids_dict = {}
+        for line in file_in:
+            arr = line.split()
+            lineage_tax_ids_dict[int(arr[0])] = arr[1]
+
+    # Get the taxon_id of this species / assembly
+    with open(ncbi_summary) as file_in:
+        data = json.load(file_in)
+    tax_id = data["reports"][0]["organism"]["tax_id"]
+
+    # Using API, get the taxon_ids of all parents
+    response = requests.get(NCBI_TAXONOMY_API % tax_id).json()
+    ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]
+
+    # Do the intersection to find the ancestors that have a BUSCO lineage
+    odb_arr = [lineage_tax_ids_dict[taxon_id] for taxon_id in ancestor_taxon_ids if taxon_id in lineage_tax_ids_dict]
+
+    # The most recent [-1] OBD10 lineage is selected, unless one of the lineage values is "eutheria", then choose "eutheria"
+    # NOTE: this rule is a guess that hasn't been confirmed by Karen
+    odb_val = "eutheria_odb10" if "eutheria_odb10" in odb_arr else odb_arr[-1]
 
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)
@@ -44,7 +60,7 @@ def get_odb(asm, file_out):
 
 def main(args=None):
     args = parse_args(args)
-    get_odb(args.ASSEMBLY, args.FILE_OUT)
+    get_odb(args.NCBI_SUMMARY_JSON, args.LINEAGE_TAX_IDS, args.FILE_OUT)
 
 
 if __name__ == "__main__":

diff --git a/modules/local/goat/odb.nf → modules/local/ncbidatasets/get_odb.nf b/modules/local/goat/odb.nf → modules/local/ncbidatasets/get_odb.nf
@@ -1,4 +1,4 @@
-process GOAT_ODB {
+process NCBI_GET_ODB {
     tag "$meta.id"
     label 'process_single'
 
@@ -8,7 +8,8 @@ process GOAT_ODB {
         'biocontainers/requests:2.26.0' }"
 
     input:
-    tuple val(meta), path(fasta)
+    tuple val(meta), path(ncbi_summary)
+    path lineage_tax_ids
 
     output:
     tuple val(meta), path("*.busco_odb.csv"), emit: csv
@@ -21,7 +22,7 @@ process GOAT_ODB {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    get_odb.py ${prefix} ${prefix}.busco_odb.csv
+    get_odb.py $ncbi_summary $lineage_tax_ids ${prefix}.busco_odb.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/nextflow.config b/nextflow.config
@@ -16,6 +16,8 @@ params {
 
     // Database
     lineage_db                  = null
+    // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz
+    lineage_tax_ids             = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt"
 
     // References
     fasta                       = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -64,6 +64,13 @@
                     "format": "directory-path",
                     "description": "Local directory where clade-specific BUSCO lineage datasets are stored.",
                     "fa_icon": "fas fa-folder-open"
+                },
+                "lineage_tax_ids": {
+                    "type": "string",
+                    "format": "file-path",
+                    "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.",
+                    "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz",
+                    "fa_icon": "fas fa-file-code"
                 }
             },
             "fa_icon": "fas fa-database"

diff --git a/subworkflows/local/genome_statistics.nf b/subworkflows/local/genome_statistics.nf
@@ -4,7 +4,7 @@
 
 include { NCBIDATASETS_SUMMARYGENOME as SUMMARYGENOME   } from '../../modules/local/ncbidatasets/summarygenome'
 include { NCBIDATASETS_SUMMARYGENOME as SUMMARYSEQUENCE } from '../../modules/local/ncbidatasets/summarygenome'
-include { GOAT_ODB                                      } from '../../modules/local/goat/odb'
+include { NCBI_GET_ODB                                  } from '../../modules/local/ncbidatasets/get_odb'
 include { BUSCO                                         } from '../../modules/nf-core/busco/main'
 include { FASTK_FASTK                                   } from '../../modules/nf-core/fastk/fastk/main'
 include { MERQURYFK_MERQURYFK                           } from '../../modules/nf-core/merquryfk/merquryfk/main'
@@ -14,6 +14,7 @@ include { CREATETABLE                                   } from '../../modules/lo
 workflow GENOME_STATISTICS {
     take:
     genome                 // channel: [ meta, fasta ]
+    lineage_tax_ids        // channel: /path/to/lineage_tax_ids
     lineage_db             // channel: /path/to/buscoDB
     pacbio                 // channel: [ meta, kmer_db or reads ]
     flagstat               // channel: [ meta, flagstat ]
@@ -34,12 +35,12 @@ workflow GENOME_STATISTICS {
 
 
     // Get ODB lineage value
-    GOAT_ODB ( genome )
-    ch_versions = ch_versions.mix ( GOAT_ODB.out.versions.first() )
+    NCBI_GET_ODB ( SUMMARYGENOME.out.summary, lineage_tax_ids )
+    ch_versions = ch_versions.mix ( NCBI_GET_ODB.out.versions.first() )
 
 
     // BUSCO
-    GOAT_ODB.out.csv
+    NCBI_GET_ODB.out.csv
     | map { meta, csv -> csv }
     | splitCsv()
     | map { row -> row[1] }

diff --git a/workflows/genomenote.nf b/workflows/genomenote.nf
@@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
 WorkflowGenomenote.initialise(params, log)
 
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_db, params.fasta ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_db, params.fasta, params.lineage_tax_ids ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
@@ -19,8 +19,10 @@ if (params.fasta)     { ch_fasta = Channel.fromPath(params.fasta) } else { exit
 if (params.binsize)   { ch_bin   = Channel.of(params.binsize)     } else { exit 1, 'Bin size for cooler/cload not specified!' }
 if (params.kmer_size) { ch_kmer  = Channel.of(params.kmer_size)   } else { exit 1, 'Kmer library size for fastk not specified' }
 
+if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' }
+
 // Check optional parameters
-if (params.lineage_db) { ch_busco = Channel.fromPath(params.lineage_db) } else { ch_busco = Channel.empty() }
+if (params.lineage_db) { ch_lineage_db = Channel.fromPath(params.lineage_db) } else { ch_lineage_db = Channel.empty() }
 
 
 /*
@@ -123,7 +125,7 @@ workflow GENOMENOTE {
     }
     | set { ch_flagstat }
 
-    GENOME_STATISTICS ( ch_fasta, ch_busco, ch_inputs.pacbio, ch_flagstat )
+    GENOME_STATISTICS ( ch_fasta, ch_lineage_tax_ids, ch_lineage_db, ch_inputs.pacbio, ch_flagstat )
     ch_versions = ch_versions.mix ( GENOME_STATISTICS.out.versions )