Skip to content

Commit

Permalink
Merge pull request #83 from sanger-tol/busco_tax_ids
Browse files Browse the repository at this point in the history
Get the BUSCO ODB name from the BUSCO definition file rather than GoaT
  • Loading branch information
muffato authored Oct 18, 2023
2 parents c792b14 + 7a6f3ff commit bdc772c
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 25 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
3. Filter BED ([`GNU sort`](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html), [`filter bed`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/filter_bed.sh))
4. Contact maps ([`Cooler cload`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-cload-pairs), [`Cooler zoomify`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-zoomify), [`Cooler dump`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-dump))
5. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/))
6. Genome completeness ([`GoaT API`](https://goat.genomehubs.org/api-docs/), [`BUSCO`](https://busco.ezlab.org))
6. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org))
7. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK))
8. Collated summary table ([`createtable`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/create_table.py))
9. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
422676 aconoidasida
7898 actinopterygii
5338 agaricales
155619 agaricomycetes
33630 alveolata
5794 apicomplexa
6854 arachnida
6656 arthropoda
4890 ascomycota
8782 aves
5204 basidiomycota
68889 boletales
3699 brassicales
134362 capnodiales
33554 carnivora
91561 cetartiodactyla
34395 chaetothyriales
3041 chlorophyta
5796 coccidia
28738 cyprinodontiformes
7147 diptera
147541 dothideomycetes
3193 embryophyta
33392 endopterygota
314146 euarchontoglires
33682 euglenozoa
2759 eukaryota
5042 eurotiales
147545 eurotiomycetes
9347 eutheria
72025 fabales
4751 fungi
314147 glires
1028384 glomerellales
5178 helotiales
7524 hemiptera
7399 hymenoptera
5125 hypocreales
50557 insecta
314145 laurasiatheria
147548 leotiomycetes
7088 lepidoptera
4447 liliopsida
40674 mammalia
33208 metazoa
6029 microsporidia
6447 mollusca
4827 mucorales
1913637 mucoromycota
6231 nematoda
33183 onygenales
9126 passeriformes
5820 plasmodium
92860 pleosporales
38820 poales
5303 polyporales
9443 primates
4891 saccharomycetes
8457 sauropsida
4069 solanales
147550 sordariomycetes
33634 stramenopiles
32523 tetrapoda
155616 tremellomycetes
7742 vertebrata
33090 viridiplantae
71240 eudicots
44 changes: 30 additions & 14 deletions bin/get_odb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
import re


NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"


def parse_args(args=None):
Description = "Get ODB database value using GOAT API"
Description = "Get ODB database value using NCBI API and BUSCO configuration file"

parser = argparse.ArgumentParser(description=Description)
parser.add_argument("ASSEMBLY", help="Assembly GCA value.")
parser.add_argument("NCBI_SUMMARY_JSON", help="NCBI entry for this assembly for this assembly (in JSON).")
parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
parser.add_argument("FILE_OUT", help="Output CSV file.")
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
return parser.parse_args(args)
Expand All @@ -23,17 +27,29 @@ def make_dir(path):
os.makedirs(path, exist_ok=True)


def get_odb(asm, file_out):
# Using API, get JSON file with all associated ODB10 databases
response = requests.get(
"https://goat.genomehubs.org/api/v2/search?query=tax_lineage%28queryA.taxon_id%29&queryA=assembly--assembly_id%3D"
+ asm
+ "&result=taxon&fields=odb10_lineage"
).json()["results"]
# Extract ODB10 lineage values
odb_arr = [r["result"]["fields"]["odb10_lineage"]["value"] for r in response]
# The closest [0] OBD10 lineage is selected, unless one of the lineage values is "eutheria", then choose "eutheria"
odb_val = "eutheria_odb10" if "eutheria_odb10" in odb_arr else odb_arr[0]
def get_odb(ncbi_summary, lineage_tax_ids, file_out):
# Read the mapping between the BUSCO lineages and their taxon_id
with open(lineage_tax_ids) as file_in:
lineage_tax_ids_dict = {}
for line in file_in:
arr = line.split()
lineage_tax_ids_dict[int(arr[0])] = arr[1]

# Get the taxon_id of this species / assembly
with open(ncbi_summary) as file_in:
data = json.load(file_in)
tax_id = data["reports"][0]["organism"]["tax_id"]

# Using API, get the taxon_ids of all parents
response = requests.get(NCBI_TAXONOMY_API % tax_id).json()
ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]

# Do the intersection to find the ancestors that have a BUSCO lineage
odb_arr = [lineage_tax_ids_dict[taxon_id] for taxon_id in ancestor_taxon_ids if taxon_id in lineage_tax_ids_dict]

# The most recent [-1] OBD10 lineage is selected, unless one of the lineage values is "eutheria", then choose "eutheria"
# NOTE: this rule is a guess that hasn't been confirmed by Karen
odb_val = "eutheria_odb10" if "eutheria_odb10" in odb_arr else odb_arr[-1]

out_dir = os.path.dirname(file_out)
make_dir(out_dir)
Expand All @@ -44,7 +60,7 @@ def get_odb(asm, file_out):

def main(args=None):
args = parse_args(args)
get_odb(args.ASSEMBLY, args.FILE_OUT)
get_odb(args.NCBI_SUMMARY_JSON, args.LINEAGE_TAX_IDS, args.FILE_OUT)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
process GOAT_ODB {
process NCBI_GET_ODB {
tag "$meta.id"
label 'process_single'

Expand All @@ -8,7 +8,8 @@ process GOAT_ODB {
'biocontainers/requests:2.26.0' }"

input:
tuple val(meta), path(fasta)
tuple val(meta), path(ncbi_summary)
path lineage_tax_ids

output:
tuple val(meta), path("*.busco_odb.csv"), emit: csv
Expand All @@ -21,7 +22,7 @@ process GOAT_ODB {
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
get_odb.py ${prefix} ${prefix}.busco_odb.csv
get_odb.py $ncbi_summary $lineage_tax_ids ${prefix}.busco_odb.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ params {

// Database
lineage_db = null
// Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz
lineage_tax_ids = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt"

// References
fasta = null
Expand Down
7 changes: 7 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@
"format": "directory-path",
"description": "Local directory where clade-specific BUSCO lineage datasets are stored.",
"fa_icon": "fas fa-folder-open"
},
"lineage_tax_ids": {
"type": "string",
"format": "file-path",
"description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.",
"help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz",
"fa_icon": "fas fa-file-code"
}
},
"fa_icon": "fas fa-database"
Expand Down
9 changes: 5 additions & 4 deletions subworkflows/local/genome_statistics.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

include { NCBIDATASETS_SUMMARYGENOME as SUMMARYGENOME } from '../../modules/local/ncbidatasets/summarygenome'
include { NCBIDATASETS_SUMMARYGENOME as SUMMARYSEQUENCE } from '../../modules/local/ncbidatasets/summarygenome'
include { GOAT_ODB } from '../../modules/local/goat/odb'
include { NCBI_GET_ODB } from '../../modules/local/ncbidatasets/get_odb'
include { BUSCO } from '../../modules/nf-core/busco/main'
include { FASTK_FASTK } from '../../modules/nf-core/fastk/fastk/main'
include { MERQURYFK_MERQURYFK } from '../../modules/nf-core/merquryfk/merquryfk/main'
Expand All @@ -14,6 +14,7 @@ include { CREATETABLE } from '../../modules/lo
workflow GENOME_STATISTICS {
take:
genome // channel: [ meta, fasta ]
lineage_tax_ids // channel: /path/to/lineage_tax_ids
lineage_db // channel: /path/to/buscoDB
pacbio // channel: [ meta, kmer_db or reads ]
flagstat // channel: [ meta, flagstat ]
Expand All @@ -34,12 +35,12 @@ workflow GENOME_STATISTICS {


// Get ODB lineage value
GOAT_ODB ( genome )
ch_versions = ch_versions.mix ( GOAT_ODB.out.versions.first() )
NCBI_GET_ODB ( SUMMARYGENOME.out.summary, lineage_tax_ids )
ch_versions = ch_versions.mix ( NCBI_GET_ODB.out.versions.first() )


// BUSCO
GOAT_ODB.out.csv
NCBI_GET_ODB.out.csv
| map { meta, csv -> csv }
| splitCsv()
| map { row -> row[1] }
Expand Down
8 changes: 5 additions & 3 deletions workflows/genomenote.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
WorkflowGenomenote.initialise(params, log)

// Check input path parameters to see if they exist
def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_db, params.fasta ]
def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_db, params.fasta, params.lineage_tax_ids ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

// Check mandatory parameters
Expand All @@ -19,8 +19,10 @@ if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit
if (params.binsize) { ch_bin = Channel.of(params.binsize) } else { exit 1, 'Bin size for cooler/cload not specified!' }
if (params.kmer_size) { ch_kmer = Channel.of(params.kmer_size) } else { exit 1, 'Kmer library size for fastk not specified' }

if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' }

// Check optional parameters
if (params.lineage_db) { ch_busco = Channel.fromPath(params.lineage_db) } else { ch_busco = Channel.empty() }
if (params.lineage_db) { ch_lineage_db = Channel.fromPath(params.lineage_db) } else { ch_lineage_db = Channel.empty() }


/*
Expand Down Expand Up @@ -123,7 +125,7 @@ workflow GENOMENOTE {
}
| set { ch_flagstat }

GENOME_STATISTICS ( ch_fasta, ch_busco, ch_inputs.pacbio, ch_flagstat )
GENOME_STATISTICS ( ch_fasta, ch_lineage_tax_ids, ch_lineage_db, ch_inputs.pacbio, ch_flagstat )
ch_versions = ch_versions.mix ( GENOME_STATISTICS.out.versions )


Expand Down

0 comments on commit bdc772c

Please sign in to comment.