Skip to content

Commit

Permalink
Merge pull request #151 from sanger-tol/add_ensembl_metadata_check
Browse files Browse the repository at this point in the history
Add ensembl metadata check
  • Loading branch information
BethYates authored Nov 20, 2024
2 parents 39b6690 + 019e9ae commit 3db5b58
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 3 deletions.
Binary file modified assets/genome_note_template.docx
Binary file not shown.
2 changes: 2 additions & 0 deletions bin/combine_parsed_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
("COPO_BIOSAMPLE_HIC", "copo_biosample_hic_file"),
("COPO_BIOSAMPLE_RNA", "copo_biosample_rna_file"),
("GBIF_TAXONOMY", "gbif_taxonomy_file"),
("ENSEMBL_ANNOTATION", "ensembl_annotation_file"),
]


Expand All @@ -42,6 +43,7 @@ def parse_args(args=None):
parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False)
parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False)
parser.add_argument("--gbif_taxonomy_file", help="Input parsed GBIF taxonomy file.", required=False)
parser.add_argument("--ensembl_annotation_file", help="Input parsed Ensembl annotation file.", required=False)
parser.add_argument("--out_consistent", help="Output file.", required=True)
parser.add_argument("--out_inconsistent", help="Output file.", required=True)
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
Expand Down
86 changes: 86 additions & 0 deletions bin/fetch_ensembl_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python3

import os
import sys
import requests
import argparse


def parse_args(args=None):
    """Parse the command-line options of the Ensembl metadata fetcher.

    Args:
        args: Optional list of argument strings to parse. When ``None``,
            argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: carries the ``taxon_id`` and ``output`` values.
    """
    description = "Query the Ensembl Metadata API to pull out annotation information required by a genome note."
    epilog = "Example usage: python fetch_ensembl_metadata.py --taxon_id --output"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("--taxon_id", required=True, help="The species taxon id")
    parser.add_argument("--output", required=True, help="Output file path")
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")

    # Forward the caller-supplied list; calling parse_args() without it would
    # silently ignore ``args`` and always read sys.argv.
    return parser.parse_args(args)


def make_dir(path):
    """Create the directory ``path``; an empty path is a no-op.

    Missing parent directories are created as needed and an already
    existing directory is not treated as an error.
    """
    if len(path) == 0:
        # Nothing to create, e.g. the dirname of a bare file name.
        return
    os.makedirs(path, exist_ok=True)


def _annotation_params(payload):
    """Convert a decoded Ensembl GraphQL reply into (paramName, paramValue) rows.

    Returns an empty list when the reply carries no genome entry (``data`` or
    ``genomes`` missing, null or empty) or when the first genome has no
    assembly accession.
    """
    genomes = ((payload or {}).get("data") or {}).get("genomes")
    if not genomes:
        return []

    # Only the first genome returned for the taxon is reported.
    genome = genomes[0]
    accession = genome.get("assembly_accession")
    if not accession:
        return []

    annot_url = f"https://beta.ensembl.org/species/{genome['genome_id']}"
    # Values are wrapped in double quotes before being written to the CSV.
    return [
        ("ANNOT_ACCESSION", f'"{accession}"'),
        ("ANNOT_URL", f'"{annot_url}"'),
    ]


def fetch_ensembl_data(taxon, output_file):
    """Query the Ensembl Metadata API for annotation data and write it as CSV.

    Uses the species taxon id to ask the Ensembl Metadata API whether the
    species has been annotated. When it has, the assembly accession of the
    annotated data and a URL to that species on the Ensembl beta site are
    written as ``ANNOT_ACCESSION`` and ``ANNOT_URL`` rows.

    The output file is always written: if the request does not return
    HTTP 200, or no annotation is found, it contains only the header line.

    Args:
        taxon: Species taxonomy id, passed to the API as a GraphQL String.
        output_file: Path of the CSV file to write. Its parent directory is
            created if needed.

    Returns:
        The ``output_file`` path that was written.
    """
    url = "https://beta.ensembl.org/data/graphql"
    variables = {"taxon": taxon}
    query = """
    query Annotation($taxon: String)
    {
        genomes(by_keyword: {species_taxonomy_id: $taxon }) {
            assembly_accession
            scientific_name
            tol_id
            dataset {
                name
                type
                dataset_type
            }
            genome_id
        }
    }
    """
    response = requests.post(url=url, json={"query": query, "variables": variables})

    # Start from an empty list so the header-only file can still be written
    # when the request fails or nothing is annotated.
    param_list = []
    if response.status_code == 200:
        param_list = _annotation_params(response.json())
    else:
        print(
            f"Ensembl Metadata API returned HTTP {response.status_code}; "
            "no annotation data will be written.",
            file=sys.stderr,
        )

    # Write out file even if there is no annotation data to write
    out_dir = os.path.dirname(output_file)
    make_dir(out_dir)  # Create directory if it does not exist

    with open(output_file, "w") as fout:
        # Write header
        fout.write(",".join(["#paramName", "paramValue"]) + "\n")

        for param_pair in param_list:
            fout.write(",".join(param_pair) + "\n")

    return output_file


def main(args=None):
    """Script entry point: parse the CLI options, then fetch and write the metadata."""
    options = parse_args(args)
    fetch_ensembl_data(taxon=options.taxon_id, output_file=options.output)


if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
4 changes: 2 additions & 2 deletions bin/fetch_gbif_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@


def parse_args(args=None):
Description = "Parse contents of an ENA Taxonomy report and pull out metadata required by a genome note."
Epilog = "Example usage: python fetch_gbif_metadata.py --genus --species --output"
Description = "Query GBIF for species taxonomy information and pull out metadata required by a genome note."
Epilog = "Example usage: python fetch_gbif_metadata.py --species --output"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("--species", required=True, help="The species name")
Expand Down
34 changes: 34 additions & 0 deletions modules/local/fetch_ensembl_metadata.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Query the Ensembl Metadata API for one assembly/taxon pair and emit the
// resulting "<assembly>_ensembl_annotation.csv" parameter file.
process FETCH_ENSEMBL_METADATA {
    tag "$assembly"
    label 'process_single'

    // fetch_ensembl_metadata.py imports `requests`, so the conda environment
    // must provide it as well as python; the version matches the containers below.
    conda "conda-forge::python=3.9.1 conda-forge::requests=2.26.0"

    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/requests:2.26.0':
        'quay.io/biocontainers/requests:2.26.0'}"

    input:
    // assembly: identifier used for the tag and the output file name
    // taxon_id: species taxonomy id passed to the script
    tuple val(assembly), val(taxon_id)

    output:
    path "*.csv"       , emit: file_path
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def script_name = "fetch_ensembl_metadata.py"
    def output_file = "${assembly}_ensembl_annotation.csv"

    """
    $script_name --taxon_id $taxon_id --output $output_file

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        fetch_ensembl_metadata.py: \$(fetch_ensembl_metadata.py --version | cut -d' ' -f2)
    END_VERSIONS
    """
}
16 changes: 15 additions & 1 deletion subworkflows/local/genome_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ include { RUN_WGET } from '../../modules/local/run_wget'
include { PARSE_METADATA } from '../../modules/local/parse_metadata'
include { COMBINE_METADATA } from '../../modules/local/combine_metadata'
include { FETCH_GBIF_METADATA } from '../../modules/local/fetch_gbif_metadata'
include { FETCH_ENSEMBL_METADATA } from '../../modules/local/fetch_ensembl_metadata'


workflow GENOME_METADATA {
Expand Down Expand Up @@ -92,11 +93,24 @@ workflow GENOME_METADATA {
| map { it -> tuple( it )}
| set { ch_gbif }


ch_file_list
| map { meta, it ->
def assembly = meta.id
def taxon_id = meta.taxon_id
[assembly, taxon_id]
}
| set { ch_ensembl_params}

// Query Ensembl Metadata API to see if this species has been annotated
FETCH_ENSEMBL_METADATA ( ch_ensembl_params )
ch_versions = ch_versions.mix( FETCH_ENSEMBL_METADATA.out.versions.first() )

PARSE_METADATA.out.file_path
| map { it -> tuple( it[1] )}
| set { ch_parsed }

ch_parsed.mix(ch_gbif)
ch_parsed.mix(ch_gbif, FETCH_ENSEMBL_METADATA.out.file_path)
| collect
| map { it ->
[ it ]
Expand Down

0 comments on commit 3db5b58

Please sign in to comment.