Merge pull request #151 from sanger-tol/add_ensembl_metadata_check

Add ensembl metadata check
sanger-tol · Nov 20, 2024 · 3db5b58 · 3db5b58
2 parents 39b6690 + 019e9ae
commit 3db5b58
Show file tree

Hide file tree

Showing 6 changed files with 139 additions and 3 deletions.
diff --git a/assets/genome_note_template.docx b/assets/genome_note_template.docx
diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py
@@ -21,6 +21,7 @@
     ("COPO_BIOSAMPLE_HIC", "copo_biosample_hic_file"),
     ("COPO_BIOSAMPLE_RNA", "copo_biosample_rna_file"),
     ("GBIF_TAXONOMY", "gbif_taxonomy_file"),
+    ("ENSEMBL_ANNOTATION", "ensembl_annotation_file"),
 ]
 
 
@@ -42,6 +43,7 @@ def parse_args(args=None):
     parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False)
     parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False)
     parser.add_argument("--gbif_taxonomy_file", help="Input parsed GBIF taxonomy file.", required=False)
+    parser.add_argument("--ensembl_annotation_file", help="Input parsed Ensembl annotation file.", required=False)
     parser.add_argument("--out_consistent", help="Output file.", required=True)
     parser.add_argument("--out_inconsistent", help="Output file.", required=True)
     parser.add_argument("--version", action="version", version="%(prog)s 1.0")

diff --git a/bin/fetch_ensembl_metadata.py b/bin/fetch_ensembl_metadata.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import requests
+import argparse
+
+
+def parse_args(args=None):
+    Description = "Query the Ensembl Metadata API to pull out annotation information required by a genome note."
+    Epilog = "Example usage: python fetch_ensembl_metadata.py --taxon_id --output"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("--taxon_id", required=True, help="The species taxon id")
+    parser.add_argument("--output", required=True, help="Output file path")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+
+    return parser.parse_args()
+
+
+def make_dir(path):
+    if len(path) > 0:
+        os.makedirs(path, exist_ok=True)
+
+
+def fetch_ensembl_data(taxon, output_file):
+    # Use the species taxon_id to query the Ensembl Metadata API to determine if the
+    # species has been annotated. Return assmbly accesssion of annotated data and
+    # a url linking to that species on the Ensembl Rapid website
+
+    url = "https://beta.ensembl.org/data/graphql"
+    variables = {"taxon": taxon}
+    query = """
+    query Annotation($taxon: String)
+    {
+        genomes(by_keyword: {species_taxonomy_id: $taxon }) {
+            assembly_accession
+            scientific_name
+            tol_id
+            dataset {
+                name
+                type
+                dataset_type
+            }
+            genome_id
+        }
+    }
+    """
+    response = requests.post(url=url, json={"query": query, "variables": variables})
+
+    if response.status_code == 200:
+        param_list = []
+        data = response.json()
+        if data["data"]["genomes"] is not None:
+            genomes = data["data"]["genomes"][0]
+
+            if genomes["assembly_accession"]:
+                accession = genomes["assembly_accession"]
+                acc = f'"{accession}"'
+                param_list.append(("ANNOT_ACCESSION", acc))
+                species_id = genomes["genome_id"]
+                annot_url = f"https://beta.ensembl.org/species/{species_id}"
+                annot_url = f'"{annot_url}"'
+                param_list.append(("ANNOT_URL", annot_url))
+
+    # Write out file even if there is no annotation data to write
+    out_dir = os.path.dirname(output_file)
+    make_dir(out_dir)  # Create directory if it does not exist
+
+    with open(output_file, "w") as fout:
+        # Write header
+        fout.write(",".join(["#paramName", "paramValue"]) + "\n")
+
+        for param_pair in param_list:
+            fout.write(",".join(param_pair) + "\n")
+
+    return output_file
+
+
+def main(args=None):
+    args = parse_args(args)
+    fetch_ensembl_data(args.taxon_id, args.output)
+
+
+# Run only when executed as a script. main() returns None, so the process
+# exits with status 0 unless an exception propagates.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/fetch_gbif_metadata.py b/bin/fetch_gbif_metadata.py
@@ -7,8 +7,8 @@
 
 
 def parse_args(args=None):
-    Description = "Parse contents of an ENA Taxonomy report and pull out metadata required by a genome note."
-    Epilog = "Example usage: python fetch_gbif_metadata.py --genus --species --output"
+    Description = "Query GBIF for species taxonomy information and pull out metadata required by a genome note."
+    Epilog = "Example usage: python fetch_gbif_metadata.py --species --output"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
     parser.add_argument("--species", required=True, help="The species name")

diff --git a/modules/local/fetch_ensembl_metadata.nf b/modules/local/fetch_ensembl_metadata.nf
@@ -0,0 +1,34 @@
+// Query the Ensembl metadata API for a species taxon id and emit a CSV
+// named "<assembly>_ensembl_annotation.csv" together with a versions.yml.
+process FETCH_ENSEMBL_METADATA {
+    tag "$assembly"
+    label 'process_single'
+
+    // fetch_ensembl_metadata.py imports `requests`, so the conda environment
+    // must provide it, matching the requests:2.26.0 containers below.
+    conda "conda-forge::python=3.9.1 conda-forge::requests=2.26.0"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/requests:2.26.0':
+        'quay.io/biocontainers/requests:2.26.0'}"
+
+    input:
+    tuple val(assembly), val(taxon_id)
+
+
+    output:
+    path "*.csv", emit: file_path
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def script_name = "fetch_ensembl_metadata.py"
+    def output_file = "${assembly}_ensembl_annotation.csv"
+
+    """
+    $script_name --taxon_id $taxon_id --output $output_file
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fetch_ensembl_metadata.py: \$(fetch_ensembl_metadata.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/genome_metadata.nf b/subworkflows/local/genome_metadata.nf
@@ -8,6 +8,7 @@ include { RUN_WGET                  }       from '../../modules/local/run_wget'
 include { PARSE_METADATA            }       from '../../modules/local/parse_metadata'
 include { COMBINE_METADATA          }       from '../../modules/local/combine_metadata'
 include { FETCH_GBIF_METADATA       }       from '../../modules/local/fetch_gbif_metadata'
+include { FETCH_ENSEMBL_METADATA    }       from '../../modules/local/fetch_ensembl_metadata'
 
 
 workflow GENOME_METADATA {
@@ -92,11 +93,24 @@ workflow GENOME_METADATA {
     | map { it -> tuple( it )}
     | set { ch_gbif }
 
+
+   ch_file_list
+    | map { meta, it -> 
+        def assembly = meta.id
+        def taxon_id = meta.taxon_id
+        [assembly, taxon_id]
+    }
+    | set { ch_ensembl_params} 
+
+    // Query Ensembl Metadata API to see if this species has been annotated
+    FETCH_ENSEMBL_METADATA ( ch_ensembl_params )
+    ch_versions = ch_versions.mix( FETCH_ENSEMBL_METADATA.out.versions.first() )    
+
     PARSE_METADATA.out.file_path
     | map { it -> tuple( it[1] )}
     | set { ch_parsed }
 
-    ch_parsed.mix(ch_gbif)
+    ch_parsed.mix(ch_gbif, FETCH_ENSEMBL_METADATA.out.file_path)
     | collect  
     | map { it ->  
         [ it ]