-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #145 from sanger-tol/public_dev
Release 2.0
- Loading branch information
Showing
49 changed files
with
3,402 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,3 +20,4 @@ lint: | |
multiqc_config: | ||
- report_comment | ||
actions_ci: false | ||
template_strings: False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#File_source,File_type,Url,Output_type | ||
ENA,Assembly,https://www.ebi.ac.uk/ena/browser/api/xml/ASSEMBLY_ACCESSION,xml | ||
ENA,Bioproject,https://www.ebi.ac.uk/ena/browser/api/xml/BIOPROJECT_ACCESSION,xml | ||
ENA,Biosample,https://www.ebi.ac.uk/ena/browser/api/xml/BIOSAMPLE_ACCESSION,xml | ||
ENA,Taxonomy,https://www.ebi.ac.uk/ena/browser/api/xml/TAXONOMY_ID,xml | ||
NCBI,Assembly,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/ASSEMBLY_ACCESSION/dataset_report?filters.exclude_atypical=false&filters.assembly_version=current&chromosomes=1&chromosomes=2&chromosomes=3&chromosomes=X&chromosomes=Y&chromosomes=M,json | ||
NCBI,Taxonomy,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=TAXONOMY_ID,xml | ||
GOAT,Assembly,http://goat.genomehubs.org/api/v2/record?recordId=ASSEMBLY_ACCESSION&result=assembly&taxonomy=ncbi,json | ||
COPO,Biosample,https://copo-project.org/api/sample/biosampleAccession/BIOSAMPLE_ACCESSION?standard=tol&return_type=json,json |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article> | ||
<article> | ||
<body> | ||
<sec> | ||
<title>Species taxonomy</title> | ||
<p>{{ TAX_STRING }}; | ||
<italic>{{ GENUS }}</italic>; | ||
<italic>{{ GENUS_SPECIES }}</italic> ($TAXONOMY_AUTHORITY) (NCBI:txid{{ NCBI_TAXID }}) {{ TEST_NOT_REPLACED }}. | ||
</p> | ||
</sec> | ||
<sec> | ||
<table> | ||
<thead> | ||
<tr> | ||
<th align="center" valign="top">INSDC accession</th> | ||
<th align="center" valign="top">Chromosome</th> | ||
<th align="center" valign="top">Length (Mb)</th> | ||
<th align="center" valign="top">GC%</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
{% for chromosome in CHR_TABLE %} | ||
<tr> | ||
<td align="left" valign="top">{{ chromosome.get('Accession') }}</td> | ||
<td align="center" valign="top">{{ chromosome.get('Chromosome') }}</td> | ||
<td align="center" valign="top">{{ chromosome.get('Length') }}</td> | ||
<td align="center" valign="top">{{ chromosome.get('GC') }}</td> | ||
</tr> | ||
{% endfor %} | ||
</tbody> | ||
</table> | ||
</sec> | ||
</body> | ||
</article> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
sample,datatype,datafile | ||
uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64228e_220617_134154.ccs.bc1015_BAK8B_OA--bc1015_BAK8B_OA.rmdup.subset.bam | ||
uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64016e_220621_193126.ccs.bc1008_BAK8A_OA--bc1008_BAK8A_OA.rmdup.subset.bam | ||
uoEpiScrs1c,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.cram | ||
uoEpiScrs1b,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.bam | ||
ilCerPisi1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/genomic_data/ilCerPisi1/pacbio/m84047_230817_174414_s3.ccs.bc2048.subsampled.bam | ||
ilCerPisi1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/genomic_data/ilCerPisi1/pacbio/m64097e_230309_154741.ccs.bc1012_BAK8A_OA--bc1012_BAK8A_OA.subsampled.bam | ||
ilCerPisi1,hic,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/analysis/ilCerPisi1.1/read_mapping/hic/GCA_963859965.1.unmasked.hic.ilCerPisi2.subsampled.cram |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import os | ||
import sys | ||
import requests | ||
import argparse | ||
|
||
|
||
def parse_args(args=None):
    """Parse the command-line options of this script.

    Args:
        args: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``assembly``,
        ``wgs_biosample``, ``hic_biosample``, ``rna_biosample`` and
        ``output``. The two optional biosamples are ``None`` when omitted.
    """
    description = "Use the genome assembly accession to fetch additional infromation on genome from ENA"
    epilog = "Example usage: python check_parameters.py --assembly --wgs_biosample --output"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("--assembly", required=True, help="The INSDC accession for the assembly")
    parser.add_argument("--wgs_biosample", required=True, help="The biosample accession for the WGS data")
    parser.add_argument("--hic_biosample", required=False, help="The biosample accession for the Hi-C data")
    parser.add_argument("--rna_biosample", required=False, help="The biosample accession for the RNASeq data")
    parser.add_argument("--output", required=True, help="Output file path")
    # Forward the caller-supplied list; previously it was dropped and
    # sys.argv was always parsed, making the parameter a no-op.
    return parser.parse_args(args)
|
||
|
||
def make_dir(path):
    """Create directory *path*, including any missing parent directories.

    An empty path (``""``) or ``None`` is silently ignored, so the result of
    ``os.path.dirname`` on a bare file name can be passed straight in.
    An already existing directory is not an error.

    Args:
        path: Directory to create, as a string or ``os.PathLike`` object.
    """
    # Truthiness instead of len(): path-like objects and None have no len().
    if not path:
        return
    os.makedirs(path, exist_ok=True)
|
||
|
||
# Seconds to wait on each ENA request. requests applies no timeout by
# default, so without this a stalled connection would block indefinitely.
_ENA_TIMEOUT_SECONDS = 60


def _first_ena_record(url):
    """Return the first record of an ENA portal search, or None.

    None is returned when the HTTP status is not 200, when the body is not
    valid JSON, or when the JSON payload is not a non-empty list.
    """
    response = requests.get(url, timeout=_ENA_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return None
    try:
        records = response.json()
    except ValueError:
        # An empty or non-JSON body means nothing usable was returned.
        return None
    if not isinstance(records, list) or not records:
        return None
    return records[0]


def _validate_biosample(label, accession, taxon_id):
    """Raise AssertionError unless biosample *accession* has taxon *taxon_id*.

    *label* is the data type name used in the error messages
    ("WGS", "Hi-C" or "RNASeq").
    """
    url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=sample_accession%3D%22{accession}%22&result=sample&fields=sample_accession%2Ctax_id&limit=0&download=true&format=json"
    record = _first_ena_record(url)
    if record is None:
        raise AssertionError(f"The {label} biosample id: {accession} could not retrieved from ENA\n")

    sample_tax_id = record.get("tax_id")
    if sample_tax_id != taxon_id:
        raise AssertionError(
            f"The {label} biosample taxon id: {sample_tax_id} does not match the assembly taxon id: {taxon_id}\n"
        )


def _validate_optional_biosample(label, accession, taxon_id):
    """Validate an optional biosample and return the value to write out.

    A missing accession (empty, None or the literal string "null") is not
    validated and is reported as "null".
    """
    if not accession or accession == "null":
        return "null"
    _validate_biosample(label, accession, taxon_id)
    return accession


def fetch_assembly_data(assembly, wgs_biosample, hic_biosample, rna_biosample, output_file):
    """Look up an assembly in ENA, validate its biosamples and write a CSV.

    The assembly is queried for its taxon id, scientific name and study; the
    study is queried for its parent study, whose first ``;``-separated entry
    is used as the bioproject. Each supplied biosample must have the same
    taxon id as the assembly.

    Args:
        assembly: Assembly set accession to look up.
        wgs_biosample: Biosample accession of the WGS data (always checked).
        hic_biosample: Biosample accession of the Hi-C data; skipped when
            empty, None or "null".
        rna_biosample: Biosample accession of the RNASeq data; skipped when
            empty, None or "null".
        output_file: Path of the CSV file to write. It gets a header line and
            one data line with the columns assembly, species, taxon_id,
            bioproject, wgs_biosample, hic_biosample, rna_biosample.

    Returns:
        ``output_file``.

    Raises:
        AssertionError: if the assembly, its bioproject or a biosample cannot
            be retrieved, or if a biosample's taxon id differs from the
            assembly's.
    """
    url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=assembly_set_accession%3D%22{assembly}%22&result=assembly&fields=assembly_set_accession%2Ctax_id%2Cscientific_name%2Cstudy_accession&limit=0&download=true&format=json"
    assembly_record = _first_ena_record(url)
    if assembly_record is None:
        raise AssertionError(f"The assemby accession: {assembly} was not found\n")

    taxon_id = assembly_record.get("tax_id")
    scientific_name = assembly_record.get("scientific_name")
    if taxon_id is None or scientific_name is None:
        # Previously surfaced as an AttributeError/TypeError further down.
        raise AssertionError(f"The ENA record for assembly {assembly} has no tax_id or scientific_name\n")
    species = scientific_name.replace(" ", "_")

    # Resolve the bioproject through the parent of the assembly's study.
    study = assembly_record.get("study_accession")
    study_record = None
    if study:
        study_url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=study_accession%3D%22{study}%22&result=study&fields=parent_study_accession&limit=0&download=true&format=json"
        study_record = _first_ena_record(study_url)
    parent_studies = study_record.get("parent_study_accession") if study_record is not None else None
    if parent_studies is None:
        raise AssertionError(f"Could not determine the Bioproject linked to this assembly {assembly}\n")
    bioproject = parent_studies.split(";")[0]

    # Validation order (WGS, Hi-C, RNASeq) decides which error is raised first.
    _validate_biosample("WGS", wgs_biosample, taxon_id)
    hic_value = _validate_optional_biosample("Hi-C", hic_biosample, taxon_id)
    rna_value = _validate_optional_biosample("RNASeq", rna_biosample, taxon_id)

    header = [
        "assembly",
        "species",
        "taxon_id",
        "bioproject",
        "wgs_biosample",
        "hic_biosample",
        "rna_biosample",
    ]
    params = [assembly, species, taxon_id, bioproject, wgs_biosample, hic_value, rna_value]

    with open(output_file, "w") as fout:
        fout.write(",".join(header) + "\n")
        # str() guards the join in case ENA ever returns a non-string value.
        fout.write(",".join(str(value) for value in params) + "\n")

    return output_file
|
||
|
||
def main(args=None):
    """Run the script: parse options, then fetch, validate and write the data.

    Args:
        args: Optional argument list handed to ``parse_args``.
    """
    args = parse_args(args)
    # open() does not create missing parent directories, so make sure the
    # output location exists; a bare file name yields "" and is skipped.
    make_dir(os.path.dirname(args.output))
    fetch_assembly_data(
        args.assembly,
        args.wgs_biosample,
        args.hic_biosample,
        args.rna_biosample,
        args.output,
    )


# Script entry point: main() returns None, so a run that raises no exception
# exits with status 0.
if __name__ == "__main__":
    sys.exit(main())
Oops, something went wrong.