diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 7ebc310e..0c05417c 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-node@v3 - name: Install editorconfig-checker - run: npm install -g editorconfig-checker + run: npm install -g editorconfig-checker@3.0.2 - name: Run ECLint check run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|cff\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') @@ -32,7 +32,7 @@ jobs: - uses: actions/setup-node@v3 - name: Install Prettier - run: npm install -g prettier + run: npm install -g prettier@3.1.0 - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} diff --git a/.nf-core.yml b/.nf-core.yml index 0b4f8190..5de1cda5 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -20,3 +20,4 @@ lint: multiqc_config: - report_comment actions_ci: false + template_strings: False diff --git a/CHANGELOG.md b/CHANGELOG.md index ef651d28..9ed60a41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,36 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[2.0.0](https://github.com/sanger-tol/genomenote/releases/tag/2.0.0)] - English Cocker Spaniel [2024-10-10] + +### Enhancements & fixes + +- New genome_metadata subworkflow to fetch metadata linked to the genome assembly from various sources (COPO, GoaT, GBIF, ENA, NCBI). The options `--assembly`, `--biosample_wgs`, `--biosample_hic` and `--biosample_rna` specify what metadata to fetch and process. +- Now outputs a partially completed genome note document based on a template file which contains placeholder parameters. These placeholders are replaced with data generated by the pipeline. The template file to use can be specified using the `--note_template` option. +- Added the `--write_to_portal` option to write a set of key-value data parameters to a Genome Notes database. +- Added the `--upload_higlass_data` option to automatically upload the Hi-C Map to a kubernetes hosted Hi-Glass server. +- Bugfix: don't rely on fasta file name to correctly set assembly accession needed for use with `ncbi datasets`. +- Bugfix: ensure meta.id is used consistently. + +### Parameters + +| Old parameter | New parameter | +| ------------- | -------------------------- | +| | --assembly | +| | --biosample_wgs | +| | --biosample_hic | +| | --biosample_rna | +| | --write_to_portal | +| | --genome_notes_api | +| | --note_template | +| | --upload_higlass_data | +| | --higlass_url | +| | --higlass_deployment_name | +| | --higlass_namespace | +| | --higlass_kubeconfig | +| | --higlass_upload_directory | +| | --higlass_data_project_dir | + ## [[1.2.2](https://github.com/sanger-tol/genomenote/releases/tag/1.2.2)] - Pyrenean Mountain Dog (patch 2) - [2024-09-10] ### Enhancements & fixes diff --git a/CITATION.cff b/CITATION.cff index 44c3f427..969f1a7c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,16 +2,24 @@ # Visit https://bit.ly/cffinit to generate yours today! cff-version: 1.2.0 -title: sanger-tol/genomenote v1.2.2 +title: sanger-tol/genomenote v2.0.0 message: >- If you use this software, please cite it using the metadata from this file. type: software authors: + - given-names: Sandra + family-names: Babiyre + affiliation: Wellcome Sanger Institute + orcid: "https://orcid.org/0009-0004-7773-7008" - given-names: Tyler family-names: Chafin affiliation: Wellcome Sanger Institute orcid: "https://orcid.org/0000-0001-8687-5905" + - given-names: Chau + family-names: Duong + affiliation: Wellcome Sanger Institute + orcid: "https://orcid.org/0009-0001-0649-2291" - given-names: Matthieu family-names: Muffato affiliation: Wellcome Sanger Institute @@ -38,5 +46,5 @@ identifiers: repository-code: "https://github.com/sanger-tol/genomenote" license: MIT commit: TODO -version: 1.2.2 +version: 2.0.0 date-released: "2022-10-07" diff --git a/README.md b/README.md index ad60ed41..7b324815 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,15 @@ -1. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/)) -2. Convert alignment to BED ([`samtools view`](https://www.htslib.org/doc/samtools-view.html), [`bedtools bamtobed`](https://bedtools.readthedocs.io/en/latest/content/tools/bamtobed.html)) -3. Filter BED ([`GNU sort`](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html), [`filter bed`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/filter_bed.sh)) -4. Contact maps ([`Cooler cload`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-cload-pairs), [`Cooler zoomify`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-zoomify), [`Cooler dump`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-dump)) -5. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org)) -6. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK)) -7. Collated summary table ([`createtable`](bin/create_table.py)) -8. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) +1. Fetches genome metadata from [ENA](https://www.ebi.ac.uk/ena/browser/api/#/ENA_Browser_Data_API), [NCBI](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/rest-api), and [GoaT](https://goat.genomehubs.org/api-docs/) +2. Summary statistics ([`NCBI datasets summary genome accession`](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/summary/genome/datasets_summary_genome_accession/)) +3. Convert alignment to BED ([`samtools view`](https://www.htslib.org/doc/samtools-view.html), [`bedtools bamtobed`](https://bedtools.readthedocs.io/en/latest/content/tools/bamtobed.html)) +4. Filter BED ([`GNU sort`](https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html), [`filter bed`](https://raw.githubusercontent.com/sanger-tol/genomenote/main/bin/filter_bed.sh)) +5. Contact maps ([`Cooler cload`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-cload-pairs), [`Cooler zoomify`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-zoomify), [`Cooler dump`](https://cooler.readthedocs.io/en/latest/cli.html#cooler-dump)) +6. Genome completeness ([`NCBI API`](https://www.ncbi.nlm.nih.gov/datasets/docs/v1/reference-docs/rest-api/), [`BUSCO`](https://busco.ezlab.org)) +7. Consensus quality and k-mer completeness ([`FASTK`](https://github.com/thegenemyers/FASTK), [`MERQURY.FK`](https://github.com/thegenemyers/MERQURY.FK)) +8. Collated summary table ([`createtable`](bin/create_table.py)) +9. Present results and visualisations ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/)) ## Usage @@ -52,6 +53,9 @@ nextflow run sanger-tol/genomenote \ -profile \ --input samplesheet.csv \ --fasta genome.fasta \ + --assembly GCA_922984935.2 \ + --bioproject PRJEB49353 \ + --biosample SAMEA7524400 \ --outdir ``` @@ -69,8 +73,9 @@ sanger-tol/genomenote was originally written by [Priyanka Surana](https://github We thank the following people for their assistance in the development of this pipeline: - [Matthieu Muffato](https://github.com/muffato) +- [Beth Yates](https://github.com/BethYates) - [Shane McCarthy](https://github.com/mcshane) and [Yumi Sims](https://github.com/yumisims) for providing software and algorithm guidance. -- [Cibin Sadasivan Baby](https://github.com/cibinsb) and [Beth Yates](https://github.com/BethYates) for providing reviews. +- [Cibin Sadasivan Baby](https://github.com/cibinsb) for providing reviews. ## Contributions and Support diff --git a/assets/genome_metadata_template.csv b/assets/genome_metadata_template.csv new file mode 100644 index 00000000..5a709c57 --- /dev/null +++ b/assets/genome_metadata_template.csv @@ -0,0 +1,9 @@ +#File_source,File_type,Url,Output_type +ENA,Assembly,https://www.ebi.ac.uk/ena/browser/api/xml/ASSEMBLY_ACCESSION,xml +ENA,Bioproject,https://www.ebi.ac.uk/ena/browser/api/xml/BIOPROJECT_ACCESSION,xml +ENA,Biosample,https://www.ebi.ac.uk/ena/browser/api/xml/BIOSAMPLE_ACCESSION,xml +ENA,Taxonomy,https://www.ebi.ac.uk/ena/browser/api/xml/TAXONOMY_ID,xml +NCBI,Assembly,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/ASSEMBLY_ACCESSION/dataset_report?filters.exclude_atypical=false&filters.assembly_version=current&chromosomes=1&chromosomes=2&chromosomes=3&chromosomes=X&chromosomes=Y&chromosomes=M,json +NCBI,Taxonomy,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=TAXONOMY_ID,xml +GOAT,Assembly,http://goat.genomehubs.org/api/v2/record?recordId=ASSEMBLY_ACCESSION&result=assembly&taxonomy=ncbi,json +COPO,Biosample,https://copo-project.org/api/sample/biosampleAccession/BIOSAMPLE_ACCESSION?standard=tol&return_type=json,json diff --git a/assets/genome_note_template.docx b/assets/genome_note_template.docx new file mode 100644 index 00000000..15b1f94c Binary files /dev/null and b/assets/genome_note_template.docx differ diff --git a/assets/genome_note_template.xml b/assets/genome_note_template.xml new file mode 100644 index 00000000..9c8fa776 --- /dev/null +++ b/assets/genome_note_template.xml @@ -0,0 +1,34 @@ + +
+ + + Species taxonomy +

{{ TAX_STRING }}; + {{ GENUS }}; + {{ GENUS_SPECIES }} ($TAXONOMY_AUTHORITY) (NCBI:txid{{ NCBI_TAXID }}) {{ TEST_NOT_REPLACED }}. +

+
+ + + + + + + + + + + + {% for chromosome in CHR_TABLE %} + + + + + + + {% endfor %} + +
INSDC accessionChromosomeLength (Mb)GC%
{{ chromosome.get('Accession') }}{{ chromosome.get('Chromosome') }}{{ chromosome.get('Length') }}{{ chromosome.get('GC') }}
+
+ +
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 9957e052..06145ed6 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,5 +1,4 @@ sample,datatype,datafile -uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64228e_220617_134154.ccs.bc1015_BAK8B_OA--bc1015_BAK8B_OA.rmdup.subset.bam -uoEpiScrs1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/genomic_data/uoEpiScrs1/pacbio/m64016e_220621_193126.ccs.bc1008_BAK8A_OA--bc1008_BAK8A_OA.rmdup.subset.bam -uoEpiScrs1c,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.cram -uoEpiScrs1b,hic,https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/analysis/uoEpiScrs1.1/read_mapping/hic/GCA_946965045.1.unmasked.hic.uoEpiScrs1.subsampled.bam +ilCerPisi1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/genomic_data/ilCerPisi1/pacbio/m84047_230817_174414_s3.ccs.bc2048.subsampled.bam +ilCerPisi1,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/genomic_data/ilCerPisi1/pacbio/m64097e_230309_154741.ccs.bc1012_BAK8A_OA--bc1012_BAK8A_OA.subsampled.bam +ilCerPisi1,hic,https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/analysis/ilCerPisi1.1/read_mapping/hic/GCA_963859965.1.unmasked.hic.ilCerPisi2.subsampled.cram diff --git a/bin/check_parameters.py b/bin/check_parameters.py new file mode 100755 index 00000000..0e07f531 --- /dev/null +++ b/bin/check_parameters.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +import os +import sys +import requests +import argparse + + +def parse_args(args=None): + Description = "Use the genome assembly accession to fetch additional infromation on genome from ENA" + Epilog = "Example usage: python check_parameters.py --assembly --wgs_biosample --output" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("--assembly", required=True, help="The INSDC accession for the assembly") + parser.add_argument("--wgs_biosample", required=True, help="The biosample accession for the WGS data") + parser.add_argument("--hic_biosample", required=False, help="The biosample accession for the Hi-C data") + parser.add_argument("--rna_biosample", required=False, help="The biosample accession for the RNASeq data") + parser.add_argument("--output", required=True, help="Output file path") + return parser.parse_args() + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def fetch_assembly_data(assembly, wgs_biosample, hic_biosample, rna_biosample, output_file): + url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=assembly_set_accession%3D%22{assembly}%22&result=assembly&fields=assembly_set_accession%2Ctax_id%2Cscientific_name%2Cstudy_accession&limit=0&download=true&format=json" + response = requests.get(url) + + if response.status_code == 200: + assembly_data = response.json() + taxon_id = assembly_data[0].get("tax_id", None) + species = assembly_data[0].get("scientific_name", None).replace(" ", "_") + study = assembly_data[0].get("study_accession", None) + params = [assembly, species, taxon_id] + header = ["assembly", "species", "taxon_id"] + + if study: + study_url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=study_accession%3D%22{study}%22&result=study&fields=parent_study_accession&limit=0&download=true&format=json" + study_response = requests.get(study_url) + + if study_response.status_code == 200: + study_data = study_response.json() + studies = study_data[0].get("parent_study_accession").split(";") + params.append(studies[0]) + header.append("bioproject") + + else: + raise AssertionError(f"Could not determine the Bioproject linked to this assembly {assembly}\n") + else: + raise AssertionError(f"Could not determine the Bioproject linked to this assembly {assembly}\n") + + # Validate wgs_biosample + wgs_url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=sample_accession%3D%22{wgs_biosample}%22&result=sample&fields=sample_accession%2Ctax_id&limit=0&download=true&format=json" + wgs_response = requests.get(wgs_url) + + if wgs_response.status_code == 200: + wgs_data = wgs_response.json() + tax_id = wgs_data[0].get("tax_id") + + if tax_id != taxon_id: + raise AssertionError( + f"The WGS biosample taxon id: {tax_id} does not match the assembly taxon id: {taxon_id}\n" + ) + else: + params.append(wgs_biosample) + header.append("wgs_biosample") + + else: + raise AssertionError(f"The WGS biosample id: {wgs_biosample} could not retrieved from ENA\n") + + # Validate hic_biosample + if hic_biosample and hic_biosample != "null": + print(hic_biosample) + hic_url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=sample_accession%3D%22{hic_biosample}%22&result=sample&fields=sample_accession%2Ctax_id&limit=0&download=true&format=json" + hic_response = requests.get(hic_url) + + if hic_response.status_code == 200: + hic_data = hic_response.json() + hic_tax_id = hic_data[0].get("tax_id") + + if hic_tax_id != taxon_id: + raise AssertionError( + f"The Hi-C biosample taxon id: {hic_tax_id} does not match the assembly taxon id: {taxon_id}\n" + ) + else: + header.append("hic_biosample") + params.append(hic_biosample) + + else: + raise AssertionError(f"The Hi-C biosample id: {hic_biosample} could not retrieved from ENA\n") + else: + header.append("hic_biosample") + params.append("null") + + # Validate rna_biosample + if rna_biosample and rna_biosample != "null": + rna_url = f"https://www.ebi.ac.uk/ena/portal/api/search?query=sample_accession%3D%22{rna_biosample}%22&result=sample&fields=sample_accession%2Ctax_id&limit=0&download=true&format=json" + rna_response = requests.get(rna_url) + + if rna_response.status_code == 200: + rna_data = rna_response.json() + rna_tax_id = rna_data[0].get("tax_id") + + if rna_tax_id != taxon_id: + raise AssertionError( + f"The RNASeq biosample taxon id: {rna_tax_id} does not match the assembly taxon id: {taxon_id}\n" + ) + else: + header.append("rna_biosample") + params.append(rna_biosample) + + else: + raise AssertionError(f"The RNASeq biosample id: {rna_biosample} could not retrieved from ENA\n") + + else: + header.append("rna_biosample") + params.append("null") + + with open(output_file, "w") as fout: + # Write header + fout.write(",".join(header) + "\n") + fout.write(",".join(params) + "\n") + + return output_file + else: + raise AssertionError(f"The assemby accession: {assembly} was not found\n") + + +def main(args=None): + args = parse_args(args) + hic_biosample = args.hic_biosample + rna_biosample = args.rna_biosample + fetch_assembly_data( + args.assembly, + args.wgs_biosample, + hic_biosample, + rna_biosample, + args.output, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py new file mode 100755 index 00000000..fc82bfb6 --- /dev/null +++ b/bin/combine_parsed_data.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +import csv +import os +import sys +import argparse +import string +import numbers + +files = [ + ("ENA_ASSEMBLY", "ena_assembly_file"), + ("ENA_BIOPROJECT", "ena_bioproject_file"), + ("ENA_BIOSAMPLE", "ena_biosample_wgs_file"), + ("ENA_BIOSAMPLE_HIC", "ena_biosample_hic_file"), + ("ENA_BIOSAMPLE_RNA", "ena_biosample_rna_file"), + ("ENA_TAXONOMY", "ena_taxonomy_file"), + ("NCBI_ASSEMBLY", "ncbi_assembly_file"), + ("NCBI_TAXONOMY", "ncbi_taxonomy_file"), + ("GOAT_ASSEMBLY", "goat_assembly_file"), + ("COPO_BIOSAMPLE", "copo_biosample_wgs_file"), + ("COPO_BIOSAMPLE_HIC", "copo_biosample_hic_file"), + ("COPO_BIOSAMPLE_RNA", "copo_biosample_rna_file"), + ("GBIF_TAXONOMY", "gbif_taxonomy_file"), +] + + +def parse_args(args=None): + Description = "Combined the parsed data files from each of the genome meta data sources." + Epilog = "Example usage: python parse_xml_ena_bioproject.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("--ena_assembly_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--ena_bioproject_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--ena_biosample_wgs_file", help="Input parsed ENA genomic biosample file.", required=False) + parser.add_argument("--ena_biosample_hic_file", help="Input parsed ENA HiC biosample file.", required=False) + parser.add_argument("--ena_biosample_rna_file", help="Input parsed ENA RNASeq biosample file.", required=False) + parser.add_argument("--ena_taxonomy_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--ncbi_assembly_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--ncbi_taxonomy_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--goat_assembly_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--copo_biosample_wgs_file", help="Input parsed COPO genomic biosample file.", required=False) + parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False) + parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False) + parser.add_argument("--gbif_taxonomy_file", help="Input parsed GBIF taxonomy file.", required=False) + parser.add_argument("--out_consistent", help="Output file.", required=True) + parser.add_argument("--out_inconsistent", help="Output file.", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def process_file(file_in, params): + with open(file_in, mode="r") as infile: + reader = csv.reader(infile) + source_dict = {} + for row in reader: + if row[0] == "#paramName": + continue + + key = row.pop(0) + value = row[0] + + if any(p in string.punctuation for p in value): + value = '"' + value + '"' + + source_dict[key] = value + + if key in params: + params[key].append(value) + else: + params[key] = [value] + + return (params, source_dict) + + +def main(args=None): + args = parse_args(args) + params = {} + param_sets = {} + params_inconsistent = {} + locs = ["COLLECTION_LOCATION", "HIC_COLLECTION_LOCATION", "RNA_COLLECTION_LOCATION"] + + for file in files: + # check if file exists skip if not + if getattr(args, file[1]) is None: + continue + + (params, paramDict) = process_file(getattr(args, file[1]), params) + param_sets[file[0]] = paramDict + + for key in params.keys(): + value_set = {v for v in params[key]} + + # Handle collection locations having county provided by some data sources but not others + # use longer of the two location strings + if (key in locs) and len(value_set) == 2: + (loc_a, loc_b) = sorted(value_set, key=len) + if loc_b.find(loc_a): + params[key] = [loc_b] + + if len(value_set) != 1: + params_inconsistent[key] = [] + + for source in param_sets: + if key in param_sets[source]: + params_inconsistent[key].append((source, param_sets[source][key])) + + # Strip inconsistent data from parameter list + for i in params_inconsistent.keys(): + # Don't remove locations from consistent file if one is a substring of the other, longest string is returned + if (i in locs) and len(params[i]) == 1: + continue + else: + params.pop(i) + + # Write out file where data is consistent across different sources + if len(params) > 0: + with open(args.out_consistent, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + # add in data source for consistent_params + for key in sorted(params): + key_sources = [] + for source in param_sets: + if key in param_sets[source]: + key_sources.append(source) + source_list = "|".join(key_sources) + fout.write(key + "," + params[key][0] + ',"' + source_list + '"\n') + + # Write out file where data is inconsistent across different sources + with open(args.out_inconsistent, "w") as fout: + fout.write(",".join(["#paramName", "source|paramValue"]) + "\n") + if len(params_inconsistent) > 0: + for key in sorted(params_inconsistent): + fout.write(key + ",") + pairs = [] + for value in params_inconsistent[key]: + pair = "|".join(value) + pairs.append(pair) + + fout.write(",".join(pairs) + "\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/combine_statistics_data.py b/bin/combine_statistics_data.py new file mode 100755 index 00000000..2e48c1dc --- /dev/null +++ b/bin/combine_statistics_data.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import csv +import os +import sys +import argparse +import string + +files = [ + ("CONSISTENT", "in_consistent"), + ("STATISITCS", "in_statistics"), +] + + +def parse_args(args=None): + Description = "Combined the parsed data file from the genome metadata subworkflow with the parsed data file from the genome statistics subworkflow." + Epilog = "Example usage: python combine_statistics.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("--in_consistent", help="Input consistent params file.", required=True) + parser.add_argument("--in_inconsistent", help="Input consistent params file.", required=True) + parser.add_argument("--in_statistics", help="Input parsed genome statistics params file.", required=True) + parser.add_argument("--out_consistent", help="Output file.", required=True) + parser.add_argument("--out_inconsistent", help="Output file.", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def process_file(file_in, file_type, params, param_sets): + with open(file_in, mode="r") as infile: + reader = csv.reader(infile) + + for row in reader: + if row[0] == "#paramName": + continue + + key = row.pop(0) + source_values = [] + + if param_sets.get(key): + source_values = param_sets.get(key) + + if file_type == "CONSISTENT": + sources = row[1].split("|") + else: + sources = ["STATISICS"] + + value = row[0] + + if key == "CHR_TABLE": + value = ",".join(row) + + elif any(p in string.punctuation for p in value): + value = '"' + value + '"' + + for source in sources: + source_values.append([source, value]) + + param_sets[key] = source_values + + if key in params: + params[key].append(value) + else: + params[key] = [value] + + return (params, param_sets) + + +def process_inconsistent_file(file, params, inconsistent, consistent): + # Add inconsistent data from metadata_inconsistent_file + with open(file, mode="r") as infile: + reader = csv.reader(infile) + + for row in reader: + if row[0] == "#paramName": + continue + else: + key = row.pop(0) + + if consistent.get(key) is None: + inconsistent[key] = row + + return inconsistent + + +def main(args=None): + args = parse_args(args) + params = {} + param_sets = {} + params_inconsistent = {} + + for file in files: + (params, param_sets) = process_file(getattr(args, file[1]), file[0], params, param_sets) + + for key in params.keys(): + value_set = {v for v in params[key]} + if len(value_set) != 1: + params_inconsistent[key] = [] + + if key in param_sets: + for pair in param_sets[key]: + pair_str = pair[0] + "|" + pair[1] + params_inconsistent[key].append(pair_str) + + # Strip inconsitent data from parameter list + for i in params_inconsistent.keys(): + params.pop(i) + + # combine inconsisent params and add in original data source + params_inconsistent = process_inconsistent_file(args.in_inconsistent, param_sets, params_inconsistent, params) + + # Write out file where data is consistent across different sources + if len(params) > 0: + with open(args.out_consistent, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for key in sorted(params): + fout.write(key + "," + params[key][0] + "\n") + + # Write out file where data is inconsistent across different sources + with open(args.out_inconsistent, "w") as fout: + fout.write(",".join(["#paramName", "source|paramValue"]) + "\n") + if len(params_inconsistent) > 0: + for key in sorted(params_inconsistent): + fout.write(key + ",") + fout.write(",".join(params_inconsistent[key]) + "\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/create_table.py b/bin/create_table.py index 31ab9663..0f3e861c 100755 --- a/bin/create_table.py +++ b/bin/create_table.py @@ -104,13 +104,14 @@ def ncbi_stats(genome_in, seq_in, writer): for mol in seq: if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear": if not chromosome_header: - writer.writerow(["##Chromosome", "Length", "GC_Percent"]) + writer.writerow(["##Chromosome", "Length", "GC_Percent", "Accession"]) chromosome_header = True writer.writerow( [ mol.get("chr_name", math.nan), - round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan, + "%.2f" % (mol.get("length", 0) / 1000000) if mol.get("length") is not None else math.nan, mol.get("gc_percent", math.nan), + mol.get("genbank_accession"), ] ) @@ -118,13 +119,14 @@ def ncbi_stats(genome_in, seq_in, writer): for mol in seq: if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear": if not organelle_header: - writer.writerow(["##Organelle", "Length", "GC_Percent"]) + writer.writerow(["##Organelle", "Length", "GC_Percent", "Accession"]) organelle_header = True writer.writerow( [ mol.get("assigned_molecule_location_type", math.nan), - round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan, + "%.2f" % (mol.get("length", 0) / 1000) if mol.get("length") is not None else math.nan, mol.get("gc_percent", math.nan), + mol.get("genbank_accession"), ] ) @@ -142,9 +144,17 @@ def extract_busco(file_in, writer): lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan) results_summary = data.get("results", {}).get("one_line_summary", math.nan) + results_complete = data.get("results", {}).get("Complete", math.nan) + results_single_copy = data.get("results", {}).get("Single copy", math.nan) + results_duplicated = data.get("results", {}).get("Multi copy", math.nan) + results_n_markers = data.get("results", {}).get("n_markers", math.nan) writer.writerow(["##BUSCO", lineage_dataset_name]) writer.writerow(["Summary", results_summary]) + writer.writerow(["Complete", results_complete]) + writer.writerow(["Single", results_single_copy]) + writer.writerow(["Duplicated", results_duplicated]) + writer.writerow(["Number_Orthologs", f"{results_n_markers:,}"]) def extract_pacbio(qv, completeness, writer): diff --git a/bin/fetch_gbif_metadata.py b/bin/fetch_gbif_metadata.py new file mode 100755 index 00000000..ccac54cd --- /dev/null +++ b/bin/fetch_gbif_metadata.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +import os +import sys +import requests +import argparse + + +def parse_args(args=None): + Description = "Parse contents of an ENA Taxonomy report and pull out metadata required by a genome note." + Epilog = "Example usage: python fetch_gbif_metadata.py --genus --species --output" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("--species", required=True, help="The species name") + parser.add_argument("--output", required=True, help="Output file path") + return parser.parse_args() + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def fetch_gbif_data(genus, species, output_file): + match_url = f"https://api.gbif.org/v1/species/match?verbose=true&genus={genus}&species={species}" + response = requests.get(match_url) + + if response.status_code == 200: + match_data = response.json() + usage_key = match_data.get("usageKey") + + if usage_key: + species_url = f"https://api.gbif.org/v1/species/{usage_key}" + species_response = requests.get(species_url) + + if species_response.status_code == 200: + species_data = species_response.json() + + # Metadata fields to extract + metadata_fields = { + "PHYLUM": "phylum", + "CLASS": "class", + "ORDER": "order", + "FAMILY": "family", + "GENUS": "genus", + "SPECIES": "species", + "GENUS_SPECIES": "canonicalName", + "COMMON_NAME": "vernacularName", + "TAXONOMY_AUTHORITY": "authorship", + } + + param_list = [] + + # Retrieve the required fields and create parameter pairs + for key, json_key in metadata_fields.items(): + value = species_data.get(json_key) + if value: + # Special handling for TAXONOMY_AUTHORITY to clean up the value + if key == "TAXONOMY_AUTHORITY": + value = value.strip() + # Wrap the authorship in quotes + value = f'"{value}"' # Enclose the value in quotes + + param_list.append((key, value)) + + # Check if there is any data to write + if len(param_list) > 0: + out_dir = os.path.dirname(output_file) + make_dir(out_dir) # Create directory if it does not exist + + with open(output_file, "w") as fout: + # Write header + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + return output_file + + return "Metadata not found." + + +def main(args=None): + args = parse_args(args) + (genus, species) = args.species.split("_") + fetch_gbif_data(genus, species, args.output) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/generate_higlass_link.py b/bin/generate_higlass_link.py new file mode 100755 index 00000000..350689ce --- /dev/null +++ b/bin/generate_higlass_link.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import csv +import requests + + +def parse_args(args=None): + Description = "Parse contents of an ENA Assembly report and pul out meta data required by a genome note." + Epilog = "Example usage: python generate_higlass_link.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_NAME", help="Prefix file name for the project.") + parser.add_argument("MAP_UUID", help="UUID for the .mcool file tileset.") + parser.add_argument("GRID_UUID", help="UUID for the .genome file tileset.") + parser.add_argument("HIGLASS_SERVER", help="Higlass server url") + parser.add_argument("GENOME_FILE", help="Input .genome file") + parser.add_argument("OUTPUT_FILE", help="Output .csv file") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def calculate_genome_size(file_in): + # calculate total genome length by adding all contig/scaffold lengths in the .genome file + genome_length = 0 + with open(file_in) as csvfile: + reader = csv.reader(csvfile, delimiter="\t") + for row in reader: + genome_length += int(row[1]) + + return genome_length + + +def check_viewconfig_exists(higlass_server, file_name): + # Use HiGlass API to see if a viewconfig matching the file_name already exists on the server + headers = {"Content-Type": "application/json"} + params = {"d": file_name} + response = requests.get(f"{higlass_server}/api/v1/viewconfs/l/", params=params, headers=headers) + if response: + return True + return False + + +def request_viewconfig(higlass_server, file_name, map_uuid, grid_uuid, genome_length): + # define viewconfig, "contents" array should contain a section for each filetype. + # uid of viewconfig should match the file_name + request_data = { + "uid": file_name, + "viewconf": { + "editable": True, + "zoomFixed": False, + "trackSourceServers": ["/api/v1"], + "exportViewUrl": "/api/v1/viewconfs/", + "views": [ + { + "tracks": { + "top": [], + "left": [], + "center": [ + { + "uid": "", + "type": "combined", + "contents": [ + { + "filetype": "cooler", + "server": f"{higlass_server}/api/v1", + "tilesetUid": map_uuid, + "uid": "", + "type": "heatmap", + "options": { + "heatmapValueScaling": "linear", + "valueScaleMin": 0.0, + "ValueScaleMax": 20.0, + }, + }, + { + "filetype": "chromsizes-tsv", + "server": f"{higlass_server}/api/v1", + "tilesetUid": grid_uuid, + "uid": "", + "type": "2d-chromosome-grid", + "options": {"lineStrokeWidth": 1, "lineStrokeColor": "grey"}, + "width": 20, + "height": 20, + }, + ], + "width": 1583, + "height": 788, + } + ], + "right": [], + "bottom": [], + }, + "initialXDomain": [0, genome_length], + "initialYDomain": [0, genome_length], + "layout": {"w": 12, "h": 12, "x": 0, "y": 0, "i": "", "moved": False, "static": False}, + } + ], + "zoomLocks": {"locksByViewUid": {}, "locksDict": {}}, + "locationLocks": {"locksByViewUid": {}, "locksDict": {}}, + "valueScaleLocks": {"locksByViewUid": {}, "locksDict": {}}, + }, + } + + headers = {"Content-Type": "application/json"} + + response = requests.post(f"{higlass_server}/api/v1/viewconfs/", json=request_data, headers=headers) + + if response: + viewconf_uid = response.json()["uid"] + url = f"{higlass_server}/l/?d=" + viewconf_uid + return url + else: + error_str = "ERROR: Posting view config failed" + print(error_str) + sys.exit(1) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_output(url, file_out): + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["HIGLASS_URL", url]) + "\n") + + +def main(args=None): + args = parse_args(args) + + # total genome length is required when creating viewconfig + length = calculate_genome_size(args.GENOME_FILE) + + # file name is used as the uid for the view config, it can't contain a "." + file_name = args.FILE_NAME.replace(".", "_") + + # check if already have a viewconfig matching the file name + exists = check_viewconfig_exists(args.HIGLASS_SERVER, file_name) + if exists: + # return existing viewconfig url + url = f"{args.HIGLASS_SERVER}/l/?d={file_name}" + else: + # create a new viewconfig and return the url + url = request_viewconfig(args.HIGLASS_SERVER, file_name, args.MAP_UUID, args.GRID_UUID, length) + + print_output(url, args.OUTPUT_FILE) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_csv_genome_summary.py b/bin/parse_csv_genome_summary.py new file mode 100755 index 00000000..20590fbe --- /dev/null +++ b/bin/parse_csv_genome_summary.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +import argparse +import os +import csv +import json +import sys +import string +import numbers + +param_lookup = { + "Accession": "ASSEMBLY_ACCESSION", + "Organism_Name": "GENUS_SPECIES", + "ToL_ID": "TOLID", + "Taxon_ID": "NCBI_TAXID", + "Assembly_Name": "ASSEMBLY_ID", + "Life_Stage": "LIFESTAGE", + "Sex": "SAMPLE_SEX", + "Total_Sequence": "GENOME_LENGTH", + "Chromosomes": "CHROMOSOME_NUMBER", + "Scaffolds": "SCAFF_NUMBER", + "Scaffold_N50": "SCAFF_N50", + "Contigs": "CONTIG_NUMBER", + "Contig_N50": "CONTIG_N50", + "Mitochondrion": "MITO_SIZE", + "##BUSCO": "BUSCO_REF", + "Summary": "BUSCO_STRING", + "Complete": "BUSCO_C", + "Single": "BUSCO_S", + "Duplicated": "BUSCO_D", + "Number_Orthologs": "BUSCO_N", + "QV": "QV", + "Completeness": "KMER", +} + + +def parse_args(args=None): + Description = "Parse contents of the Genome Summary CSV file produced by the Genome Statistics subworkflow and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_json_ncbi_assembly.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input CSV Genome Summary file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check csv file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check csv file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check csv file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_csv(file_in, file_out): + param_list = [] + with open(file_in) as csvfile: + reader = csv.reader(csvfile) + + for row in reader: + if row[0] in param_lookup: + key = param_lookup.get(row[0]) + param = row[1] + + if key == "BUSCO_STRING": + busco = param.replace("[", ":").split(":") + param_list.append(["BUSCO", busco[1]]) + + elif key == "GENOME_LENGTH": + param = str("%.2f" % (int(param) * 1e-6)) # convert to Mbp, 2 decimal places + + elif key == "SCAFF_N50" or key == "CONTIG_N50": + param = str("%.1f" % (int(param) * 1e-6)) # convert to Mbp, 1 decimal place + + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + if len(param) != 0: + param_list.append([key, param]) + + elif row[0] == "##Chromosome": + chrs = [] + chr_list_complete = 0 + + while chr_list_complete < 1: + chr_row = reader.__next__() + + if chr_row[0].startswith("##"): + chr_list_complete = 1 + sorted_chrs = sorted(chrs, key=lambda d: d["Length"], reverse=True) + param_list.append(["LONGEST_SCAFF", sorted_chrs[0].get("Length")]) + json_chrs = json.dumps(chrs) + param_list.append(["CHR_TABLE", json_chrs]) + + else: + chrs.append( + {"Chromosome": chr_row[0], "Length": chr_row[1], "GC": chr_row[2], "Accession": chr_row[3]} + ) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_csv(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py new file mode 100755 index 00000000..912a31e3 --- /dev/null +++ b/bin/parse_json_copo_biosample.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import sys +import string +import numbers + +fetch = [ + ("SPECIMEN_ID", ("SPECIMEN_ID",)), + ("BIOSAMPLE_ACCESSION", ("biosampleAccession",)), + ("GENUS_SPECIES", ("SCIENTIFIC_NAME",)), + ("COMMON_NAME", ("COMMON_NAME",)), + ("COLLECTORS", ("COLLECTED_BY",)), + ("COLLECTOR_INSTITUTE", ("COLLECTOR_AFFILIATION",)), + ("COLLECTOR_DATE", ("DATE_OF_COLLECTION",)), + ("COLLECTION_METHOD", ("DESCRIPTION_OF_COLLECTION_METHOD",)), + ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), + ("LATITUDE", ("DECIMAL_LATITUDE",)), + ("LONGITUDE", ("DECIMAL_LONGITUDE",)), + ("HABITAT", ("HABITAT",)), + ("IDENTIFIER", ("IDENTIFIED_BY",)), + ("IDENTIFIER_INSTITUTE", ("IDENTIFIER_AFFILIATION",)), + ("PRESERVATION_METHOD", ("PRESERVATION_APPROACH",)), + ("SYMBIONT", ("SYMBIONT",)), + ("NCBI_TAXID", ("TAXON_ID",)), + ("ORDER", ("ORDER_OR_GROUP",)), + ("FAMILY", ("FAMILY",)), + ("GENUS", ("GENUS",)), + ("SEX", ("SEX",)), + ("LIFESTAGE", ("LIFESTAGE",)), + ("ORGANISM_PART", ("ORGANISM_PART",)), + ("GAL", ("GAL",)), +] + + +def parse_args(args=None): + Description = "Parse contents of a COPO json file report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_json_copo_biosample.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input JSON Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check json file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check json file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check json file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_json(file_in, file_out): + try: + with open(file_in, "r") as f: + data = json.load(f) + + except Exception as e: + print_error(f"Failed to read JSON file. Error: {e}") + + if data["number_found"] == 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + return + + elif data["number_found"] >> 1: + print_error("More than one record found") + + else: + record = data["data"] + + # Extract biosample type from FILE_OUT + biosample_type = None + if "HIC" in file_out.upper(): + biosample_type = "HIC" + elif "RNA" in file_out.upper(): + biosample_type = "RNA" + + param_list = [] + + for data in record: + for f in fetch: + param = find_element(data, f[1], index=0) + if param is not None: + # Preprocess some values to standardise their format + if f[0] == "GAL": + param = param.title() + + elif f[0] == "LIFESTAGE": + param = param.lower() + + # pre-process collection location + elif f[0] == "COLLECTION_LOCATION": + location_list = param.split("|") + location_list.reverse() + param = ", ".join(loc.title().strip() for loc in location_list) + + # organism part and preservation methods should be in lower case + elif f[0] == "ORGANISM_PART" or f[0] == "PRESERVATION_METHOD": + param = param.lower() + + if isinstance(param, numbers.Number): + param = str(param) + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + # Prefix parameter name if biosample type is COPO + param_name = f[0] + if biosample_type in ["HIC", "RNA"]: + param_name = f"{biosample_type}_{param_name}" + param_list.append([param_name, param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def find_element(data, fields, index=0): + if index < len(fields): + key = fields[index] + if key in data: + sub_data = data[key] + if type(sub_data) in [list, dict]: + return find_element(sub_data, fields, index + 1) + return sub_data + else: + return None + return None + + +def main(args=None): + args = parse_args(args) + parse_json(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_json_goat_assembly.py b/bin/parse_json_goat_assembly.py new file mode 100755 index 00000000..a387e17c --- /dev/null +++ b/bin/parse_json_goat_assembly.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import sys +import string +import numbers + +fetch = [ + ("BIOPROJECT_ACCESSION", ("record", "attributes", "bioproject", "value"), {"index": 0}), + ( + "ASSEMBLY_ACCESSION", + ( + "record", + "assembly_id", + ), + ), + ("COMMON_NAME", ("record", "taxon_names"), {"class": "genbank common name"}), + ("GENUS_SPECIES", ("record", "taxon_names"), {"class": "scientific name"}), + ("PHYLUM", ("record", "lineage"), {"taxon_rank": "phylum"}), + ("CLASS", ("record", "lineage"), {"taxon_rank": "class"}), + ("ORDER", ("record", "lineage"), {"taxon_rank": "order"}), + ("FAMILY", ("record", "lineage"), {"taxon_rank": "family"}), + ("TRIBE", ("record", "lineage"), {"taxon_rank": "tribe"}), + ("GENUS", ("record", "lineage"), {"taxon_rank": "genus"}), + ("NCBI_TAXID", ("record", "taxon_id")), + ("PROJECT_BIOSAMPLE_ACCESSION", ("record", "attributes", "biosample", "value")), + ("SAMPLE_SEX", ("record", "attributes", "sample_sex", "value")), + ("GENOME_LENGTH", ("record", "attributes", "assembly_span", "value")), + ("SCAFF_NUMBER", ("record", "attributes", "scaffold_count", "value")), + ("SCAFF_N50", ("record", "attributes", "scaffold_n50", "value")), + ("CHROMOSOME_NUMBER", ("record", "attributes", "chromosome_count", "value")), + ("CONTIG_NUMBER", ("record", "attributes", "contig_count", "value")), + ("CONTIG_N50", ("record", "attributes", "contig_n50", "value")), + ("PERC_ASSEM", ("record", "attributes", "assigned_percent", "value")), +] + + +def parse_args(args=None): + Description = "Parse contents of an NCBI Assembly report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_json_ncbi_assembly.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input JSON Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check json file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check json file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check json file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_json(file_in, file_out): + json_file = open(file_in) + data = json.load(json_file) + + param_list = [] + + if len(data["records"]) != 1: + print_error("More than one record found") + + for f in fetch: + attribs = None + if len(f) == 3: + attribs = f[2] + + param = find_element(data["records"][0], f[1], attribs, param_list, index=0) + + if param is not None: + if f[0] == "GENOME_LENGTH": + param = str("%.2f" % (int(param) * 1e-6)) # convert to Mbp, 2 decimal places + + elif f[0] == "SCAFF_N50" or f[0] == "CONTIG_N50": + param = str("%.1f" % (int(param) * 1e-6)) # convert to Mbp 1 decimal place + + elif f[0] == "PERC_ASSEM": + param = str("%.2f" % param) + + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def find_element(data, fields, attribs, param_list, index=0): + if type(data) == list: + # we have a list to iterate + if "class" in attribs.keys(): + for item in data: + if item["class"] == attribs["class"]: + return item["name"] + + elif "taxon_rank" in attribs.keys(): + for item in data: + if item["taxon_rank"] == attribs["taxon_rank"]: + return item["scientific_name"] + + elif "index" in attribs.keys(): + index = attribs["index"] + return data[attribs["index"]] + + elif "bioprojects" in attribs.keys(): + bioproject_key = None + + for param in param_list: + if param[0] == "BIOPROJECT_ACCESSION": + bioproject_key = param[1] + + bioprojects = data[0]["bioprojects"] + for project in bioprojects: + if project["accession"] == bioproject_key: + if project["parent_accessions"] != None and len(project["parent_accessions"]) == 1: + if project["title"] != None: + return project["title"] + + else: + # fields either not found or we don't yet handle parsing it + pass + + else: + if fields[index] in data: + sub_data = data[fields[index]] + if type(sub_data) == list or type(sub_data) == dict: + return find_element(sub_data, fields, attribs, param_list, index=index + 1) + return sub_data + else: + # Don't have the field so it is an error or missing + # print(f'We could not find {fields[index]}') + pass + + +def main(args=None): + args = parse_args(args) + parse_json(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_json_ncbi_assembly.py b/bin/parse_json_ncbi_assembly.py new file mode 100755 index 00000000..4c9250e7 --- /dev/null +++ b/bin/parse_json_ncbi_assembly.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import sys +import string +import numbers + +fetch = [ + ("TOLID", ("assembly_info", "biosample", "attributes"), {"name": "tolid"}), + ("ASSEMBLY_ID", ("assembly_info", "assembly_name")), + ("SPECIMEN_ID", ("assembly_info", "biosample", "attributes"), {"name": "specimen id"}), + ("BIOPROJECT_ACCESSION", ("assembly_info", "bioproject_accession")), + ("BIOPROJECT_TITLE", ("assembly_info", "bioproject_lineage"), {"bioprojects": "test"}), + ("ASSEMBLY_ACCESSION", ("accession",)), + ("ALT_HAP_ACCESSION", ("assembly_info", "linked_assemblies"), {"linked_assembly": None}), + ("GAL", ("assembly_info", "biosample", "attributes"), {"name": "GAL"}), + ("COLLECTORS", ("assembly_info", "biosample", "attributes"), {"name": "collected_by"}), + ("COLLECTOR_INSTITUTE", ("assembly_info", "biosample", "attributes"), {"name": "collecting institution"}), + ("COLLECTOR_DATE", ("assembly_info", "biosample", "attributes"), {"name": "collection_date"}), + ("IDENTIFIER", ("assembly_info", "biosample", "attributes"), {"name": "identified_by"}), + ("IDENTIFIER_INSTITUTE", ("assembly_info", "biosample", "attributes"), {"name": "identifier_affiliation"}), + ("IDENTIFIER", ("assembly_info", "biosample", "attributes"), {"name": "identified_by"}), + ("COMMON_NAME", ("assembly_info", "biosample", "attributes"), {"name": "common name"}), + ("GENUS_SPECIES", ("organism", "organism_name")), + ("NCBI_TAXID", ("organism", "tax_id")), + ("SAMPLE_SEX", ("assembly_info", "biosample", "attributes"), {"name": "sex"}), + ( + "COLLECTION_LOCATION", + ("assembly_info", "biosample", "attributes"), + {"name": "geographic location (region and locality)"}, + ), + ("LATITUDE", ("assembly_info", "biosample", "attributes"), {"name": "geographic location (latitude)"}), + ("LONGITUDE", ("assembly_info", "biosample", "attributes"), {"name": "geographic location (longitude)"}), + ("HABITAT", ("assembly_info", "biosample", "attributes"), {"name": "habitat"}), + ("PROJECT_BIOSAMPLE_ACCESSION", ("assembly_info", "biosample", "accession")), + ("PROJECT_ORGANISM_PART", ("assembly_info", "biosample", "attributes"), {"name": "tissue"}), + ("GENOME_LENGTH", ("assembly_stats", "total_sequence_length")), + ("LR_COV", ("assembly_stats", "genome_coverage")), + ("CHROMOSOME_NUMBER", ("assembly_stats", "total_number_of_chromosomes")), + ("SCAFF_NUMBER", ("assembly_stats", "number_of_scaffolds")), + ("CONTIG_NUMBER", ("assembly_stats", "number_of_contigs")), + ("CONTIG_N50", ("assembly_stats", "contig_n50")), +] + + +def parse_args(args=None): + Description = "Parse contents of an NCBI Assembly report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_json_ncbi_assembly.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input JSON Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check json file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check json file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check json file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_json(file_in, file_out): + json_file = open(file_in) + data = json.load(json_file) + param_list = [] + + if len(data["reports"]) != 1: + print_error("More than one report found") + + for f in fetch: + attribs = None + if len(f) == 3: + attribs = f[2] + + param = find_element(data["reports"][0], f[1], attribs, param_list, index=0) + + if param is not None: + # Preprocess some values to standardise their format + if f[0] == "GAL": + param = param.title() + + elif f[0] == "COLLECTION_LOCATION": + location_list = param.split("|") + location_list.reverse() + param = ", ".join(loc.title().strip() for loc in location_list) + + elif f[0] == "GENOME_LENGTH": + param = str("%.2f" % (int(param) * 1e-6)) # convert to Mbp 2 decimal places + + elif f[0] == "SCAFF_N50" or f[0] == "CONTIG_N50": + param = str("%.1f" % (int(param) * 1e-6)) # convert to Mbp 1 decimal place + + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def find_element(data, fields, attribs, param_list, index=0): + if type(data) == list: + # we have a list to iterate + if "name" in attribs.keys(): + for item in data: + if item["name"] == attribs["name"]: + return item["value"] + + elif "bioprojects" in attribs.keys(): + bioproject_key = None + + for param in param_list: + if param[0] == "BIOPROJECT_ACCESSION": + bioproject_key = param[1] + + bioprojects = data[0]["bioprojects"] + for project in bioprojects: + if project["accession"] == bioproject_key: + if project["parent_accessions"] != None and len(project["parent_accessions"]) == 1: + if project["title"] != None: + return project["title"] + + elif "linked_assembly" in attribs.keys(): + linked_assembly = [] + for assembly in data: + linked_assembly.append(assembly["linked_assembly"]) + + return ", ".join(linked_assembly) + + else: + # fields either not found or we don't yet handle parsing it + pass + + else: + if fields[index] in data: + sub_data = data[fields[index]] + if type(sub_data) == list or type(sub_data) == dict: + return find_element(sub_data, fields, attribs, param_list, index=index + 1) + return sub_data + else: + # Don't have the field so it is an error or missing + # print(f'We could not find {fields[index]}') + pass + + +def main(args=None): + args = parse_args(args) + parse_json(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_xml_ena_assembly.py b/bin/parse_xml_ena_assembly.py new file mode 100755 index 00000000..822ab58d --- /dev/null +++ b/bin/parse_xml_ena_assembly.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import xml.etree.ElementTree as ET +import string +import numbers + +fetch = [ + ("ASSEMBLY_ID", ["ASSEMBLY"], ("attrib", "alias")), + ("BIOPROJECT_ACCESSION", ["ASSEMBLY", "STUDY_REF", "IDENTIFIERS", "PRIMARY_ID"]), + ("ASSEMBLY_ACCESSION", ["ASSEMBLY", "IDENTIFIERS", "PRIMARY_ID"]), + ("NCBI_TAXID", ["ASSEMBLY", "TAXON", "TAXON_ID"]), + ("COMMON_NAME", ["ASSEMBLY", "TAXON", "COMMON_NAME"]), + ("GENUS_SPECIES", ["ASSEMBLY", "TAXON", "SCIENTIFIC_NAME"]), + ("PROJECT_BIOSAMPLE_ACCESSION", ["ASSEMBLY", "SAMPLE_REF", "IDENTIFIERS", "PRIMARY_ID"]), + ("GENOME_LENGTH", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='total-length']//", "VALUE")), + ("SCAFF_NUMBER", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='scaffold-count']//", "VALUE")), + ("SCAFF_N50", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='n50']//", "VALUE")), + ("GAP_COUNT", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='spanned-gaps']//", "VALUE")), + ("CHROMOSOME_NUMBER", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='replicon-count']//", "VALUE")), + ("CONTIG_NUMBER", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='count-contig']//", "VALUE")), + ("CONTIG_N50", ["ASSEMBLY", "ASSEMBLY_ATTRIBUTES"], ("tag", ".//*[TAG='contig-n50']//", "VALUE")), +] + + +def parse_args(args=None): + Description = "Parse contents of an ENA Assembly report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_xml_ena_assembly.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input XML Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check xml file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check xml file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check xml file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_xml(file_in, file_out): + tree = ET.parse(file_in) + root = tree.getroot() + param_list = [] + + for f in fetch: + param = None + r = root + max_depth = len(f[1]) + fn = len(f) + i = 0 + + for tag in f[1]: + i += 1 + + r = r.find(tag) + ## Handle cases where parameter is not available for this assembly + if r is None: + break + + if i == max_depth: + ## Handle more complex cases where not just fetching text for an element + if fn == 3: + ## Fetch specific attribute for a given element + if f[2][0] == "attrib": + try: + param = r.attrib.get(f[2][1]) + except ValueError: + param = None + + ## Fetch paired tag-value elements from a parent, where tag is specified and value is wanted + elif f[2][0] == "tag": + r = r.findall(f[2][1]) + for child in r: + if child.tag == f[2][2]: + param = child.text + + ## format return values + if param is not None: + if f[0] == "SPECIMEN_ID": + param = param.split(".")[0] + elif f[0] == "ASSEMBLY_ID": + param = param.split(" ")[0] + elif f[0] == "CHROMOSOME_NUMBER": + ra = root.findall("./ASSEMBLY/ASSEMBLY_ATTRIBUTES/ASSEMBLY_ATTRIBUTE") + for child in ra: + if child.find("TAG").text == "count-non-chromosome-replicon": + non_chrs = child.find("VALUE").text + param = str(int(param) - int(non_chrs)) + elif f[0] == "GENOME_LENGTH": + param = str("%.2f" % (int(param) * 1e-6)) # convert to Mbp, 2 decimal place + + elif f[0] == "SCAFF_N50" or f[0] == "CONTIG_N50": + param = str("%.1f" % (int(param) * 1e-6)) # convert to Mbp, 1 decimal place + + else: + try: + param = r.text + except ValueError: + param = None + + if param is not None: + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_xml(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_xml_ena_bioproject.py b/bin/parse_xml_ena_bioproject.py new file mode 100755 index 00000000..a9b40596 --- /dev/null +++ b/bin/parse_xml_ena_bioproject.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import xml.etree.ElementTree as ET +import string +import numbers + +fetch = [ + ("ENA_BIOPROJECT_ACCESSION", ["PROJECT"], ("attrib", "accession")), + ("ENA_BIOPROJECT_TITLE", ["PROJECT", "TITLE"]), + ("ENA_FIRST_PUBLIC", ["PROJECT", "PROJECT_ATTRIBUTES"], ("tag", ".//*[TAG='ENA-FIRST-PUBLIC']//", "VALUE")), +] + + +def parse_args(args=None): + Description = "Parse contents of an ENA Bioproject report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_xml_ena_bioproject.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input XML Bioproject file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check xml file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check xml file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check xml file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_xml(file_in, file_out): + tree = ET.parse(file_in) + root = tree.getroot() + param_list = [] + + for f in fetch: + param = None + r = root + max_depth = len(f[1]) + fn = len(f) + i = 0 + + for tag in f[1]: + i += 1 + r = r.find(tag) + + if r is None: + ## Handle cases where parameter is not available for this PROJECT + break + + if i == max_depth: + ## Handle more complex cases where not just fetching text for an element + if fn == 3: + ## Fetch specific attribute for a given element + if f[2][0] == "attrib": + try: + param = r.attrib.get(f[2][1]) + except ValueError: + param = None + + ## Fetch paired tag-value elements from a parent, where tag is specified and value is wanted + elif f[2][0] == "tag": + r = r.findall(f[2][1]) + for child in r: + if child.tag == f[2][2]: + param = child.text + + else: + try: + param = r.text + except ValueError: + param = None + + if param is not None: + if f[0] == "ENA_FIRST_PUBLIC": + param = param.split("-")[0] + + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_xml(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_xml_ena_biosample.py b/bin/parse_xml_ena_biosample.py new file mode 100755 index 00000000..5ce07d8c --- /dev/null +++ b/bin/parse_xml_ena_biosample.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import xml.etree.ElementTree as ET +import string +import numbers + +fetch = [ + ("GAL", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='GAL']//", "VALUE")), + ("SPECIMEN_ID", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='specimen id']//", "VALUE")), + ("COLLECTORS", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='collected by']//", "VALUE")), + ("COLLECTOR_INSTITUTE", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='collecting institution']//", "VALUE")), + ( + "COLLECTION_LOCATION", + ["SAMPLE", "SAMPLE_ATTRIBUTES"], + ("tag", ".//*[TAG='geographic location (region and locality)']//", "VALUE"), + ), + ("IDENTIFIER", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='identified by']//", "VALUE")), + ("IDENTIFIER_INSTITUTE", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='identifier_affiliation']//", "VALUE")), + ("COMMON_NAME", ["SAMPLE", "SAMPLE_NAME", "COMMON_NAME"]), + ("GENUS_SPECIES", ["SAMPLE", "SAMPLE_NAME", "SCIENTIFIC_NAME"]), + ("NCBI_TAXID", ["SAMPLE", "SAMPLE_NAME", "TAXON_ID"]), + ("SAMPLE_SEX", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='sex']//", "VALUE")), + ( + "COLLECTION_LOCATION", + ["SAMPLE", "SAMPLE_ATTRIBUTES"], + ("tag", ".//*[TAG='geographic location (region and locality)']//", "VALUE"), + ), + ( + "COLLECTION_DATE", + ["SAMPLE", "SAMPLE_ATTRIBUTES"], + ("tag", ".//*[TAG='collection date']//", "VALUE"), + ), + ("LATITUDE", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='geographic location (latitude)']//", "VALUE")), + ("LONGITUDE", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='geographic location (longitude)']//", "VALUE")), + ("HABITAT", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='habitat']//", "VALUE")), + ("BIOSAMPLE_ACCESSION", ["SAMPLE"], ("attrib", "accession")), + ("ORGANISM_PART", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='organism part']//", "VALUE")), + ("TOLID", ["SAMPLE", "SAMPLE_ATTRIBUTES"], ("tag", ".//*[TAG='tolid']//", "VALUE")), +] + + +def parse_args(args=None): + Description = "Parse contents of an ENA SAMPLE report and pull out meta data required by a genome note." + Epilog = "Example usage: python parse_xml_ena_sample.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input XML SAMPLE file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check xml file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check xml file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check xml file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_xml(file_in, file_out): + tree = ET.parse(file_in) + root = tree.getroot() + param_list = [] + + # Extract biosample type from FILE_OUT + biosample_type = None + if "HIC" in file_out.upper(): + biosample_type = "HIC" + elif "RNA" in file_out.upper(): + biosample_type = "RNA" + + for f in fetch: + param = None + r = root + max_depth = len(f[1]) + fn = len(f) + i = 0 + + for tag in f[1]: + i += 1 + + r = r.find(tag) + ## Handle cases where parameter is not available for this SAMPLE + if r is None: + break + + if i == max_depth: + ## Handle more complex cases where not just fetching text for an element + if fn == 3: + ## Fetch specific attribute for a given element + if f[2][0] == "attrib": + try: + param = r.attrib.get(f[2][1]) + except ValueError: + param = None + ## Count child elements with specific tag + elif f[2][0] == "count": + if r is not None: + param = str(len(r.findall(f[2][1]))) if len(r.findall(f[2][1])) != 0 else None + else: + param = None + + ## Fetch paired tag-value elements from a parent, where tag is specified and value is wanted + elif f[2][0] == "tag": + r = r.findall(f[2][1]) + for child in r: + if child.tag == f[2][2]: + param = child.text + + else: + try: + param = r.text + except ValueError: + param = None + + if param is not None: + # Preprocess some values to standardise their format + if f[0] == "GAL": + param = param.title() + + # pre-process collection location + elif f[0] == "COLLECTION_LOCATION": + location_list = param.split("|") + location_list.reverse() + param = ", ".join(loc.title().strip() for loc in location_list) + + # organism part should be in lower case + elif f[0] == "ORGANISM_PART": + param = param.lower().replace("_", " ") + + # Convert ints and floats to str to allow for params with punctuation to be quoted + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + # Prefix parameter name if biosample type is HiC or RNA + param_name = f[0] + if biosample_type in ["HIC", "RNA"]: + param_name = f"{biosample_type}_{param_name}" + param_list.append([param_name, param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_xml(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_xml_ena_taxonomy.py b/bin/parse_xml_ena_taxonomy.py new file mode 100755 index 00000000..c99755be --- /dev/null +++ b/bin/parse_xml_ena_taxonomy.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import xml.etree.ElementTree as ET + +fetch = { + "phylum": ["PHYLUM"], + "class": ["CLASS"], + "order": ["ORDER"], + "family": ["FAMILY"], + "tribe": ["TRIBE"], + "genus": ["GENUS"], +} + + +def parse_args(args=None): + Description = "Parse contents of an ENA Taxonomy report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_xml_ena_taxonomy.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input XML Taxonomy file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check xml file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check xml file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check xml file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_xml(file_in, file_out): + tree = ET.parse(file_in) + root = tree.getroot() + param_list = [] + + taxon = root.find("taxon") + common_name = taxon.get("commonName") + scientific_name = taxon.get("scientificName") + taxon_id = taxon.get("taxId") + + if common_name is not None: + param_list.append(["COMMON_NAME", common_name]) + if scientific_name is not None: + param_list.append(["GENUS_SPECIES", scientific_name]) + + tax_string = [] + lineage = root.find("taxon/lineage") + for child in lineage: + name = child.get("scientificName") + if name is not None and name != "root": + tax_string.append(name) + + rank = child.get("rank") + if rank is not None: + if rank in fetch: + if name is not None: + fetch[rank].append(name) + + for f in fetch: + if len(fetch[f]) > 1: + param_list.append([fetch[f][0], fetch[f][1]]) + + if taxon_id is not None: + param_list.append(["NCBI_TAXID", taxon_id]) + + tax_string.reverse() + tax_string.pop(0) + full_taxonomy = "; ".join(tax_string) + + param_list.append(["TAX_STRING", '"' + full_taxonomy + '"']) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_xml(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/parse_xml_ncbi_taxonomy.py b/bin/parse_xml_ncbi_taxonomy.py new file mode 100755 index 00000000..74f19608 --- /dev/null +++ b/bin/parse_xml_ncbi_taxonomy.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import xml.etree.ElementTree as ET + +fetch = [ + ("COMMON_NAME", ["Taxon", "OtherNames", "GenbankCommonName"]), + ("GENUS_SPECIES", ["Taxon", "Scientific_Name"]), + ("NCBI_TAXID", ["Taxon", "TaxId"]), + ("TAX_STRING", ["Taxon", "Lineage"]), + ("PHYLUM", ["Taxon", "LineageEx"], ("Taxon", "Rank", "phylum", "ScientificName")), + ("CLASS", ["Taxon", "LineageEx"], ("Taxon", "Rank", "class", "ScientificName")), + ("ORDER", ["Taxon", "LineageEx"], ("Taxon", "Rank", "order", "ScientificName")), + ("FAMILY", ["Taxon", "LineageEx"], ("Taxon", "Rank", "family", "ScientificName")), + ("TRIBE", ["Taxon", "LineageEx"], ("Taxon", "Rank", "tribe", "ScientificName")), + ("GENUS", ["Taxon", "LineageEx"], ("Taxon", "Rank", "genus", "ScientificName")), +] + + +def parse_args(args=None): + Description = "Parse contents of a NCBI taxonomy report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_xml_ncbi_taxonomy.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input XML Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check xml file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check xml file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check xml file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_xml(file_in, file_out): + tree = ET.parse(file_in) + root = tree.getroot() + param_list = [] + + for f in fetch: + r = root + max_depth = len(f[1]) + fn = len(f) + i = 0 + param = None + + for tag in f[1]: + i += 1 + + r = r.find(tag) + ## Handle cases where parameter is not available for this assembly + if r is None: + break + + if i == max_depth: + ## Handle more complex cases where not just fetching text for an element + if fn == 3: + ## Fetch rank and scientific name from a parent taxon, where rank is specified and scientific_name is wanted + if f[2][0] == "Taxon": + rank_found = 0 + r = r.findall(f[2][0]) + for child in r: + c = child.find(f[2][1]) + if c.text == f[2][2]: + rank_found = 1 + name = child.find(f[2][3]) + if name is not None: + param = name.text + else: + param = None + + if rank_found == 0: + param = None + + else: + try: + param = r.text + except ValueError: + param = None + + if param is not None: + ## format return values + if f[0] == "TAX_STRING": + # remove first element from tax string + params = param + params = params.split("; ") + params.pop(0) + tax_str = "; ".join(params) + param = '"' + tax_str + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + parse_xml(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/populate_genome_note_template.py b/bin/populate_genome_note_template.py new file mode 100755 index 00000000..ff97a5f3 --- /dev/null +++ b/bin/populate_genome_note_template.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import csv +import jinja2 +import json +from docxtpl import DocxTemplate + + +def parse_args(args=None): + Description = "" + Epilog = "" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("PARAM_FILE", help="Input parameters CSV file.") + parser.add_argument("TEMPLATE_FILE", help="Input Genome Note Template file.") + parser.add_argument("TEMPLATE_TYPE", help="Input Genome Note Template file type.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.1") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def write_file(template, type, file_out): + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + if type == "docx": + template.save(os.path.join(out_dir, file_out)) + else: + with open(file_out, "w") as fout: + fout.write(template) + + +def build_param_list(param_file): + with open(param_file, "r") as infile: + reader = csv.reader(infile) + + mydict = {} + locs = ["COLLECTION_LOCATION", "HIC_COLLECTION_LOCATION", "RNA_COLLECTION_LOCATION"] + collectors = ["COLLECTORS", "HIC_COLLECTORS", "RNA_COLLECTORS"] + inst_collectors = ["COLLECTOR_INSTITUTE", "HIC_COLLECTOR_INSTITUTE", "RNA_COLLECTOR_INSTITUTE"] + identifiers = ["IDENTIFIER", "HIC_IDENTIFIER", "RNA_IDENTIFIER"] + inst_identifier = ["IDENTIFIER_INSTITUTE", "HIC_IDENTIFIER_INSTITUTE", "RNA_IDENTIFIER_INSTITUTE"] + + for row in reader: + key = row.pop(0) + value = row[0] + if key == "CHR_TABLE": + value = ",".join(row) + json_chrs = json.loads(value) + value = json_chrs + + elif key == "ORGANISM_PART": + value = value.lower() + + elif key in identifiers or key in inst_identifier: + value = value.replace("|", ",") + value = value.lower().title() + value = value.replace("At", "at") + value = value.replace("Of", "of") + value = value.replace("The", "the") + + elif key in collectors or key in inst_collectors or key in locs: + value = value.replace("|", ",") + value = value.lower().title() + value = value.replace("At", "at") + value = value.replace("Of", "of") + value = value.replace("The", "the") + + # Set URLS for BTK + elif key == "ASSEMBLY_ACCESSION": + # Base BTK URL + btk_url = "https://blobtoolkit.genomehubs.org/view/GCA/dataset/GCA" + btk_url = btk_url.replace("GCA", value) + + mydict["BTK_SNAIL_URL"] = btk_url + "/snail" + mydict["BTK_BLOB_URL"] = btk_url + "/blob" + mydict["BTK_CUMULATIVE_URL"] = btk_url + "/cumulative" + + mydict[key] = value + + authors = [] + seen = set() + + for i_key in ( + "IDENTIFIER", + "HIC_IDENTIFIER", + "RNA_IDENTIFIER", + "COLLECTORS", + "HIC_COLLECTORS", + "RNA_COLLECTORS", + ): + item = mydict.get(i_key) + if item: + for i in item.split(","): + i = i.strip() + if i not in seen: + authors.append(i) + seen.add(i) + + mydict["AUTHORS"] = ", ".join(authors).strip() + + assembly_acc = mydict.get("ASSEMBLY_ACCESSION") + if assembly_acc: + btk_busco_url = "https://blobtoolkit.genomehubs.org/view/GCA/dataset/GCA/busco" + btk_busco_url = btk_busco_url.replace("GCA", assembly_acc) + mydict["BTK_BUSCO_URL"] = btk_busco_url + + return mydict + + +def populate_template(param_file, template_file, template_type, file_out): + myenv = jinja2.Environment(undefined=jinja2.DebugUndefined) + context = build_param_list(param_file) + if template_type == "docx": + template = DocxTemplate(template_file) + template.render(context, myenv) + write_file(template, template_type, file_out) + else: + with open(template_file, "r") as file: + data = file.read() + + template = myenv.from_string(data) + content = template.render(context) + write_file(content, template_type, file_out) + + +def main(args=None): + args = parse_args(args) + populate_template(args.PARAM_FILE, args.TEMPLATE_FILE, args.TEMPLATE_TYPE, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/upload_higlass_file.sh b/bin/upload_higlass_file.sh new file mode 100755 index 00000000..2e22550d --- /dev/null +++ b/bin/upload_higlass_file.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +pod_name=$1 +project_name=$2 +file_name=$3 +file_type=$4 +file_ext=$5 +uid=$6 +assembly=$7 + +# add file type as a suffix to the tileset uid +uid="${uid}_${file_type}" + +file_upload="${file_name}_${file_type}" + +# Check to see if a tileset with the same name already exists and delete it if so +tilesets=$(kubectl exec $pod_name -- python /home/higlass/projects/higlass-server/manage.py list_tilesets | (grep $file_upload || [ "$?" == "1" ] ) | awk '{print substr($NF, 1, length($NF)-1)}') + +for f in $tilesets; do + echo "Deleting $f" + kubectl exec $pod_name -- python /home/higlass/projects/higlass-server/manage.py delete_tileset --uuid $f +done + +# Upload the file +echo "Loading ${file_upload}${file_ext} file" + +if [[ $file_ext == '.mcool' ]] +then + kubectl exec $pod_name -- python /home/higlass/projects/higlass-server/manage.py ingest_tileset --filename /higlass-temp${project_name}/${file_name}.mcool --filetype cooler --datatype matrix --project-name ${project_name} --name ${file_upload} --uid ${uid} +elif [[ $file_ext == '.genome' ]] +then + kubectl exec $pod_name -- python /home/higlass/projects/higlass-server/manage.py ingest_tileset --filename /higlass-temp${project_name}/${file_name}.genome --filetype chromsizes-tsv --datatype chromsizes --coordSystem ${assembly}_assembly --project-name ${project_name} --name ${file_upload} --uid ${uid} +fi + +echo "$file_upload loaded" + diff --git a/bin/write_to_genome_notes_db.py b/bin/write_to_genome_notes_db.py new file mode 100755 index 00000000..30bece44 --- /dev/null +++ b/bin/write_to_genome_notes_db.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +import sys +import argparse +import csv +from tol.api_client import ApiDataSource, ApiObject +from tol.core import DataSourceFilter + + +def parse_args(args=None): + Description = "" + Epilog = "" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("PARAM_FILE", help="Input parameters CSV file.") + parser.add_argument("TOL_API_URL", help="URL for Genome Notes API") + parser.add_argument("TOL_API_KEY", help="Key for using ToL API Client") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check file -> {}\n{}: '{}'".format(error, context.strip(), context_str.strip()) + else: + error_str = "ERROR: Please check file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def build_param_list(param_file): + with open(param_file, "r") as infile: + reader = csv.reader(infile) + mydict = {rows[0]: rows[1] for rows in reader} + return mydict + + +def fetch_ads(url, key): + ads = ApiDataSource({"url": url, "key": key}) + return ads + + +def write_to_db(param_file, url, key): + params = build_param_list(param_file) + ads = fetch_ads(url, key) + + assembly_accession = params.get("ASSEMBLY_ACCESSION") if params.get("ASSEMBLY_ACCESSION") else None + species_name = params.get("GENUS_SPECIES") if params.get("GENUS_SPECIES") else None + + # Check species exists, and add if missing + if species_name: + species_exists = ads.get_list("species", DataSourceFilter(exact={"scientific_name": species_name})) + + if len(list(species_exists)) != 0: + species = species_exists[0] + else: + species = ApiObject("species", None, attributes={"scientific_name": params.get("GENUS_SPECIES")}) + + species = ads.create(species) + else: + print_error("No GENUS_SPECIES found!", "File: {}".format(param_file)) + + # check the template has been defined, add if not and fetch value + template_exists = ads.get_list( + "template", DataSourceFilter(exact={"name": "WOR_Standard", "journal": "Wellcome Open Research"}) + ) + + if len(list(template_exists)) != 0: + template = template_exists[0] + else: + template = ApiObject( + "template", + None, + attributes={"name": "WOR_Standard", "journal": "Wellcome Open Research", "template_body": ""}, + ) + + template = ads.create(template) + + if assembly_accession: + filter = DataSourceFilter(exact={"accession": assembly_accession}) + assembly_exists = ads.get_list("assembly", object_filters=filter) + + # retrieve assembly info from database or add if not already there + if len(list(assembly_exists)) != 0: + assembly = assembly_exists[0] + else: + assembly_name = params.get("SPECIMEN_ID") if params.get("SPECIMEN_ID") else None + taxon_id = params.get("NCBI_TAXID") if params.get("NCBI_TAXID") else None + + assembly = ApiObject( + "assembly", + None, + attributes={"accession": assembly_accession, "name": assembly_name, "taxon_id": taxon_id}, + ) + + assembly = ads.create(assembly) + + for parameter in params: + if parameter == "#paramName": + continue + + parameter_value = params.get(parameter) + + parameter_class_exists = ads.get_list( + "parameter_classes", DataSourceFilter(exact={"name": parameter, "jats": parameter}) + ) + + if len(list(parameter_class_exists)) != 0: + parameter_class = parameter_class_exists[0] + else: + parameter_class = ApiObject( + "parameter_classes", None, attributes={"name": parameter, "jats": parameter} + ) + + parameter_class = ads.create(parameter_class) + + parameter_exists = ads.get_list( + "parameters", + DataSourceFilter( + exact={ + "parameter_class_id": parameter_class.id, + "value": parameter_value, + "assembly_accession": assembly.accession, + } + ), + ) + + if len(list(parameter_exists)) != 0: + parameters = parameter_exists[0] + parameters.value = parameter_value + ads.update(parameters) + + else: + parameters = ApiObject( + "parameters", + None, + attributes={"value": parameter_value, "assembly_accession": assembly_accession}, + relationships={"parameter_classes": parameter_class}, + ) + + parameters = ads.create(parameters) + + template_parameter_exists = ads.get_list( + "template_parameters", + DataSourceFilter( + exact={ + "template_id": template.id, + "parameter_id": parameters.id, + } + ), + ) + + if len(list(template_parameter_exists)) != 0: + template_parameter = template_parameter_exists[0] + + else: + template_parameter = ApiObject( + "template_parameters", + None, + attributes={"required": True}, + relationships={"template": template, "parameters": parameters}, + ) + + template_parameter = ads.create(template_parameter) + + +def main(args=None): + args = parse_args(args) + write_to_db(args.PARAM_FILE, args.TOL_API_URL, args.TOL_API_KEY) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index ec011e2f..c9c3aa9c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -45,7 +45,7 @@ process { publishDir = [ path: { "${params.outdir}/contact_maps" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : "${params.assembly}_" + filename } ] } @@ -83,7 +83,7 @@ process { withName: CREATETABLE { publishDir = [ - path: { "${params.outdir}/genome_statistics" }, + path: { "${params.outdir}/genome_note" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -106,4 +106,21 @@ process { ] } + withName: COMBINE_STATISTICS_AND_METADATA { + publishDir = [ + path: { "${params.outdir}/genome_note" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: POPULATE_TEMPLATE { + memory = { check_max( 100.MB * task.attempt, 'memory' ) } + publishDir = [ + path: { "${params.outdir}/genome_note" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/conf/test.config b/conf/test.config index 9ac78d0a..a8591f5e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,8 +23,28 @@ params { input = "${projectDir}/assets/samplesheet.csv" // Fasta references - fasta = "https://tolit.cog.sanger.ac.uk/test-data/Epithemia_sp._CRS-2021b/assembly/release/uoEpiScrs1.1/insdc/GCA_946965045.1.fasta.gz" + fasta = "https://tolit.cog.sanger.ac.uk/test-data/Ceramica_pisi/assembly/release/ilCerPisi1.1/insdc/GCA_963859965.1.fasta.gz" // Reducing the k-mer size to speed FastK/Merqury a little bit, but also decrease the memory consumption kmer_size = 11 + + // Input data for genome_metadata subworkflow + assembly = 'GCA_963859965.1' + biosample_wgs = 'SAMEA112198456' + biosample_hic = 'SAMEA112198479' + biosample_rna = 'SAMEA112232914' + + // Genome Notes Portal + write_to_portal = false + genome_notes_api = "https://notes-staging.tol.sanger.ac.uk/api/v1" + note_template = "${projectDir}/assets/genome_note_template.docx" + + // HiGlass Options + upload_higlass_data = false + higlass_url = "http://genome-note-higlass.tol-dev.sanger.ac.uk" + higlass_deployment_name = "higlass-app-genome-note" + higlass_namespace = "tol-higlass-genome-note" + higlass_kubeconfig = "~/.kube/config.tol-it-dev-k8s" + higlass_upload_directory = "/lustre/scratch123/tol/share/genome-note-higlass/data_to_load" + higlass_data_project_dir = "/darwin/insects" } diff --git a/conf/test_full.config b/conf/test_full.config index 672cbaa5..3eace25e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,4 +27,25 @@ params { // Need to be set to avoid overfilling /tmp use_work_dir_as_temp = true + + // Input data for genome_metadata subworkflow + assembly = 'GCA_934047225.1' + biosample_wgs = 'SAMEA7519929' + biosample_hic = 'SAMEA7519968' + biosample_rna = null + + // Genome Notes Portal + write_to_portal = false + genome_notes_api = "https://notes-staging.tol.sanger.ac.uk/api/v1" + note_template = "${projectDir}/assets/genome_note_template.docx" + + + // HiGlass Options + upload_higlass_data = false + higlass_url = "http://genome-note-higlass.tol-dev.sanger.ac.uk" + higlass_upload_directory = "/lustre/scratch123/tol/share/genome-note-higlass/data_to_load" + higlass_data_project_dir = "/darwin/insects" + higlass_deployment_name = "higlass-app-genome-note" + higlass_namespace = "tol-higlass-genome-note" + higlass_kubeconfig = "~/.kube/config.tol-it-dev-k8s" } diff --git a/docs/output.md b/docs/output.md index 3b1d1bc8..f742e4cc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -37,8 +37,11 @@ This pipeline collates (1) assembly information, statistics and chromosome detai
Output files -- `genome_statistics/` +- `genome_note/` - `.csv`: collate genome statistics file + - `.{docx|xml}`: partially completed genome note template file + - `_genome_note_consistent.csv`: a file of genome metadata parameters pulled from various public data repositories where all source agree on the paramter value. + - `_genome_note_inconsistent.csv`: a file of genome metadata parameters, and their sources pulled from various public data repositories where the paramter value differs between data sources.
@@ -84,7 +87,6 @@ Results generated by MultiQC collate pipeline from supported tools e.g. BUSCO. T - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - - + [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/usage.md b/docs/usage.md index 068840d2..529ec0a3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,10 +10,22 @@ The [sanger-tol/genomenote](https://pipelines.tol.sanger.ac.uk/genomenote) pipel These typically include: -1. Assembly information, statistics and chromosome details from NCBI datasets. -2. Genome completeness from BUSCO. -3. Consensus quality and k-mer completeness from MerquryFK - when high-quality reads are available. -4. Hi-C contact map and chromosomal grid using Cooler, as well as primary mapped percentage from samtools flagstat - when Hi-C reads are provided. These files can be displayed on a [HiGlass](http://higlass.io) server, like the one use by the [Sanger Institute](https://genome-note-higlass.tol.sanger.ac.uk/app). +1. Assembly metadata from COPO, ENA, GoaT, GBIF and NCBI +2. Assembly information, statistics and chromosome details from NCBI datasets. +3. Genome completeness from BUSCO. +4. Consensus quality and k-mer completeness from MerquryFK - when high-quality reads are available. +5. Hi-C contact map and chromosomal grid using Cooler, as well as primary mapped percentage from samtools flagstat - when Hi-C reads are provided. These files can be displayed on a [HiGlass](http://higlass.io) server, like the one use by the [Sanger Institute](https://genome-note-higlass.tol.sanger.ac.uk/app). + +## Genome metadata input + +You will need to supply the assembly accession for the genome you would like to analyse along with the biosample acession(s) linked to this genome assembly. + +```bash + --assembly '[assembly accession]' + --biosample_wgs '[biosample accession of the biosample used to produce the genomic sequence]' + --biosample_hic '[biosample accession of the biosample used to produce the HiC data]' + --biosample_rna '[biosample accession of the biosample used to produce the RNASeq data] +``` ## Samplesheet input @@ -58,7 +70,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir --fasta genome.fasta -profile docker +nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir --fasta genome.fasta --assembly GCA_922984935.2 --biosample_wgs SAMEA7524400 -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -90,6 +102,8 @@ input: './samplesheet.csv' outdir: './results/' fasta: './genome.fasta' input: 'data' +assembly: 'GCA_922984935.2' +biosample_wgs: 'SAMEA7524400' <...> ``` @@ -215,3 +229,35 @@ We recommend adding the following line to your environment to limit this (typica ```bash NXF_OPTS='-Xms1g -Xmx4g' ``` + +## For internal Sanger use only + +If you wish to run the optional step that writes genome metatdata key value-pairs to a genome notes databases you will need to set the parameter "write_to_portal" to true and provide the base url for the REST API that writes to the database. + +```bash + --write_to_portal_db 'true' + --genome_notes_api '[URL for Genome Notes Portal API]' +``` + +You will also need to set a nextflow secret to store the API key belonging to your user. + +```bash + nextflow secrets set TOL_API_KEY '[API key]' +``` + +If you want to populate a genome notes template file with the key-value pairs generated by this pipeline you will need to pass the path to the template file as the "note_template" parameter. Templates may be either docx or xml format. + +```bash + --note_template '[URL for Genome Notes Portal API]' +``` + +If you wish to run the optional step that writes the .mcool and .genome files produced by the contact_maps subworkflow to a kubernetes hosted higlass server you will need to set the parameter "upload_higlass_data" to true and provide the configuration information for the kubernetes deployment. + +```bash + --upload_higlass_data 'true' + --higlass_upload_directory '[Path to ingress directory for kubernetes]' + --higlass_data_project_dir '[Directory structure to be used for Higlass data, suggestions is to use //]' + --higlass_deployment_name '[ Name of Higlass Deployment in kubernetes]' + --higlass_namespace '[Name of the namespace used for Higlass Deployment in Kubernetes]' + --higlass_kubeconfig '[path to kubeconfig file]' +``` diff --git a/modules/local/combine_metadata.nf b/modules/local/combine_metadata.nf new file mode 100644 index 00000000..69dbfcde --- /dev/null +++ b/modules/local/combine_metadata.nf @@ -0,0 +1,44 @@ +process COMBINE_METADATA { + tag "${meta.id}" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(file_list) + + output: + tuple val (meta), path("${meta.id}_consistent.csv"), emit: consistent + tuple val (meta), path("${meta.id}_inconsistent.csv"), emit: inconsistent + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = [] + def prefix = task.ext.prefix ?: meta.id + for (item in file_list){ + def file = item + def file_ext = item.getExtension() + def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}").toLowerCase() + "_file" + args.add(file_name) + args.add(file) + } + + """ + combine_parsed_data.py \\ + ${args.join(" ")} \\ + --out_consistent ${prefix}_consistent.csv \\ + --out_inconsistent ${prefix}_inconsistent.csv + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_parsed_data.py: \$(combine_parsed_data.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/combine_statistics_and_metadata.nf b/modules/local/combine_statistics_and_metadata.nf new file mode 100644 index 00000000..05b03ace --- /dev/null +++ b/modules/local/combine_statistics_and_metadata.nf @@ -0,0 +1,39 @@ +process COMBINE_STATISTICS_AND_METADATA { + tag "${meta.id}" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(consistent_params) + tuple val(meta2), path(inconsistent_params) + tuple val(meta3), path(statistics_params) + + output: + tuple val (meta), path("${meta.id}_genome_note_consistent.csv") , emit: consistent + tuple val (meta), path("${meta.id}_genome_note_inconsistent.csv") , emit: inconsistent + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: meta.id + + """ + combine_statistics_data.py \\ + --in_consistent $consistent_params \\ + --in_inconsistent $inconsistent_params \\ + --in_statistics $statistics_params \\ + --out_consistent ${prefix}_genome_note_consistent.csv \\ + --out_inconsistent ${prefix}_genome_note_inconsistent.csv \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_statistics_data.py: \$(combine_statistics_data.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/fetch_gbif_metadata.nf b/modules/local/fetch_gbif_metadata.nf new file mode 100755 index 00000000..9a6e92c7 --- /dev/null +++ b/modules/local/fetch_gbif_metadata.nf @@ -0,0 +1,35 @@ +process FETCH_GBIF_METADATA { + tag "$assembly" + label 'process_single' + + conda "conda-forge::python=3.9.1" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0': + 'quay.io/biocontainers/requests:2.26.0'}" + + input: + tuple val(assembly), val(species) + + + output: + path "*.csv", emit: file_path + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def script_name = "fetch_gbif_metadata.py" + def output_file = "${assembly}_gbif_taxonomy.csv" + + """ + $script_name --species $species --output $output_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + requests: \$(python -c 'import requests; print(requests.__version__)') + END_VERSIONS + """ +} diff --git a/modules/local/generate_higlass_link.nf b/modules/local/generate_higlass_link.nf new file mode 100644 index 00000000..306c39d0 --- /dev/null +++ b/modules/local/generate_higlass_link.nf @@ -0,0 +1,41 @@ +process GENERATE_HIGLASS_LINK { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::requests=2.26.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0': + 'biocontainers/requests:2.26.0' }" + + input: + val(file_name) + val(map_uuid) + val(grid_uuid) + val(server) + tuple val(meta), path(genome) + + output: + tuple val(meta), path("${meta.id}_higlass_link.csv"), emit: higlass_link + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/ + def prefix = task.ext.prefix ?: meta.id + """ + generate_higlass_link.py \\ + $file_name \\ + $map_uuid \\ + $grid_uuid \\ + $server \\ + $genome \\ + ${prefix}_higlass_link.csv + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_higlass_link.py: \$(generate_higlass_link.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/params_check.nf b/modules/local/params_check.nf new file mode 100644 index 00000000..a7230cab --- /dev/null +++ b/modules/local/params_check.nf @@ -0,0 +1,32 @@ +process PARAMS_CHECK { + tag "$assembly" + label 'process_single' + + conda "conda-forge::python=3.10.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0': + 'quay.io/biocontainers/requests:2.26.0'}" + + input: + tuple val(assembly), val(wgs_biosample), val(hic_biosample), val(rna_biosample) + + output: + path '*.csv', emit: csv + path "versions.yml", emit: versions + + script: + """ + check_parameters.py \\ + --assembly $assembly \\ + --wgs_biosample $wgs_biosample \\ + --hic_biosample $hic_biosample \\ + --rna_biosample $rna_biosample \\ + --output ${assembly}_valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} + diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf new file mode 100644 index 00000000..0f895a86 --- /dev/null +++ b/modules/local/parse_metadata.nf @@ -0,0 +1,37 @@ + +process PARSE_METADATA { + tag "${meta.ext}|${meta.source}|${meta.type}" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(json) + + output: + tuple val(meta), path("*.csv") , emit: file_path + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + + script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/ + def prefix = task.ext.prefix ?: meta.id + def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py" + def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : "" + def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${is_biosample}.csv".strip('_') + """ + $script_name \\ + $json \\ + $output_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/populate_template.nf b/modules/local/populate_template.nf new file mode 100644 index 00000000..2900cd0c --- /dev/null +++ b/modules/local/populate_template.nf @@ -0,0 +1,36 @@ +process POPULATE_TEMPLATE { + tag "${meta.id}" + label 'process_single' + + + conda "conda-forge::docxtpl=0.11.5" + container "quay.io/sanger-tol/python_docx_template:0.11.5-c1" + + input: + tuple val(meta), path(param_data) + path(note_template) + + output: + tuple val(meta), path("*.{docx,xml}"), emit: genome_note + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: meta.id + def file_type = note_template.extension + + """ + populate_genome_note_template.py \\ + $param_data \\ + $note_template \\ + ${file_type} \\ + ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + populate_genome_note_template.py: \$(populate_genome_note_template.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/run_wget.nf b/modules/local/run_wget.nf new file mode 100644 index 00000000..6ae86b9f --- /dev/null +++ b/modules/local/run_wget.nf @@ -0,0 +1,35 @@ + +process RUN_WGET { + + tag "${meta.source}|${meta.type}" + label 'process_single' + + conda "bioconda::gnu-wget=1.18" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h7132678_6' : + 'quay.io/biocontainers/gnu-wget:1.18--h7132678_6' }" + + input: + tuple val(meta), val(url) + + + output: + tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}*.${meta.ext}") , emit: file_path + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def no_certificate = (meta.source == 'GOAT') ? '--no-check-certificate' : '' + def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : "" + def output = "${meta.id}_${meta.source}_${meta.type}${is_biosample}.${meta.ext}".strip('_') + """ + wget ${no_certificate} -c -O ${output} '${url}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -n 1 | cut -d' ' -f3) + END_VERSIONS + """ +} diff --git a/modules/local/upload_higlass_data.nf b/modules/local/upload_higlass_data.nf new file mode 100644 index 00000000..1b1d8971 --- /dev/null +++ b/modules/local/upload_higlass_data.nf @@ -0,0 +1,91 @@ +process UPLOAD_HIGLASS_DATA { + tag "$meta.id" + label 'process_single' + + container "bitnami/kubectl:1.27" + + input: + tuple val(meta), path(mcool) + tuple val(meta2), path(genome) + val(higlass_data_project_dir) + path(upload_dir) + + output: + env map_uuid, emit: map_uuid + env grid_uuid, emit: grid_uuid + env file_name, emit: file_name + tuple val(meta2), path(genome), emit: genome_file + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "UPLOAD_HIGLASS_DATA modules do not support Conda. Please use Docker / Singularity / Podman instead." + } + def assembly = "${meta.assembly}" + def species = "${meta.species}" + def project_name = "${higlass_data_project_dir}/${species.replaceAll('\\s','_')}/${assembly}" + def file_name = "${assembly}_${meta.id}" + // uid cannot contain a "." + def uid = "${file_name.replaceAll('\\.','_')}" + + + """ + # Configure kubectl access to the namespace + export KUBECONFIG=$params.higlass_kubeconfig + kubectl config get-contexts + kubectl config set-context --current --namespace=$params.higlass_namespace + + # Find the name of the pod + sel=\$(kubectl get deployments.apps $params.higlass_deployment_name --output=json | jq -j '.spec.selector.matchLabels | to_entries | .[] | "\\(.key)=\\(.value),"') + sel2=\${sel%?} + pod_name=\$(kubectl get pod --selector=\$sel2 --output=jsonpath={.items[0].metadata.name}) + echo "\$pod_name" + + # Copy the files to the upload area + mkdir -p ${upload_dir}${project_name} + cp -f $mcool ${upload_dir}${project_name}/${file_name}.mcool + cp -f $genome ${upload_dir}${project_name}/${file_name}.genome + + + # Loop over files to load them in Kubernetes + + files_to_upload=(".mcool" ".genome") + + for file_ext in \${files_to_upload[@]}; do + echo "loading \$file_ext file" + + # Set file type and uuid to use for tileset. This uuid is needed for creating viewconfig. + + if [[ \$file_ext == ".mcool" ]] + then + file_type="map" + map_uuid=${uid}_\${file_type} + + elif [[ \$file_ext == '.genome' ]] + then + file_type="grid" + grid_uuid=${uid}_\${file_type} + fi + + # Call the bash script to handle upload of file to higlass server + + upload_higlass_file.sh \$pod_name ${project_name} ${file_name} \$file_type \$file_ext ${uid} ${assembly} + + echo "\$file_ext loaded" + done + + # Set file name to pass through to view config creation + file_name=${file_name} + + echo "done" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kubectl: \$(kubectl version --output=json | jq -r ".clientVersion.gitVersion") + END_VERSIONS + """ +} diff --git a/modules/local/write_to_database.nf b/modules/local/write_to_database.nf new file mode 100644 index 00000000..0fcdf82d --- /dev/null +++ b/modules/local/write_to_database.nf @@ -0,0 +1,32 @@ + +process WRITE_TO_GENOME_NOTES_DB { + secret 'TOL_API_KEY' + + tag = "" + label 'process_single' + + conda "conda-forge::python=3.9.1" + container "gitlab-registry.internal.sanger.ac.uk/tol-it/software/docker-images-test/tol_sdk:0.12.5-c1" + input: + tuple val(meta), path(param_data) + val api_url + + output: + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/ + """ + write_to_genome_notes_db.py \\ + $param_data \\ + $api_url \\ + \$TOL_API_KEY \\ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + write_to_genome_notes_db.py: \$(write_to_genome_notes_db.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 828ee20b..0785357c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,9 +10,29 @@ params { // Input options - input = null - binsize = 1000 - kmer_size = 31 + input = null + binsize = 1000 + kmer_size = 31 + + // Metadata + assembly = null + biosample_wgs = null + biosample_hic = null + biosample_rna = null + + // Genome Notes + write_to_portal = false + genome_notes_api = null + note_template = null + + // HiGlass options + upload_higlass_data = false + higlass_url = null + higlass_upload_directory = null + higlass_data_project_dir = null + higlass_kubeconfig = null + higlass_deployment_name = null + higlass_namespace = null // Database lineage_db = null @@ -222,7 +242,7 @@ manifest { description = """Creating standarised genome assembly publications""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.2.2' + version = '2.0.0' doi = '10.5281/zenodo.7949384' } @@ -287,4 +307,3 @@ def positive_log(value, base) { def log_increase_cpus(start, step, value, base) { return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus') } - diff --git a/nextflow_schema.json b/nextflow_schema.json index e24cfdb9..881561d4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "binsize", "kmer_size", "outdir"], + "required": ["input", "assembly", "biosample_wgs", "binsize", "kmer_size", "outdir"], "properties": { "input": { "type": "string", @@ -32,6 +32,22 @@ "default": 31, "description": "Size for Fastk to create the k-mer library" }, + "assembly": { + "type": "string", + "description": "The Genbank assembly accession for the assembly, for example: GCA_922984935.2." + }, + "biosample_wgs": { + "type": "string", + "description": "The biosample accesion(s) linked to the WGS samples in the experiment, for example: SAMEA7520803." + }, + "biosample_rna": { + "type": "string", + "description": "The biosample accesion(s) linked to the RNA samples in the experiment, for example: SAMEA7521081." + }, + "biosample_hic": { + "type": "string", + "description": "The biosample accesion(s) linked to the Hi-C samples in the experiment, for example: SAMEA7520846." + }, "outdir": { "type": "string", "format": "directory-path", @@ -58,9 +74,72 @@ "description": "Path to a file that contains the sequence names in the order you want to represent them on the contact-map.", "help_text": "If not specified, the pipeline will use the sequences in the same order they are listed by the NCI, which follows the karyotype.", "fa_icon": "fas fa-file-lines" + }, + "write_to_portal": { + "type": "boolean", + "description": "Flag to control if results are written to genome notes portal database .", + "default": false, + "fa_icon": "fas far fa-file-code", + "hidden": true + }, + "genome_notes_api": { + "type": "string", + "description": "URL for Genome Notes Portal API .", + "fa_icon": "far fa-file-code" + }, + "note_template": { + "type": "string", + "format": "file-path", + "description": "The path to a genome note template file.", + "help_text": "Set this parameter if you have a genome note template file that you wish to populate. Templates may be docx or xml files", + "fa_icon": "fas fa-folder-open" + }, + "upload_higlass_data": { + "type": "boolean", + "description": "flag to control if Higlass server should be updated to add new files", + "default": false, + "fa_icon": "fas far fa-file-code", + "hidden": true + }, + "higlass_url": { + "type": "string", + "description": "URL for the Higlass server", + "hidden": true + }, + "higlass_upload_directory": { + "type": "string", + "format": "directory-path", + "description": "The ingress directory for the kubernetes cluster running the HiGlass server.", + "fa_icon": "fas fa-folder-open", + "hidden": true + }, + "higlass_data_project_dir": { + "type": "string", + "format": "directory-path", + "description": "Subdirectory struture to use for organising HiGlass data, suggested format is / e.g. '/asg/algae'", + "fa_icon": "fas fa-folder-open", + "hidden": true + }, + "higlass_kubeconfig": { + "type": "string", + "format": "file-path", + "description": "The path to the kubeconfig for the kubernetes cluster running the HiGlass server.", + "fa_icon": "fas fa-folder-open", + "hidden": true + }, + "higlass_deployment_name": { + "type": "string", + "description": "Name of the kubernetes deployment for the HiGlass server.", + "hidden": true + }, + "higlass_namespace": { + "type": "string", + "description": "The name for the namespace used in the Kubernetes cluster running the HiGlass server.", + "hidden": true } } }, + "databases": { "title": "Databases", "type": "object", diff --git a/subworkflows/local/combine_note_data.nf b/subworkflows/local/combine_note_data.nf new file mode 100644 index 00000000..b9a34611 --- /dev/null +++ b/subworkflows/local/combine_note_data.nf @@ -0,0 +1,69 @@ +#!/usr/bin/env nextflow + +// +// Combine output to produce genome note doc and optionally write data back to genome dates database +// +include { PARSE_METADATA } from '../../modules/local/parse_metadata' +include { COMBINE_STATISTICS_AND_METADATA } from '../../modules/local/combine_statistics_and_metadata' +include { POPULATE_TEMPLATE } from '../../modules/local/populate_template' +include { WRITE_TO_GENOME_NOTES_DB } from '../../modules/local/write_to_database' + +workflow COMBINE_NOTE_DATA { + take: + ch_params_consistent // channel: /path/to/csv/file/consistent_parameters from GENOME_METADATA subworkflow + ch_params_inconsistent // channel: /path/to/csv/file/consistent_parameters from GENOME_METADATA subworkflow + ch_summary // channel: /path/to/csv/summary/file from GENOME_STATISTICS subworkflow + ch_higlass // channel: /path/to/csv/higlass_link from CONTACT_MAPS subworkflow + ch_note_template // channel: /path/to/genome_note_doc_template + + + main: + ch_versions = Channel.empty() + + ch_summary + | map { meta, json -> + [ meta + [ + ext: "csv", + source: "genome", + type: "summary", + ], json ] + } + | set { ch_summary_meta } + + PARSE_METADATA(ch_summary_meta) + ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() ) + + + COMBINE_STATISTICS_AND_METADATA(ch_params_consistent, ch_params_inconsistent, PARSE_METADATA.out.file_path) + ch_versions = ch_versions.mix( COMBINE_STATISTICS_AND_METADATA.out.versions.first() ) + + ch_higlass + | map { it -> [ [id: params.assembly] , it ] } + + + // Add higlass url to the parsed dataset + COMBINE_STATISTICS_AND_METADATA.out.consistent.concat(ch_higlass) + .map { it -> + it[1] + } + .collectFile(name: 'combined.csv', sort: false) { it -> + it.text + } + .map { it -> [ [id: params.assembly] , it ] } + .set { ch_parsed } + + POPULATE_TEMPLATE( ch_parsed, ch_note_template ) + ch_versions = ch_versions.mix( POPULATE_TEMPLATE.out.versions.first() ) + + if ( params.write_to_portal ) { + ch_api_url = Channel.of(params.genome_notes_api) + WRITE_TO_GENOME_NOTES_DB( COMBINE_STATISTICS_AND_METADATA.out.consistent, ch_api_url ) + ch_versions = ch_versions.mix( WRITE_TO_GENOME_NOTES_DB.out.versions.first() ) + } + + + emit: + template = POPULATE_TEMPLATE.out.genome_note // channel: [ docx ] + versions = ch_versions.ifEmpty(null) // channel: [versions.yml] + +} \ No newline at end of file diff --git a/subworkflows/local/contact_maps.nf b/subworkflows/local/contact_maps.nf index ca784baf..5d06d680 100644 --- a/subworkflows/local/contact_maps.nf +++ b/subworkflows/local/contact_maps.nf @@ -11,7 +11,8 @@ include { FILTER_BED } from '../../modules/local/filter/bed' include { COOLER_CLOAD } from '../../modules/nf-core/cooler/cload/main' include { COOLER_ZOOMIFY } from '../../modules/nf-core/cooler/zoomify/main' include { COOLER_DUMP } from '../../modules/nf-core/cooler/dump/main' - +include { UPLOAD_HIGLASS_DATA } from '../../modules/local/upload_higlass_data' +include { GENERATE_HIGLASS_LINK } from '../../modules/local/generate_higlass_link' workflow CONTACT_MAPS { take: @@ -24,7 +25,7 @@ workflow CONTACT_MAPS { main: ch_versions = Channel.empty() - + ch_higlass_link = Channel.empty() // Extract the ordered chromosome list GET_CHROMLIST ( summary_seq, cool_order.ifEmpty([]) ) @@ -95,9 +96,22 @@ workflow CONTACT_MAPS { ch_versions = ch_versions.mix ( COOLER_DUMP.out.versions.first() ) + // Optionally add the files to a HiGlass webserver + + if ( params.upload_higlass_data ) { + UPLOAD_HIGLASS_DATA (COOLER_ZOOMIFY.out.mcool, COOLER_DUMP.out.bedpe, params.higlass_data_project_dir, params.higlass_upload_directory ) + ch_versions = ch_versions.mix ( UPLOAD_HIGLASS_DATA.out.versions.first() ) + + GENERATE_HIGLASS_LINK (UPLOAD_HIGLASS_DATA.out.file_name, UPLOAD_HIGLASS_DATA.out.map_uuid, UPLOAD_HIGLASS_DATA.out.grid_uuid, params.higlass_url, UPLOAD_HIGLASS_DATA.out.genome_file) + ch_versions = ch_versions.mix ( GENERATE_HIGLASS_LINK.out.versions.first() ) + ch_higlass_link = ch_higlass_link.mix ( GENERATE_HIGLASS_LINK.out.higlass_link.first() ) + } + + emit: - cool = COOLER_CLOAD.out.cool // tuple val(meta), val(cool_bin), path("*.cool") - mcool = COOLER_ZOOMIFY.out.mcool // tuple val(meta), path("*.mcool") - grid = COOLER_DUMP.out.bedpe // tuple val(meta), path("*.bedpe") - versions = ch_versions // channel: [ versions.yml ] + cool = COOLER_CLOAD.out.cool // tuple val(meta), val(cool_bin), path("*.cool") + mcool = COOLER_ZOOMIFY.out.mcool // tuple val(meta), path("*.mcool") + grid = COOLER_DUMP.out.bedpe // tuple val(meta), path("*.bedpe") + link = ch_higlass_link // channel: [ *_higlass_link.csv] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/genome_metadata.nf b/subworkflows/local/genome_metadata.nf new file mode 100755 index 00000000..22088de9 --- /dev/null +++ b/subworkflows/local/genome_metadata.nf @@ -0,0 +1,125 @@ +#!/usr/bin/env nextflow + +// +// Fetch genome metadata for genome notes +// + +include { RUN_WGET } from '../../modules/local/run_wget' +include { PARSE_METADATA } from '../../modules/local/parse_metadata' +include { COMBINE_METADATA } from '../../modules/local/combine_metadata' +include { FETCH_GBIF_METADATA } from '../../modules/local/fetch_gbif_metadata' + + +workflow GENOME_METADATA { + take: + ch_file_list // channel: [meta, /path/to/genome_metadata_file_template] + + main: + ch_versions = Channel.empty() + + // Define channel for RUN_WGET + ch_file_list + | splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1) + | flatMap { meta, row -> + // Create a list to hold the final entries + def entries = [] + + def new_meta = meta + [source: row.source] + [type: row.type] + [ext: row.ext] + + + // Define biosamples with their types + def biosamples = [ + ["WGS", new_meta.biosample_wgs], + ["HIC", new_meta.biosample_hic], + ["RNA", new_meta.biosample_rna] + ] + + // Process each biosample + biosamples.each { biosampleType, biosampleID -> + if ( biosampleID != null ) { + // Skip if biosampleID is null} + def url = row.url + .replaceAll(/ASSEMBLY_ACCESSION/, new_meta.id) + .replaceAll(/TAXONOMY_ID/, new_meta.taxon_id) + .replaceAll(/BIOPROJECT_ACCESSION/, new_meta.bioproject) + .replaceAll(/BIOSAMPLE_ACCESSION/, biosampleID) + + if (row.type == 'Biosample') { + // Add entry with biosample type in meta for Biosample type + entries << [ + new_meta + [biosample_type: biosampleType], + url + ] + } else { + // Add entry without biosample type in meta for other types + entries << [ + new_meta + [biosample_type: ''], + url + ] + } + } + } + return entries + } + | unique() + | set { file_list } + + // Fetch files + RUN_WGET ( file_list ) + ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() ) + + PARSE_METADATA(RUN_WGET.out.file_path) + ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() ) + + + // Set channel for running GBIF + ch_gbif_params = Channel.empty() + + ch_file_list + | map { meta, it -> + def assembly = meta.id + def species = meta.species + [assembly, species] + } + | set { ch_gbif_params} + + // Fetch GBIF metdata using genus, species and id as input channels + FETCH_GBIF_METADATA( ch_gbif_params ) + ch_versions = ch_versions.mix(FETCH_GBIF_METADATA.out.versions.first() ) + + // Combining the two output channels into one channel + FETCH_GBIF_METADATA.out.file_path + | map { it -> tuple( it )} + | set { ch_gbif } + + PARSE_METADATA.out.file_path + | map { it -> tuple( it[1] )} + | set { ch_parsed } + + ch_parsed.mix(ch_gbif) + | collect + | map { it -> + [ it ] + } + | set { ch_parsed_files } + + // Set meta required for file parsing + ch_file_list + | map { meta, it -> + [id: meta.id, taxon_id: meta.taxon_id] + } + | set {ch_meta} + + // combine meta and parsed files + ch_meta_parsed = ch_meta.combine(ch_parsed_files) + + + COMBINE_METADATA( ch_meta_parsed ) + ch_versions = ch_versions.mix( COMBINE_METADATA.out.versions.first() ) + + emit: + consistent = COMBINE_METADATA.out.consistent // channel: [ csv ] + inconsistent = COMBINE_METADATA.out.inconsistent // channel: [ csv ] + versions = ch_versions.ifEmpty(null) // channel: [versions.yml] + +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7d4889f5..c0f83e92 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,35 +2,81 @@ // Check input samplesheet and get read channels // +include { PARAMS_CHECK } from '../../modules/local/params_check' include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv + cli_params // tuple, see below main: + + PARAMS_CHECK ( cli_params ) + .csv + .splitCsv (header:true, sep: ',') + | map { row -> + meta = [ + id: row.assembly, + species: row.species, + taxon_id: row.taxon_id, + bioproject: row.bioproject, + biosample_wgs: row.wgs_biosample, + ] + + if (row.hic_biosample != "null") { + meta.biosample_hic = row.hic_biosample + } + + if (row.rna_biosample != "null") { + meta.biosample_rna = row.rna_biosample + } + + [meta] + } + | set { param } + SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_data_channel(it) } - .set { data } + .map { create_data_channel(it, params.assembly) } + .set { ch_data } + + // set temp key to allow combining channels + param + .map { meta -> + [meta.id[0], meta] + } + .set { ch_tmp_param } + // add some metadata params to the data channel meta + ch_data + .combine(ch_tmp_param, by: 0) + .map { assembly, meta, sample, meta2 -> + def new_meta = meta.clone() + new_meta.species = meta2.species[0] + new_meta.taxon_id = meta2.taxon_id[0] + [new_meta, sample] + } + .set { data } emit: - data // channel: [ val(meta), data ] + data // channel: [ val(meta), data ] + param // channel: [val(meta)] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } // Function to get list of [ meta, data ] -def create_data_channel(LinkedHashMap row) { +def create_data_channel(LinkedHashMap row, assembly) { // create meta map def meta = [:] meta.id = row.sample meta.datatype = row.datatype + meta.assembly = assembly // add path(s) of the data file(s) to the meta map - return [ meta, file(row.datafile) ] + return [ meta.assembly, meta, file(row.datafile) ] } diff --git a/workflows/genomenote.nf b/workflows/genomenote.nf index c46e7a52..412718af 100644 --- a/workflows/genomenote.nf +++ b/workflows/genomenote.nf @@ -14,6 +14,8 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_d for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters +if (params.assembly && params.biosample_wgs) { metadata_inputs = [ params.assembly, params.biosample_wgs ] } +else { exit 1, 'Metadata input not specified. Please include an assembly accession and a biosample accession for the WGS data' } if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Genome fasta not specified!' } if (params.binsize) { ch_bin = Channel.of(params.binsize) } else { exit 1, 'Bin size for cooler/cload not specified!' } @@ -23,8 +25,10 @@ if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.linea // Check optional parameters if (params.lineage_db) { ch_lineage_db = Channel.fromPath(params.lineage_db) } else { ch_lineage_db = Channel.empty() } +if (params.note_template) { ch_note_template = Channel.fromPath(params.note_template) } else { ch_note_template = Channel.empty() } if (params.cool_order) { ch_cool_order = Channel.fromPath(params.cool_order) } else { ch_cool_order = Channel.empty() } - +if (params.biosample_hic) metadata_inputs.add(params.biosample_hic) else metadata_inputs.add(null) +if (params.biosample_rna) metadata_inputs.add(params.biosample_rna) else metadata_inputs.add(null) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -32,12 +36,13 @@ if (params.cool_order) { ch_cool_order = Channel.fromPath(params.cool_order) } e ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +ch_metdata_input = Channel.of( metadata_inputs ) +ch_file_list = Channel.fromPath("$projectDir/assets/genome_metadata_template.csv") ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -48,8 +53,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { GENOME_METADATA } from '../subworkflows/local/genome_metadata' include { CONTACT_MAPS } from '../subworkflows/local/contact_maps' include { GENOME_STATISTICS } from '../subworkflows/local/genome_statistics' +include { COMBINE_NOTE_DATA } from '../subworkflows/local/combine_note_data' /* @@ -79,11 +86,10 @@ workflow GENOMENOTE { ch_versions = Channel.empty() - // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - INPUT_CHECK ( ch_input ).data + INPUT_CHECK ( ch_input, ch_metdata_input ).data | branch { meta, file -> hic : meta.datatype == 'hic' return [ meta, file, [] ] @@ -95,10 +101,21 @@ workflow GENOMENOTE { // - // MODULE: Uncompress fasta file if needed + // SUBWORKFLOW: Read in template of data files to fetch, parse these files and output a list of genome metadata params + // + + INPUT_CHECK.out.param.combine( ch_file_list ) + | set { ch_metadata } + + + GENOME_METADATA ( ch_metadata ) + ch_versions = ch_versions.mix(GENOME_METADATA.out.versions) + // - ch_fasta - | map { file -> [ [ 'id': file.baseName ], file ] } + // MODULE: Uncompress fasta file if needed and set meta based on input params + // + + INPUT_CHECK.out.param.combine( ch_fasta ) | set { ch_genome } if ( params.fasta.endsWith('.gz') ) { @@ -126,13 +143,19 @@ workflow GENOMENOTE { GENOME_STATISTICS ( ch_fasta, ch_lineage_tax_ids, ch_lineage_db, ch_inputs.pacbio, ch_flagstat ) ch_versions = ch_versions.mix ( GENOME_STATISTICS.out.versions ) - // // SUBWORKFLOW: Create contact map matrices from HiC alignment files // CONTACT_MAPS ( ch_fasta, ch_inputs.hic, GENOME_STATISTICS.out.summary_seq, ch_bin, ch_cool_order ) ch_versions = ch_versions.mix ( CONTACT_MAPS.out.versions ) + // + // SUBWORKFLOW: Combine data from previous steps to create formatted genome note + // + + COMBINE_NOTE_DATA (GENOME_METADATA.out.consistent, GENOME_METADATA.out.inconsistent, GENOME_STATISTICS.out.summary, CONTACT_MAPS.out.link, ch_note_template) + ch_versions = ch_versions.mix ( COMBINE_NOTE_DATA.out.versions ) + // // MODULE: Combine different versions.yml