Consolidate the per-source metadata parsers into a single PARSE_METADATA process.

  * conf/test.config, conf/test_full.config: set write_to_portal = false.
  * modules/local/combine_metadata.nf: tag is now just meta.id; the parsed files
    arrive as a staged path() input and each "--<simple name>_file" flag is
    derived from the file's simple name.
  * modules/local/parse_metadata.nf (new): runs parse_<ext>_<source>_<type>.py
    on a fetched file and emits <source>_<type>.csv (all lower-cased).
  * subworkflows/local/genome_metadata.nf: the seven PARSE_* includes, the
    branch{} and the concat chain are replaced by PARSE_METADATA -> collect ->
    COMBINE_METADATA; id and taxon_id are added to the RUN_WGET meta map.

Review fix: in the post-collect map closure, 'meta' is now declared with 'def'.
Without it the assignment creates a script-level (global) variable instead of a
closure-local one.

NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Indentation, column alignment and the position of blank context lines are
best-effort, and the blob ids on the "index" lines predate the review fix.
Regenerate with `git diff` (or apply with --ignore-whitespace) before merging.

diff --git a/conf/test.config b/conf/test.config
index af8e691f..ade68cb3 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -32,7 +32,7 @@ params {
     biosample = 'SAMEA10835113'
 
     // Genome Notes Portal
-    write_to_portal = true
+    write_to_portal = false
     genome_notes_api = "https://notes-staging.tol.sanger.ac.uk/api/v1"
 
 }
diff --git a/conf/test_full.config b/conf/test_full.config
index 28044be0..23891d0c 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -32,7 +32,7 @@ params {
     biosample = 'SAMEA10835113'
 
     // Genome Notes Portal
-    write_to_portal = true
+    write_to_portal = false
     genome_notes_api = "https://notes-staging.tol.sanger.ac.uk/api/v1"
 
 }
diff --git a/modules/local/combine_metadata.nf b/modules/local/combine_metadata.nf
index a3706034..63f4dd25 100644
--- a/modules/local/combine_metadata.nf
+++ b/modules/local/combine_metadata.nf
@@ -1,5 +1,5 @@
 process COMBINE_METADATA {
-    tag "${meta.id}|combine_parsed"
+    tag "${meta.id}"
     label 'process_single'
 
     conda "conda-forge::python=3.9.1"
@@ -8,7 +8,7 @@ process COMBINE_METADATA {
         'quay.io/biocontainers/python:3.9--1' }"
 
     input:
-    tuple val(meta), val(file_list)
+    tuple val(meta), path(file_list)
 
     output:
     tuple val (meta), path("consistent.csv") , emit: consistent
@@ -21,10 +21,9 @@ process COMBINE_METADATA {
     script:
     def args = []
     for (item in file_list){
-        def meta_file = item[0]
-        def file = item[1]
-        def arg = "--${meta_file.source}_${meta_file.type}_file".toLowerCase()
-        args.add(arg)
+        def file = item
+        def file_name = "--" + item.getSimpleName() + "_file"
+        args.add(file_name)
         args.add(file)
     }
 
diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf
new file mode 100644
index 00000000..b627ad0a
--- /dev/null
+++ b/modules/local/parse_metadata.nf
@@ -0,0 +1,35 @@
+
+process PARSE_METADATA {
+    tag "${meta.ext}|${meta.source}|${meta.type}"
+    label 'process_single'
+
+    conda "conda-forge::python=3.9.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.9--1' :
+        'quay.io/biocontainers/python:3.9--1' }"
+
+    input:
+    tuple val(meta), path(json)
+
+    output:
+    tuple val(meta), path("${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv") , emit: file_path
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+
+    script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/
+    def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py"
+    def output_file = "${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv"
+    """
+    $script_name \\
+        $json \\
+        $output_file
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        $script_name: \$($script_name --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/genome_metadata.nf b/subworkflows/local/genome_metadata.nf
index c210afd7..23d6ed5d 100644
--- a/subworkflows/local/genome_metadata.nf
+++ b/subworkflows/local/genome_metadata.nf
@@ -5,14 +5,8 @@
 //
 
 include { RUN_WGET } from '../../modules/local/run_wget'
-include { PARSE_ENA_ASSEMBLY } from '../../modules/local/parse_ena_assembly'
-include { PARSE_ENA_BIOPROJECT } from '../../modules/local/parse_ena_bioproject'
-include { PARSE_ENA_BIOSAMPLE } from '../../modules/local/parse_ena_biosample'
-include { PARSE_ENA_TAXONOMY } from '../../modules/local/parse_ena_taxonomy'
-include { PARSE_NCBI_ASSEMBLY } from '../../modules/local/parse_ncbi_assembly'
-include { PARSE_NCBI_TAXONOMY } from '../../modules/local/parse_ncbi_taxonomy'
-include { PARSE_GOAT_ASSEMBLY } from '../../modules/local/parse_goat_assembly'
-include { COMBINE_METADATA } from '../../modules/local/combine_metadata'
+include { PARSE_METADATA } from '../../modules/local/parse_metadata'
+include { COMBINE_METADATA } from '../../modules/local/combine_metadata'
 include { POPULATE_TEMPLATE } from '../../modules/local/populate_template'
 include { WRITE_TO_GENOME_NOTES_DB } from '../../modules/local/write_to_database'
 
@@ -23,20 +17,15 @@ workflow GENOME_METADATA {
 
     main:
     ch_versions = Channel.empty()
-    ch_combined = Channel.empty()
-
-
-    def meta = [:]
-    meta.id = params.assembly
-    meta.taxon_id = params.taxon_id
-    ch_combined_params = Channel.of(meta)
-
+
     // Define channel for RUN_WGET
     ch_file_list
-    .splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
-    .map { row -> [
+    | splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
+    | map { row ->
+        [
         // meta
-        [
+        [   id: params.assembly,
+            taxon_id: params.taxon_id,
             source: row.source,
             type: row.type,
             ext: row.ext,
@@ -47,66 +36,38 @@ workflow GENOME_METADATA {
             .replaceAll(/TAXONOMY_ID/, params.taxon_id)
             .replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
             .replaceAll(/BIOSAMPLE_ACCESSION/, params.biosample)
-    ] }
-    .set{file_list}
+        ]
+    }
+    | set { file_list }
 
     // Fetch files
-    RUN_WGET ( file_list )
-
-    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
-
-    ch_input = RUN_WGET.out.file_path.branch {
-        ENA_ASSEMBLY: it[0].source == "ENA" && it[0].type == "Assembly"
-        ENA_BIOPROJECT: it[0].source == "ENA" && it[0].type == "Bioproject"
-        ENA_BIOSAMPLE: it[0].source == "ENA" && it[0].type == "Biosample"
-        ENA_TAXONOMY: it[0].source == "ENA" && it[0].type == "Taxonomy"
-        NCBI_ASSEMBLY: it[0].source == "NCBI" && it[0].type == "Assembly"
-        NCBI_TAXONOMY: it[0].source == "NCBI" && it[0].type == "Taxonomy"
-        GOAT_ASSEMBLY: it[0].source == "GOAT" && it[0].type == "Assembly"
+    RUN_WGET ( file_list )
+    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
+
+    PARSE_METADATA(RUN_WGET.out.file_path)
+    ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() )
+
+    PARSE_METADATA.out.file_path
+    | map { it -> tuple( it[1] )}
+    | collect
+    | map { it ->
+        def meta = [:] // 'def' keeps meta local to this closure rather than a script-level global
+        meta.id = params.assembly
+        meta.taxon_id = params.taxon_id
+        [ meta, it ]
     }
+    | set { ch_parsed_files }
 
-    PARSE_ENA_ASSEMBLY ( ch_input.ENA_ASSEMBLY )
-    ch_versions = ch_versions.mix( PARSE_ENA_ASSEMBLY.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_ENA_ASSEMBLY.out.file_path )
-
-    PARSE_ENA_BIOPROJECT ( ch_input.ENA_BIOPROJECT )
-    ch_versions = ch_versions.mix( PARSE_ENA_BIOPROJECT.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_ENA_BIOPROJECT.out.file_path )
-
-
-    PARSE_ENA_BIOSAMPLE ( ch_input.ENA_BIOSAMPLE )
-    ch_versions = ch_versions.mix( PARSE_ENA_BIOSAMPLE.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_ENA_BIOSAMPLE.out.file_path )
-
-
-    PARSE_ENA_TAXONOMY ( ch_input.ENA_TAXONOMY )
-    ch_versions = ch_versions.mix( PARSE_ENA_TAXONOMY.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_ENA_TAXONOMY.out.file_path )
-
-    PARSE_NCBI_ASSEMBLY ( ch_input.NCBI_ASSEMBLY )
-    ch_versions = ch_versions.mix( PARSE_NCBI_ASSEMBLY.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_NCBI_ASSEMBLY.out.file_path )
-
-    PARSE_NCBI_TAXONOMY ( ch_input.NCBI_TAXONOMY )
-    ch_versions = ch_versions.mix( PARSE_NCBI_TAXONOMY.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_NCBI_TAXONOMY.out.file_path )
-
-    PARSE_GOAT_ASSEMBLY ( ch_input.GOAT_ASSEMBLY)
-    ch_versions = ch_versions.mix( PARSE_GOAT_ASSEMBLY.out.versions.first() )
-    ch_combined = ch_combined.concat( PARSE_GOAT_ASSEMBLY.out.file_path )
+    COMBINE_METADATA(ch_parsed_files)
+    ch_versions = ch_versions.mix( COMBINE_METADATA.out.versions.first() )
 
 
-    ch_combined = ch_combined.collect(flat: false)
-    ch_combined_params = ch_combined_params.concat(ch_combined).collect(flat: false)
-    COMBINE_METADATA(ch_combined_params)
-    ch_versions = ch_versions.mix( COMBINE_METADATA.out.versions.first() )
-
     COMBINE_METADATA.out.consistent
-    .multiMap { it ->
+    | multiMap { it ->
         TEMPLATE: it
         DB: it
     }
-    .set { ch_params_consistent }
+    | set { ch_params_consistent }
 
     POPULATE_TEMPLATE( ch_params_consistent.TEMPLATE, ch_note_template )
     ch_versions = ch_versions.mix( POPULATE_TEMPLATE.out.versions.first() )
@@ -120,4 +81,5 @@ workflow GENOME_METADATA {
     emit:
     template = POPULATE_TEMPLATE.out.genome_note // channel: [ docx ]
     versions = ch_versions.ifEmpty(null) // channel: [versions.yml]
+
 }