diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl
index 696df4bf1..6b2f75e74 100644
--- a/pipes/WDL/tasks/tasks_assembly.wdl
+++ b/pipes/WDL/tasks/tasks_assembly.wdl
@@ -407,9 +407,9 @@ task refine_assembly_with_aligned_reads {
         File     reads_aligned_bam
         String   sample_name
 
-        Boolean? mark_duplicates = false
-        Float?   major_cutoff = 0.5
-        Int?     min_coverage = 3
+        Boolean  mark_duplicates = false
+        Float    major_cutoff = 0.5
+        Int      min_coverage = 3
 
         Int?     machine_mem_gb
         String   docker = "quay.io/broadinstitute/viral-assemble:2.1.16.1"
diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl
index f8871b9aa..43ee24a90 100644
--- a/pipes/WDL/tasks/tasks_reports.wdl
+++ b/pipes/WDL/tasks/tasks_reports.wdl
@@ -5,6 +5,8 @@ task alignment_metrics {
         File    aligned_bam
         File    ref_fasta
         File?   primers_bed
+        String? amplicon_set
+        Int?    min_coverage
 
         Int?    machine_mem_gb
         String  docker = "quay.io/broadinstitute/viral-core:2.1.33"
@@ -54,12 +56,28 @@ task alignment_metrics {
         echo -e "$SAMPLE\t~{out_basename}" >> prepend.txt
         paste prepend.txt picard_clean.insert_size_metrics.txt > "~{out_basename}".insert_size_metrics.txt
 
-        # actually don't know how to do CollectTargetedPcrMetrics yet
+        touch "~{out_basename}".ampliconstats.txt "~{out_basename}".ampliconstats_parsed.txt
+        echo -e "sample_sanitized\tbam\tamplicon_set\tamplicon_idx\tamplicon_left\tamplicon_right\tFREADS\tFDEPTH\tFPCOV\tFAMP" > "~{out_basename}.ampliconstats_parsed.txt"
         if [ -n "~{primers_bed}" ]; then
-            picard $XMX BedToIntervalList \
-                -I "~{primers_bed}" \
-                -O primers.interval.list \
-                -SD reference.dict
+            # samtools ampliconstats
+            samtools ampliconstats -s -@ $(nproc) \
+                ~{'-d ' + min_coverage} \
+                -o "~{out_basename}".ampliconstats.txt "~{primers_bed}" "~{aligned_bam}"
+
+            # parse into our own tsv to facilitate tsv joining later
+            if [ -n "~{default='' amplicon_set}" ]; then
+                AMPLICON_SET="~{default='' amplicon_set}"
+            else
+                AMPLICON_SET=$(basename "~{primers_bed}" .bed)
+            fi
+            grep ^AMPLICON "~{out_basename}".ampliconstats.txt | cut -f 2- > AMPLICON
+            grep ^FREADS "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS; echo "" >> FREADS
+            grep ^FDEPTH "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH; echo "" >> FDEPTH
+            grep ^FPCOV "~{out_basename}".ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV; echo "" >> FPCOV
+            grep ^FAMP "~{out_basename}".ampliconstats.txt | cut -f 4 | tail +2 > FAMP
+            for i in $(cut -f 1 AMPLICON); do echo -e "$SAMPLE\t~{out_basename}\t$AMPLICON_SET"; done > prepend.txt
+            wc -l prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP
+            paste prepend.txt AMPLICON FREADS FDEPTH FPCOV FAMP | grep '\S' >> "~{out_basename}.ampliconstats_parsed.txt"
         fi
     >>>
@@ -67,6 +85,8 @@ task alignment_metrics {
         File wgs_metrics           = "~{out_basename}.raw_wgs_metrics.txt"
         File alignment_metrics     = "~{out_basename}.alignment_metrics.txt"
         File insert_size_metrics   = "~{out_basename}.insert_size_metrics.txt"
+        File amplicon_stats        = "~{out_basename}.ampliconstats.txt"
+        File amplicon_stats_parsed = "~{out_basename}.ampliconstats_parsed.txt"
     }
 
     runtime {
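The parsing block above reshapes the row-oriented `samtools ampliconstats` report (one record type per line: `AMPLICON`, `FREADS`, `FDEPTH`, `FPCOV`, `FAMP`, among others) into one long-format row per amplicon. A minimal sketch of that reshaping, run against a mocked-up stats file rather than real samtools output — the mock's field layout is an assumption inferred from the `cut` offsets used in the task:

```bash
#!/usr/bin/env bash
set -e -o pipefail

# Mock only the record types the task consumes; real ampliconstats
# output carries many more record types, which are simply ignored.
{
    printf 'AMPLICON\t1\t31-54\t385-410\n'
    printf 'AMPLICON\t2\t320-342\t704-726\n'
    printf 'FREADS\tdemo.bam\t151\t204\n'
    printf 'FDEPTH\tdemo.bam\t74.8\t99.1\n'
    printf 'FPCOV\tdemo.bam\t100.0\t100.0\n'
    printf 'FAMP\tdemo.bam\t0\t355\nFAMP\tdemo.bam\t1\t151\nFAMP\tdemo.bam\t2\t204\n'
} > demo.ampliconstats.txt

# Amplicon coordinates: drop only the record-type column.
grep ^AMPLICON demo.ampliconstats.txt | cut -f 2- > AMPLICON
# Per-amplicon metrics: drop record type + bam name, then transpose to
# one value per line so they align row-wise with AMPLICON.
grep ^FREADS demo.ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FREADS
grep ^FDEPTH demo.ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FDEPTH
grep ^FPCOV  demo.ampliconstats.txt | cut -f 3- | tr '\t' '\n' > FPCOV
# FAMP is already one row per amplicon; skip the first row (amplicon 0,
# the overall total). The task writes this as the historical `tail +2`;
# `tail -n +2`, used here, is the portable spelling GNU tail accepts.
grep ^FAMP demo.ampliconstats.txt | cut -f 4 | tail -n +2 > FAMP
# One row per amplicon; the real task prepends sample/bam/amplicon_set
# columns via prepend.txt before appending to the parsed TSV.
paste AMPLICON FREADS FDEPTH FPCOV FAMP
```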
intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020." + String sequencing_lab = "Broad Institute" + String intro_blurb = "The Broad Institute Viral Genomics group, in partnership with the Genomics Platform and Data Sciences Platform, has been engaged in viral sequencing of COVID-19 patients since March 2020." String? max_date String? min_date Int? min_unambig @@ -240,9 +240,12 @@ task sc2_meta_final { String? max_date String? min_date - Int? min_unambig=24000 + Int min_unambig=24000 Boolean drop_file_cols=false + String address_map = '{}' + String authors_map = '{}' + File? filter_to_ids String docker = "quay.io/broadinstitute/py3-bio:0.1.2" @@ -274,6 +277,8 @@ task sc2_meta_final { genome_status = json.load(inf) else: genome_status = {} + address_map = json.loads('~{address_map}') + authors_map = json.loads('~{authors_map}') # read input files df_assemblies = pd.read_csv(assemblies_tsv, sep='\t').dropna(how='all') @@ -352,6 +357,10 @@ task sc2_meta_final { # join column: collaborator_id df_assemblies = df_assemblies.merge(collab_ids, on='sample', how='left', validate='one_to_one') + # derived columns: authors, orig_lab_addr + df_assemblies.loc[:,'authors'] = list(authors_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by']) + df_assemblies.loc[:,'orig_lab_addr'] = list(address_map.get(cby) if not pd.isna(cby) else cby for cby in df_assemblies.loc[:,'collected_by']) + # write final output df_assemblies.to_csv("~{out_basename}.final.tsv", sep='\t', index=False) CODE @@ -547,15 +556,19 @@ task gisaid_uploader { File gisaid_sequences_fasta File gisaid_meta_csv File cli_auth_token + String database="EpiCoV" + String frameshift="catch_novel" } command { set -e - cp "~{cli_auth_token}" gisaid_uploader.authtoken - gisaid_uploader CoV upload \ + cli3 upload \ + --database "~{database}" \ + --token "~{cli_auth_token}" \ --fasta "~{gisaid_sequences_fasta}" \ - --csv "~{gisaid_meta_csv}" \ - --failedout failed_metadata.csv \ - | tee logs.txt + --metadata "~{gisaid_meta_csv}" \ + --failed failed_metadata.csv \ + --frameshift "~{frameshift}" \ + --log logs.txt # the following grep statement will exit 1 if anything failed grep "submissions failed: 0" logs.txt > /dev/null } @@ -563,7 +576,7 @@ task gisaid_uploader { File failed_metadata = "failed_metadata.csv" } runtime { - docker: "quay.io/broadinstitute/gisaid-cli:1.0" + docker: "quay.io/broadinstitute/gisaid-cli:3.0" memory: "2 GB" cpu: 2 disks: "local-disk 100 HDD" diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 881157855..38889debc 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -116,6 +116,32 @@ task zcat { } } +task sed { + meta { + description: "Replace all occurrences of 'search' with 'replace' using sed." 
diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index 881157855..38889debc 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -116,6 +116,32 @@ task zcat {
     }
 }
 
+task sed {
+    meta {
+        description: "Replace all occurrences of 'search' with 'replace' using sed."
+    }
+    input {
+        File   infile
+        String search
+        String replace
+        String outfilename = "~{basename(infile)}-rename.txt"
+    }
+    command {
+        sed 's/~{search}/~{replace}/g' "~{infile}" > "~{outfilename}"
+    }
+    runtime {
+        docker: "ubuntu"
+        memory: "1 GB"
+        cpu: 1
+        disks: "local-disk 375 LOCAL"
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        maxRetries: 2
+    }
+    output {
+        File outfile = "~{outfilename}"
+    }
+}
+
 task fasta_to_ids {
     meta {
         description: "Return the headers only from a fasta file"
@@ -461,6 +487,31 @@ task tsv_stack {
     }
 }
 
+task cat_except_headers {
+    input {
+        Array[File]+ infiles
+        String       out_filename
+    }
+
+    command <<<
+        awk 'FNR>1 || NR==1' \
+            ~{sep=' ' infiles} \
+            > ~{out_filename}
+    >>>
+
+    output {
+        File out_tsv = out_filename
+    }
+
+    runtime {
+        memory: "1 GB"
+        cpu: 1
+        docker: "ubuntu"
+        disks: "local-disk 50 HDD"
+        dx_instance_type: "mem1_ssd1_v2_x2"
+        maxRetries: 2
+    }
+}
 task make_empty_file {
     input {
         String out_filename
diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl
index 40ef09fc5..00ee8a445 100644
--- a/pipes/WDL/workflows/assemble_refbased.wdl
+++ b/pipes/WDL/workflows/assemble_refbased.wdl
@@ -122,7 +122,9 @@ workflow assemble_refbased {
     call reports.alignment_metrics {
         input:
             aligned_bam = aligned_trimmed_bam,
-            ref_fasta   = reference_fasta
+            ref_fasta   = reference_fasta,
+            primers_bed = trim_coords_bed,
+            min_coverage = min_coverage
     }
 
     call assembly.run_discordance {
@@ -221,6 +223,8 @@ workflow assemble_refbased {
        File   picard_metrics_wgs            = alignment_metrics.wgs_metrics
        File   picard_metrics_alignment      = alignment_metrics.alignment_metrics
        File   picard_metrics_insert_size    = alignment_metrics.insert_size_metrics
+       File   samtools_ampliconstats        = alignment_metrics.amplicon_stats
+       File   samtools_ampliconstats_parsed = alignment_metrics.amplicon_stats_parsed
 
        Array[File] align_to_self_merged_aligned_and_unaligned_bam = align_to_self.aligned_bam
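The `cat_except_headers` task added above relies on a classic awk idiom: `NR` counts lines across all input files while `FNR` resets for each file, so `FNR>1 || NR==1` keeps the very first line (the header of the first file) and drops line 1 of every subsequent file. A quick standalone demonstration:

```bash
printf 'id\tvalue\na\t1\n' > part1.tsv
printf 'id\tvalue\nb\t2\n' > part2.tsv

# The header survives exactly once; part2's header row is dropped.
awk 'FNR>1 || NR==1' part1.tsv part2.tsv
# id    value
# a     1
# b     2
```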
diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
index 39fd19cd7..b687d38c4 100644
--- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl
+++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -50,6 +50,7 @@ workflow sarscov2_illumina_full {
         Int          min_genome_bases = 24000
         Int          max_vadr_alerts = 0
+        Int          ntc_max_unambig = 3000
 
         File?        sample_rename_map
@@ -107,7 +108,13 @@ workflow sarscov2_illumina_full {
         # assemble genome
         if (ampseq) {
-            String trim_coords_bed = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed"
+            call utils.sed as bed_rename {
+                input:
+                    infile      = amplicon_bed_prefix + demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
+                    outfilename = demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"] + ".bed",
+                    search      = "MN908947.3",
+                    replace     = "NC_045512.2"
+            }
         }
         call assemble_refbased.assemble_refbased {
             input:
@@ -116,7 +123,7 @@ workflow sarscov2_illumina_full {
                 sample_name = name_reads.left,
                 aligner = "minimap2",
                 skip_mark_dupes = ampseq,
-                trim_coords_bed = trim_coords_bed,
+                trim_coords_bed = bed_rename.outfile,
                 major_cutoff = 0.75,
                 min_coverage = if ampseq then 50 else 3
         }
@@ -242,7 +249,7 @@ workflow sarscov2_illumina_full {
                 seqid_list = write_lines(select_all(passing_assembly_ids)),
                 demux_meta_by_sample_json = demux_deplete.meta_by_sample_json,
                 assembly_meta_tsv = sarscov2_batch_relineage.assembly_stats_relineage_tsv,
-                ntc_min_unambig = 15000
+                ntc_min_unambig = ntc_max_unambig
         }
 
     ### QC metrics
@@ -274,6 +281,11 @@ workflow sarscov2_illumina_full {
             id_col = 'sample_sanitized',
             out_basename = "picard_metrics_insertsize-~{flowcell_id}"
     }
+    call utils.cat_except_headers as samtools_ampliconstats_merge {
+        input:
+            infiles = assemble_refbased.samtools_ampliconstats_parsed,
+            out_filename = "samtools_ampliconstats-~{flowcell_id}.txt"
+    }
 
     ### filter and concatenate final sets for delivery ("passing" and "submittable")
     call sarscov2.sc2_meta_final {
@@ -395,14 +407,14 @@ workflow sarscov2_illumina_full {
     if(defined(gcs_out_metrics)) {
         call terra.gcs_copy as gcs_metrics_dump {
             input:
-                infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
+                infiles = flatten([[assembly_meta_tsv.combined, sc2_meta_final.meta_tsv, ivar_trim_stats.trim_stats_tsv, demux_deplete.multiqc_report_raw, demux_deplete.multiqc_report_cleaned, demux_deplete.spikein_counts, picard_wgs_merge.out_tsv, picard_alignment_merge.out_tsv, picard_insertsize_merge.out_tsv, samtools_ampliconstats_merge.out_tsv, sarscov2_batch_relineage.nextclade_all_json, sarscov2_batch_relineage.nextclade_all_tsv], demux_deplete.demux_metrics]),
                 gcs_uri_prefix = "~{gcs_out_metrics}/~{flowcell_id}/"
         }
     }
     if(defined(gcs_out_cdc)) {
         call terra.gcs_copy as gcs_cdc_dump {
             input:
-                infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta],
+                infiles = [sc2_meta_final.meta_tsv, passing_cat.filtered_fasta, gisaid_meta_prep.meta_csv, prefix_gisaid.renamed_fasta, package_genbank_ftp_submission.submission_zip, select_first([demux_deplete.sra_metadata])],
                 gcs_uri_prefix = "~{gcs_out_cdc}/~{demux_deplete.run_date}/~{flowcell_id}/"
         }
         call terra.gcs_copy as gcs_cdc_dump_reads {
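For reference, the `bed_rename` call above only rewrites the chromosome name in the primer BED, so amplicon schemes published against the GenBank accession MN908947.3 line up with the RefSeq accession NC_045512.2 used as the alignment reference (both records are the same Wuhan-Hu-1 sequence). What that does to a BED in isolation — primer names and coordinates below are illustrative, not a real scheme:

```bash
printf 'MN908947.3\t30\t54\tnCoV_1_LEFT\nMN908947.3\t385\t410\tnCoV_1_RIGHT\n' > scheme.bed

# Same substitution the generic utils.sed task performs.
sed 's/MN908947.3/NC_045512.2/g' scheme.bed > renamed.bed
cat renamed.bed
# NC_045512.2   30      54      nCoV_1_LEFT
# NC_045512.2   385     410     nCoV_1_RIGHT
```

One caveat worth noting: the unescaped `.` in the search pattern is a regex wildcard in sed. That is harmless for this substitution, but callers reusing the generic `utils.sed` task with other patterns may want to escape literal dots.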