From 2fe8abf0436151a3940e8b84cfdfff39bf636c8e Mon Sep 17 00:00:00 2001 From: LaraFuhrmann <55209716+LaraFuhrmann@users.noreply.github.com> Date: Tue, 24 Oct 2023 11:30:28 +0200 Subject: [PATCH] Feat subsample use jvarkit (#3) * current ba.2 samples * [add] subsampling with jvarkit * update threads * subsampling * conda env paths * conda env paths * conda env paths * correct path * primer file for BA.2 variant * add TODO for env_jvarkit * [add] collect haplotype sequences of all samples * typo * [run_viloca] adapt resources * sort subsampled bam file * fix: typo * add conda env for subsampling * [subsampling rule] update command * add jvarkit conda package + change coverage cap threshold * modify coverage threshold * use full bed file * add coverage stats after subsampling * add new insert bed file version * fix * add conda for stats_coverage * new threshold --- config/config.yaml | 2 +- config/samples.csv | 15 ++-- resources/SARS-CoV-2.v532.insert.bed | 96 +++++++++++++++++++++++++ workflow/Snakefile | 77 ++++++++++++++++---- workflow/envs/smallgenomeutilities.yaml | 6 ++ workflow/envs/subsampling.yaml | 6 ++ workflow/scripts/combine_txt_files.py | 16 +++++ workflow/scripts/haplotypes.py | 46 ++++++++++++ workflow/scripts/run_viloca.py | 5 +- 9 files changed, 245 insertions(+), 24 deletions(-) create mode 100644 resources/SARS-CoV-2.v532.insert.bed create mode 100644 workflow/envs/smallgenomeutilities.yaml create mode 100644 workflow/envs/subsampling.yaml create mode 100644 workflow/scripts/combine_txt_files.py create mode 100644 workflow/scripts/haplotypes.py diff --git a/config/config.yaml b/config/config.yaml index c9ec413..5c2f441 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,4 +1,4 @@ fname_reference: resources/NC_045512.2.fasta -fname_insert_bed: resources/SARS-CoV-2.insert.bed +fname_insert_bed: resources/SARS-CoV-2.v532.insert.bed fname_samples: config/samples.csv dir_path_samples: # add path to samples here diff --git a/config/samples.csv b/config/samples.csv index d9947ff..a471437 100644 --- a/config/samples.csv +++ b/config/samples.csv @@ -1,11 +1,6 @@ ,sample,batch -5342,A2_10_2021_04_08,20210423_JFPC6 -2529,A2_10_2022_04_07,20220422_HTN5VDRXY -2480,A2_10_2022_04_14,20220429_HTYNFDRXY -2434,A2_10_2022_04_21,20220506_HTYK5DRXY -2390,A2_10_2022_04_28,20220513_HTLLHDRXY -1,A2_10_2023_04_06,20230421_HNH2FDRX2 -5354,B2_10_2021_04_09,20210423_JFPC6 -2577,B2_10_2022_04_01,20220414_HTNCFDRXY -2535,B2_10_2022_04_08,20220422_HTN5VDRXY -2486,B2_10_2022_04_15,20220429_HTYNFDRXY +,H2_15_2023_08_08,20230825_HHK3MDRX3 +,A3_15_2023_08_10,20230825_HHK3MDRX3 +,B3_15_2023_08_05,20230818_HHKGTDRX3 +,C6_25_2023_08_05,20230818_HHKGTDRX3 +,D6_25_2023_08_06,20230818_HHKGTDRX3 diff --git a/resources/SARS-CoV-2.v532.insert.bed b/resources/SARS-CoV-2.v532.insert.bed new file mode 100644 index 0000000..305b039 --- /dev/null +++ b/resources/SARS-CoV-2.v532.insert.bed @@ -0,0 +1,96 @@ +NC_045512.2 78 419 SARS-CoV-2_400_INSERT_1 1 - +NC_045512.2 366 707 SARS-CoV-2_400_INSERT_2 2 - +NC_045512.2 661 1018 SARS-CoV-2_400_INSERT_3 1 - +NC_045512.2 995 1340 SARS-CoV-2_400_INSERT_4 2 - +NC_045512.2 1320 1660 SARS-CoV-2_400_INSERT_5 1 - +NC_045512.2 1596 1945 SARS-CoV-2_400_INSERT_6 2 - +NC_045512.2 1905 2259 SARS-CoV-2_400_INSERT_7 1 - +NC_045512.2 2252 2603 SARS-CoV-2_400_INSERT_8 2 - +NC_045512.2 2563 2900 SARS-CoV-2_400_INSERT_9 1 - +NC_045512.2 2880 3233 SARS-CoV-2_400_INSERT_10 2 - +NC_045512.2 3213 3560 SARS-CoV-2_400_INSERT_11 1 - +NC_045512.2 3540 3883 SARS-CoV-2_400_INSERT_12 2 - +NC_045512.2 3824 4147 SARS-CoV-2_400_INSERT_13 1 - +NC_045512.2 4108 4457 SARS-CoV-2_400_INSERT_14 2 - +NC_045512.2 4425 4776 SARS-CoV-2_400_INSERT_15 1 - +NC_045512.2 4756 5089 SARS-CoV-2_400_INSERT_16 2 - +NC_045512.2 5063 5398 SARS-CoV-2_400_INSERT_17 1 - +NC_045512.2 5370 5716 SARS-CoV-2_400_INSERT_18 2 - +NC_045512.2 5696 6031 SARS-CoV-2_400_INSERT_19 1 - +NC_045512.2 5923 6257 SARS-CoV-2_400_INSERT_20 2 - +NC_045512.2 6237 6562 SARS-CoV-2_400_INSERT_21 1 - +NC_045512.2 6542 6882 SARS-CoV-2_400_INSERT_22 2 - +NC_045512.2 6854 7199 SARS-CoV-2_400_INSERT_23 1 - +NC_045512.2 7179 7518 SARS-CoV-2_400_INSERT_24 2 - +NC_045512.2 7482 7819 SARS-CoV-2_400_INSERT_25 1 - +NC_045512.2 7797 8136 SARS-CoV-2_400_INSERT_26 2 - +NC_045512.2 8112 8468 SARS-CoV-2_400_INSERT_27 1 - +NC_045512.2 8436 8781 SARS-CoV-2_400_INSERT_28 2 - +NC_045512.2 8761 9107 SARS-CoV-2_400_INSERT_29 1 - +NC_045512.2 9052 9397 SARS-CoV-2_400_INSERT_30 2 - +NC_045512.2 9324 9673 SARS-CoV-2_400_INSERT_31 1 - +NC_045512.2 9604 9949 SARS-CoV-2_400_INSERT_32 2 - +NC_045512.2 9929 10266 SARS-CoV-2_400_INSERT_33 1 - +NC_045512.2 10245 10587 SARS-CoV-2_400_INSERT_34 2 - +NC_045512.2 10557 10897 SARS-CoV-2_400_INSERT_35 1 - +NC_045512.2 10865 11201 SARS-CoV-2_400_INSERT_36 2 - +NC_045512.2 11181 11514 SARS-CoV-2_400_INSERT_37 1 - +NC_045512.2 11494 11832 SARS-CoV-2_400_INSERT_38 2 - +NC_045512.2 11811 12161 SARS-CoV-2_400_INSERT_39 1 - +NC_045512.2 12137 12477 SARS-CoV-2_400_INSERT_40 2 - +NC_045512.2 12444 12794 SARS-CoV-2_400_INSERT_41 1 - +NC_045512.2 12774 13121 SARS-CoV-2_400_INSERT_42 2 - +NC_045512.2 13099 13458 SARS-CoV-2_400_INSERT_43 1 - +NC_045512.2 13435 13787 SARS-CoV-2_400_INSERT_44 2 - +NC_045512.2 13767 14120 SARS-CoV-2_400_INSERT_45 1 - +NC_045512.2 14100 14427 SARS-CoV-2_400_INSERT_46 2 - +NC_045512.2 14407 14745 SARS-CoV-2_400_INSERT_47 1 - +NC_045512.2 14725 15065 SARS-CoV-2_400_INSERT_48 2 - +NC_045512.2 15045 15386 SARS-CoV-2_400_INSERT_49 1 - +NC_045512.2 15366 15716 SARS-CoV-2_400_INSERT_50 2 - +NC_045512.2 15688 16028 SARS-CoV-2_400_INSERT_51 1 - +NC_045512.2 16018 16386 SARS-CoV-2_400_INSERT_52 2 - +NC_045512.2 16311 16650 SARS-CoV-2_400_INSERT_53 1 - +NC_045512.2 16647 17004 SARS-CoV-2_400_INSERT_54 2 - +NC_045512.2 16994 17333 SARS-CoV-2_400_INSERT_55 1 - +NC_045512.2 17212 17560 SARS-CoV-2_400_INSERT_56 2 - +NC_045512.2 17507 17859 SARS-CoV-2_400_INSERT_57 1 - +NC_045512.2 17839 18181 SARS-CoV-2_400_INSERT_58 2 - +NC_045512.2 18153 18504 SARS-CoV-2_400_INSERT_59 1 - +NC_045512.2 18484 18835 SARS-CoV-2_400_INSERT_60 2 - +NC_045512.2 18815 19170 SARS-CoV-2_400_INSERT_61 1 - +NC_045512.2 19112 19469 SARS-CoV-2_400_INSERT_62 2 - +NC_045512.2 19449 19770 SARS-CoV-2_400_INSERT_63 1 - +NC_045512.2 19750 20091 SARS-CoV-2_400_INSERT_64 2 - +NC_045512.2 20054 20408 SARS-CoV-2_400_INSERT_65 1 - +NC_045512.2 20388 20729 SARS-CoV-2_400_INSERT_66 2 - +NC_045512.2 20676 21018 SARS-CoV-2_400_INSERT_67 1 - +NC_045512.2 21018 21372 SARS-CoV-2_400_INSERT_68 2 - +NC_045512.2 21352 21696 SARS-CoV-2_400_INSERT_69 1 - +NC_045512.2 21607 21927 SARS-CoV-2_400_INSERT_70 2 - +NC_045512.2 21894 22238 SARS-CoV-2_400_INSERT_71 1 - +NC_045512.2 22189 22517 SARS-CoV-2_400_INSERT_72 2 - +NC_045512.2 22494 22839 SARS-CoV-2_400_INSERT_73 1 - +NC_045512.2 22774 23119 SARS-CoV-2_400_INSERT_74 2 - +NC_045512.2 23109 23452 SARS-CoV-2_400_INSERT_75 1 - +NC_045512.2 23258 23609 SARS-CoV-2_400_INSERT_76 2 - +NC_045512.2 23589 23914 SARS-CoV-2_400_INSERT_77 1 - +NC_045512.2 23853 24209 SARS-CoV-2_400_INSERT_78 2 - +NC_045512.2 24189 24535 SARS-CoV-2_400_INSERT_79 1 - +NC_045512.2 24468 24815 SARS-CoV-2_400_INSERT_80 2 - +NC_045512.2 24774 25120 SARS-CoV-2_400_INSERT_81 1 - +NC_045512.2 25082 25423 SARS-CoV-2_400_INSERT_82 2 - +NC_045512.2 25402 25744 SARS-CoV-2_400_INSERT_83 1 - +NC_045512.2 25680 26048 SARS-CoV-2_400_INSERT_84 2 - +NC_045512.2 26039 26382 SARS-CoV-2_400_INSERT_85 1 - +NC_045512.2 26362 26730 SARS-CoV-2_400_INSERT_86 2 - +NC_045512.2 26621 26989 SARS-CoV-2_400_INSERT_87 1 - +NC_045512.2 26981 27349 SARS-CoV-2_400_INSERT_88 2 - +NC_045512.2 27226 27583 SARS-CoV-2_400_INSERT_89 1 - +NC_045512.2 27558 27927 SARS-CoV-2_400_INSERT_90 2 - +NC_045512.2 27860 28209 SARS-CoV-2_400_INSERT_91 1 - +NC_045512.2 28166 28513 SARS-CoV-2_400_INSERT_92 2 - +NC_045512.2 28493 28849 SARS-CoV-2_400_INSERT_93 1 - +NC_045512.2 28829 29203 SARS-CoV-2_400_INSERT_94 2 - +NC_045512.2 29183 29538 SARS-CoV-2_400_INSERT_95 1 - +NC_045512.2 29486 29840 SARS-CoV-2_400_INSERT_96 2 - diff --git a/workflow/Snakefile b/workflow/Snakefile index bbcdc98..d8c1b9c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -22,7 +22,12 @@ rule all: input: f"results/mutations_of_interest.csv", f"results/all_mutations.csv", + f"results/all_haplotypes.fasta", f"results/all_mutations.annotated.csv", # note that here deletions are excluded + [ + f"results/{sample}/alignment/coverage.merged.subsampled.tsv" + for sample in all_samples + ], rule copy_coverage_file: input: @@ -128,23 +133,41 @@ rule alignment_merged: samtools sort -o {output.fname_bam} {output.fname_bam} """ -# TODO: integrate this tool for capping at a given coverage -# https://lindenb.github.io/jvarkit/Biostar154220.html rule subsample_alignment: input: fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.bam", - fname_coverage=f"results/{{sample}}/alignment/coverage.tsv", output: + fname_bam_unsort=temp(f"results/{{sample}}/alignment/REF_aln.merged.unsort.bam"), + fname_bam_unsort_sub=temp(f"results/{{sample}}/alignment/REF_aln.merged.unsort.subsampled.bam"), fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam", params: - sample= lambda wc: wc.get("sample"), - coverage_threshold=200000 + coverage_threshold=10000, # note the acutally threshold is at 2*coverage_threshold with jvarkit biostar154220 conda: - "envs/split.yaml" - script: - "./scripts/subsample_aln.py" + "envs/subsampling.yaml" + shell: + """ + jvarkit sortsamrefname --samoutputformat BAM {input.fname_bam} > {output.fname_bam_unsort} + + jvarkit biostar154220 -n {params.coverage_threshold} --samoutputformat BAM {output.fname_bam_unsort} > {output.fname_bam_unsort_sub} + + samtools sort -o {output.fname_bam} {output.fname_bam_unsort_sub} + samtools index {output.fname_bam} + """ +rule stats_coverage: + input: + fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam", + output: + fname_cov=f"results/{{sample}}/alignment/coverage.merged.subsampled.tsv", + fname_b=f"results/{{sample}}/alignment/basecounts.merged.subsampled.tsv", + conda: + "envs/smallgenomeutilities.yaml" + shell: + """ + aln2basecnt -b {output.fname_b} -c {output.fname_cov} {input.fname_bam} + """ + rule remove_original_aln: input: fname_bam_merged=f"results/{{sample}}/alignment/REF_aln.merged.bam", @@ -158,7 +181,7 @@ rule remove_original_aln: rule run_viloca: input: - fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.bam", + fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam", #fname_bam=rules.alignment_merged.output.fname_bam, fname_reference=fname_reference, fname_insert_bed=fname_insert_bed, @@ -168,20 +191,50 @@ rule run_viloca: dname_work=directory( f"results/{{sample}}/variant_calling/" ), + dname_haplotypes=directory( + f"results/{{sample}}/variant_calling/support/" + ), benchmark: f"results/{{sample}}/benchmark.tsv" params: sample= lambda wc: wc.get("sample") conda: "envs/viloca.yaml" - threads: 32, + threads: 15, resources: - mem_mb=64000, - runtime=5760, + mem_mb=10000, + runtime=4*24*60, # 4 days script: "scripts/run_viloca.py" +rule get_haplotypes: + input: + dname_haplotypes=directory( + f"results/{{sample}}/variant_calling/support/" + ), + output: + fname_haplotypes=f"results/{{sample}}/haplotypes.fasta", + params: + sample= lambda wc: wc.get("sample") + conda: + "envs/annotate_vcf.yaml" + script: + "scripts/haplotypes.py" + +rule collect_haplotypes: + input: + haplos=[ + f"results/{sample}/haplotypes.fasta" + for sample in all_samples + ], + output: + fname=f"results/all_haplotypes.fasta", + script: + "scripts/combine_txt_files.py" + + + rule annotate_vcf: input: fname_snvs_vcf=f"results/{{sample}}/snvs.vcf", diff --git a/workflow/envs/smallgenomeutilities.yaml b/workflow/envs/smallgenomeutilities.yaml new file mode 100644 index 0000000..c79be54 --- /dev/null +++ b/workflow/envs/smallgenomeutilities.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - smallgenomeutilities=0.3.9 diff --git a/workflow/envs/subsampling.yaml b/workflow/envs/subsampling.yaml new file mode 100644 index 0000000..8a293a1 --- /dev/null +++ b/workflow/envs/subsampling.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - jvarkit + - samtools=1.9 diff --git a/workflow/scripts/combine_txt_files.py b/workflow/scripts/combine_txt_files.py new file mode 100644 index 0000000..a42ad92 --- /dev/null +++ b/workflow/scripts/combine_txt_files.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 + +def main(input_files, output_file): + + with open(output_file, 'w') as outfile: + for fname in input_files: + with open(fname) as infile: + outfile.write(infile.read()) + + + +if __name__ == "__main__": + main( + snakemake.input.haplos, + snakemake.output.fname, + ) diff --git a/workflow/scripts/haplotypes.py b/workflow/scripts/haplotypes.py new file mode 100644 index 0000000..813897d --- /dev/null +++ b/workflow/scripts/haplotypes.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +import glob +import subprocess + +def unzip(fname): + subprocess.run( + [ + "gunzip", + str(fname), + ], + ) + + +def main(dname_haplotypes, sample_name, fname_out_haplos): + + for fname in glob.glob(f"{dname_haplotypes}*.reads-support.fas.gz"): + unzip(fname) + + all_haplotype_files = [file for file in glob.glob(f"{dname_haplotypes}*.reads-support.fas")] + + # define output file + with open(fname_out_haplos, 'w') as out_f: + my_seqs = [] + + # iterate over all haplotype files + for fname_haplotype_region in fnames_haplotypes: + region = fname_haplotype_region.split(".reads-support.fas")[0].split("w-")[1] + for record in SeqIO.parse(fname_haplotype_region, 'fasta'): + new_id = sample_name+'-'+region + '-' +str(record.id) + my_seqs.append(SeqRecord(Seq(record.seq), id = new_id)) + + # write all sequences to file + SeqIO.write(my_seqs, out_f, "fasta") + + + +if __name__ == "__main__": + main( + snakemake.input.dname_haplotypes, + snakemake.params.sample, + snakemake.output.fname_haplotypes, + ) diff --git a/workflow/scripts/run_viloca.py b/workflow/scripts/run_viloca.py index 8812081..17167a9 100644 --- a/workflow/scripts/run_viloca.py +++ b/workflow/scripts/run_viloca.py @@ -5,7 +5,7 @@ import pandas as pd -def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_work, fname_bad_samples, sample): +def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_work, fname_bad_samples, sample, threads): bad_samples = pd.read_csv(fname_bad_samples)['sample'].values.tolist() @@ -46,6 +46,8 @@ def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_ #"NC_045512.2:10099-23327", "--min_windows_coverage", "1", + "--threads", + str(threads) ], cwd=dname_work, ) @@ -62,4 +64,5 @@ def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_ Path(snakemake.output.dname_work), snakemake.input.fname_bad_samples, snakemake.params.sample, + snakemake.threads )