Skip to content

Commit

Permalink
Feat subsample use jvarkit (#3)
Browse files Browse the repository at this point in the history
* current ba.2 samples

* [add] subsampling with jvarkit

* update threads

* subsampling

* conda env paths

* conda env paths

* conda env paths

* correct path

* primer file for BA.2 variant

* add TODO for env_jvarkit

* [add] collect haplotype sequences of all samples

* typo

* [run_viloca] adapt resources

* sort subsampled bam file

* fix: typo

* add conda env for subsampling

* [subsampling rule] update command

* add jvarkit conda package + change coverage cap threshold

* modify coverage threshold

* use full bed file

* add coverage stats after subsampling

* add new insert bed file version

* fix

* add conda for stats_coverage

* new threshold
  • Loading branch information
LaraFuhrmann authored Oct 24, 2023
1 parent 37c7fc0 commit 2fe8abf
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 24 deletions.
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fname_reference: resources/NC_045512.2.fasta
fname_insert_bed: resources/SARS-CoV-2.insert.bed
fname_insert_bed: resources/SARS-CoV-2.v532.insert.bed
fname_samples: config/samples.csv
dir_path_samples: # add path to samples here
15 changes: 5 additions & 10 deletions config/samples.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
,sample,batch
5342,A2_10_2021_04_08,20210423_JFPC6
2529,A2_10_2022_04_07,20220422_HTN5VDRXY
2480,A2_10_2022_04_14,20220429_HTYNFDRXY
2434,A2_10_2022_04_21,20220506_HTYK5DRXY
2390,A2_10_2022_04_28,20220513_HTLLHDRXY
1,A2_10_2023_04_06,20230421_HNH2FDRX2
5354,B2_10_2021_04_09,20210423_JFPC6
2577,B2_10_2022_04_01,20220414_HTNCFDRXY
2535,B2_10_2022_04_08,20220422_HTN5VDRXY
2486,B2_10_2022_04_15,20220429_HTYNFDRXY
,H2_15_2023_08_08,20230825_HHK3MDRX3
,A3_15_2023_08_10,20230825_HHK3MDRX3
,B3_15_2023_08_05,20230818_HHKGTDRX3
,C6_25_2023_08_05,20230818_HHKGTDRX3
,D6_25_2023_08_06,20230818_HHKGTDRX3
96 changes: 96 additions & 0 deletions resources/SARS-CoV-2.v532.insert.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
NC_045512.2 78 419 SARS-CoV-2_400_INSERT_1 1 -
NC_045512.2 366 707 SARS-CoV-2_400_INSERT_2 2 -
NC_045512.2 661 1018 SARS-CoV-2_400_INSERT_3 1 -
NC_045512.2 995 1340 SARS-CoV-2_400_INSERT_4 2 -
NC_045512.2 1320 1660 SARS-CoV-2_400_INSERT_5 1 -
NC_045512.2 1596 1945 SARS-CoV-2_400_INSERT_6 2 -
NC_045512.2 1905 2259 SARS-CoV-2_400_INSERT_7 1 -
NC_045512.2 2252 2603 SARS-CoV-2_400_INSERT_8 2 -
NC_045512.2 2563 2900 SARS-CoV-2_400_INSERT_9 1 -
NC_045512.2 2880 3233 SARS-CoV-2_400_INSERT_10 2 -
NC_045512.2 3213 3560 SARS-CoV-2_400_INSERT_11 1 -
NC_045512.2 3540 3883 SARS-CoV-2_400_INSERT_12 2 -
NC_045512.2 3824 4147 SARS-CoV-2_400_INSERT_13 1 -
NC_045512.2 4108 4457 SARS-CoV-2_400_INSERT_14 2 -
NC_045512.2 4425 4776 SARS-CoV-2_400_INSERT_15 1 -
NC_045512.2 4756 5089 SARS-CoV-2_400_INSERT_16 2 -
NC_045512.2 5063 5398 SARS-CoV-2_400_INSERT_17 1 -
NC_045512.2 5370 5716 SARS-CoV-2_400_INSERT_18 2 -
NC_045512.2 5696 6031 SARS-CoV-2_400_INSERT_19 1 -
NC_045512.2 5923 6257 SARS-CoV-2_400_INSERT_20 2 -
NC_045512.2 6237 6562 SARS-CoV-2_400_INSERT_21 1 -
NC_045512.2 6542 6882 SARS-CoV-2_400_INSERT_22 2 -
NC_045512.2 6854 7199 SARS-CoV-2_400_INSERT_23 1 -
NC_045512.2 7179 7518 SARS-CoV-2_400_INSERT_24 2 -
NC_045512.2 7482 7819 SARS-CoV-2_400_INSERT_25 1 -
NC_045512.2 7797 8136 SARS-CoV-2_400_INSERT_26 2 -
NC_045512.2 8112 8468 SARS-CoV-2_400_INSERT_27 1 -
NC_045512.2 8436 8781 SARS-CoV-2_400_INSERT_28 2 -
NC_045512.2 8761 9107 SARS-CoV-2_400_INSERT_29 1 -
NC_045512.2 9052 9397 SARS-CoV-2_400_INSERT_30 2 -
NC_045512.2 9324 9673 SARS-CoV-2_400_INSERT_31 1 -
NC_045512.2 9604 9949 SARS-CoV-2_400_INSERT_32 2 -
NC_045512.2 9929 10266 SARS-CoV-2_400_INSERT_33 1 -
NC_045512.2 10245 10587 SARS-CoV-2_400_INSERT_34 2 -
NC_045512.2 10557 10897 SARS-CoV-2_400_INSERT_35 1 -
NC_045512.2 10865 11201 SARS-CoV-2_400_INSERT_36 2 -
NC_045512.2 11181 11514 SARS-CoV-2_400_INSERT_37 1 -
NC_045512.2 11494 11832 SARS-CoV-2_400_INSERT_38 2 -
NC_045512.2 11811 12161 SARS-CoV-2_400_INSERT_39 1 -
NC_045512.2 12137 12477 SARS-CoV-2_400_INSERT_40 2 -
NC_045512.2 12444 12794 SARS-CoV-2_400_INSERT_41 1 -
NC_045512.2 12774 13121 SARS-CoV-2_400_INSERT_42 2 -
NC_045512.2 13099 13458 SARS-CoV-2_400_INSERT_43 1 -
NC_045512.2 13435 13787 SARS-CoV-2_400_INSERT_44 2 -
NC_045512.2 13767 14120 SARS-CoV-2_400_INSERT_45 1 -
NC_045512.2 14100 14427 SARS-CoV-2_400_INSERT_46 2 -
NC_045512.2 14407 14745 SARS-CoV-2_400_INSERT_47 1 -
NC_045512.2 14725 15065 SARS-CoV-2_400_INSERT_48 2 -
NC_045512.2 15045 15386 SARS-CoV-2_400_INSERT_49 1 -
NC_045512.2 15366 15716 SARS-CoV-2_400_INSERT_50 2 -
NC_045512.2 15688 16028 SARS-CoV-2_400_INSERT_51 1 -
NC_045512.2 16018 16386 SARS-CoV-2_400_INSERT_52 2 -
NC_045512.2 16311 16650 SARS-CoV-2_400_INSERT_53 1 -
NC_045512.2 16647 17004 SARS-CoV-2_400_INSERT_54 2 -
NC_045512.2 16994 17333 SARS-CoV-2_400_INSERT_55 1 -
NC_045512.2 17212 17560 SARS-CoV-2_400_INSERT_56 2 -
NC_045512.2 17507 17859 SARS-CoV-2_400_INSERT_57 1 -
NC_045512.2 17839 18181 SARS-CoV-2_400_INSERT_58 2 -
NC_045512.2 18153 18504 SARS-CoV-2_400_INSERT_59 1 -
NC_045512.2 18484 18835 SARS-CoV-2_400_INSERT_60 2 -
NC_045512.2 18815 19170 SARS-CoV-2_400_INSERT_61 1 -
NC_045512.2 19112 19469 SARS-CoV-2_400_INSERT_62 2 -
NC_045512.2 19449 19770 SARS-CoV-2_400_INSERT_63 1 -
NC_045512.2 19750 20091 SARS-CoV-2_400_INSERT_64 2 -
NC_045512.2 20054 20408 SARS-CoV-2_400_INSERT_65 1 -
NC_045512.2 20388 20729 SARS-CoV-2_400_INSERT_66 2 -
NC_045512.2 20676 21018 SARS-CoV-2_400_INSERT_67 1 -
NC_045512.2 21018 21372 SARS-CoV-2_400_INSERT_68 2 -
NC_045512.2 21352 21696 SARS-CoV-2_400_INSERT_69 1 -
NC_045512.2 21607 21927 SARS-CoV-2_400_INSERT_70 2 -
NC_045512.2 21894 22238 SARS-CoV-2_400_INSERT_71 1 -
NC_045512.2 22189 22517 SARS-CoV-2_400_INSERT_72 2 -
NC_045512.2 22494 22839 SARS-CoV-2_400_INSERT_73 1 -
NC_045512.2 22774 23119 SARS-CoV-2_400_INSERT_74 2 -
NC_045512.2 23109 23452 SARS-CoV-2_400_INSERT_75 1 -
NC_045512.2 23258 23609 SARS-CoV-2_400_INSERT_76 2 -
NC_045512.2 23589 23914 SARS-CoV-2_400_INSERT_77 1 -
NC_045512.2 23853 24209 SARS-CoV-2_400_INSERT_78 2 -
NC_045512.2 24189 24535 SARS-CoV-2_400_INSERT_79 1 -
NC_045512.2 24468 24815 SARS-CoV-2_400_INSERT_80 2 -
NC_045512.2 24774 25120 SARS-CoV-2_400_INSERT_81 1 -
NC_045512.2 25082 25423 SARS-CoV-2_400_INSERT_82 2 -
NC_045512.2 25402 25744 SARS-CoV-2_400_INSERT_83 1 -
NC_045512.2 25680 26048 SARS-CoV-2_400_INSERT_84 2 -
NC_045512.2 26039 26382 SARS-CoV-2_400_INSERT_85 1 -
NC_045512.2 26362 26730 SARS-CoV-2_400_INSERT_86 2 -
NC_045512.2 26621 26989 SARS-CoV-2_400_INSERT_87 1 -
NC_045512.2 26981 27349 SARS-CoV-2_400_INSERT_88 2 -
NC_045512.2 27226 27583 SARS-CoV-2_400_INSERT_89 1 -
NC_045512.2 27558 27927 SARS-CoV-2_400_INSERT_90 2 -
NC_045512.2 27860 28209 SARS-CoV-2_400_INSERT_91 1 -
NC_045512.2 28166 28513 SARS-CoV-2_400_INSERT_92 2 -
NC_045512.2 28493 28849 SARS-CoV-2_400_INSERT_93 1 -
NC_045512.2 28829 29203 SARS-CoV-2_400_INSERT_94 2 -
NC_045512.2 29183 29538 SARS-CoV-2_400_INSERT_95 1 -
NC_045512.2 29486 29840 SARS-CoV-2_400_INSERT_96 2 -
77 changes: 65 additions & 12 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@ rule all:
input:
f"results/mutations_of_interest.csv",
f"results/all_mutations.csv",
f"results/all_haplotypes.fasta",
f"results/all_mutations.annotated.csv", # note that here deletions are excluded
[
f"results/{sample}/alignment/coverage.merged.subsampled.tsv"
for sample in all_samples
],

rule copy_coverage_file:
input:
Expand Down Expand Up @@ -128,23 +133,41 @@ rule alignment_merged:
samtools sort -o {output.fname_bam} {output.fname_bam}
"""

# TODO: integrate this tool for capping at a given coverage
# https://lindenb.github.io/jvarkit/Biostar154220.html
rule subsample_alignment:
input:
fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.bam",
fname_coverage=f"results/{{sample}}/alignment/coverage.tsv",
output:
fname_bam_unsort=temp(f"results/{{sample}}/alignment/REF_aln.merged.unsort.bam"),
fname_bam_unsort_sub=temp(f"results/{{sample}}/alignment/REF_aln.merged.unsort.subsampled.bam"),
fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam",
params:
sample= lambda wc: wc.get("sample"),
coverage_threshold=200000
coverage_threshold=10000, # note the acutally threshold is at 2*coverage_threshold with jvarkit biostar154220
conda:
"envs/split.yaml"
script:
"./scripts/subsample_aln.py"
"envs/subsampling.yaml"
shell:
"""
jvarkit sortsamrefname --samoutputformat BAM {input.fname_bam} > {output.fname_bam_unsort}
jvarkit biostar154220 -n {params.coverage_threshold} --samoutputformat BAM {output.fname_bam_unsort} > {output.fname_bam_unsort_sub}
samtools sort -o {output.fname_bam} {output.fname_bam_unsort_sub}
samtools index {output.fname_bam}
"""


rule stats_coverage:
input:
fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam",
output:
fname_cov=f"results/{{sample}}/alignment/coverage.merged.subsampled.tsv",
fname_b=f"results/{{sample}}/alignment/basecounts.merged.subsampled.tsv",
conda:
"envs/smallgenomeutilities.yaml"
shell:
"""
aln2basecnt -b {output.fname_b} -c {output.fname_cov} {input.fname_bam}
"""

rule remove_original_aln:
input:
fname_bam_merged=f"results/{{sample}}/alignment/REF_aln.merged.bam",
Expand All @@ -158,7 +181,7 @@ rule remove_original_aln:

rule run_viloca:
input:
fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.bam",
fname_bam=f"results/{{sample}}/alignment/REF_aln.merged.subsampled.bam",
#fname_bam=rules.alignment_merged.output.fname_bam,
fname_reference=fname_reference,
fname_insert_bed=fname_insert_bed,
Expand All @@ -168,20 +191,50 @@ rule run_viloca:
dname_work=directory(
f"results/{{sample}}/variant_calling/"
),
dname_haplotypes=directory(
f"results/{{sample}}/variant_calling/support/"
),
benchmark:
f"results/{{sample}}/benchmark.tsv"
params:
sample= lambda wc: wc.get("sample")
conda:
"envs/viloca.yaml"
threads: 32,
threads: 15,
resources:
mem_mb=64000,
runtime=5760,
mem_mb=10000,
runtime=4*24*60, # 4 days
script:
"scripts/run_viloca.py"


rule get_haplotypes:
input:
dname_haplotypes=directory(
f"results/{{sample}}/variant_calling/support/"
),
output:
fname_haplotypes=f"results/{{sample}}/haplotypes.fasta",
params:
sample= lambda wc: wc.get("sample")
conda:
"envs/annotate_vcf.yaml"
script:
"scripts/haplotypes.py"

rule collect_haplotypes:
input:
haplos=[
f"results/{sample}/haplotypes.fasta"
for sample in all_samples
],
output:
fname=f"results/all_haplotypes.fasta",
script:
"scripts/combine_txt_files.py"



rule annotate_vcf:
input:
fname_snvs_vcf=f"results/{{sample}}/snvs.vcf",
Expand Down
6 changes: 6 additions & 0 deletions workflow/envs/smallgenomeutilities.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- smallgenomeutilities=0.3.9
6 changes: 6 additions & 0 deletions workflow/envs/subsampling.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
dependencies:
- jvarkit
- samtools=1.9
16 changes: 16 additions & 0 deletions workflow/scripts/combine_txt_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python3

def main(input_files, output_file):

with open(output_file, 'w') as outfile:
for fname in input_files:
with open(fname) as infile:
outfile.write(infile.read())



if __name__ == "__main__":
main(
snakemake.input.haplos,
snakemake.output.fname,
)
46 changes: 46 additions & 0 deletions workflow/scripts/haplotypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import glob
import subprocess

def unzip(fname):
subprocess.run(
[
"gunzip",
str(fname),
],
)


def main(dname_haplotypes, sample_name, fname_out_haplos):

for fname in glob.glob(f"{dname_haplotypes}*.reads-support.fas.gz"):
unzip(fname)

all_haplotype_files = [file for file in glob.glob(f"{dname_haplotypes}*.reads-support.fas")]

# define output file
with open(fname_out_haplos, 'w') as out_f:
my_seqs = []

# iterate over all haplotype files
for fname_haplotype_region in fnames_haplotypes:
region = fname_haplotype_region.split(".reads-support.fas")[0].split("w-")[1]
for record in SeqIO.parse(fname_haplotype_region, 'fasta'):
new_id = sample_name+'-'+region + '-' +str(record.id)
my_seqs.append(SeqRecord(Seq(record.seq), id = new_id))

# write all sequences to file
SeqIO.write(my_seqs, out_f, "fasta")



if __name__ == "__main__":
main(
snakemake.input.dname_haplotypes,
snakemake.params.sample,
snakemake.output.fname_haplotypes,
)
5 changes: 4 additions & 1 deletion workflow/scripts/run_viloca.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd


def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_work, fname_bad_samples, sample):
def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_work, fname_bad_samples, sample, threads):

bad_samples = pd.read_csv(fname_bad_samples)['sample'].values.tolist()

Expand Down Expand Up @@ -46,6 +46,8 @@ def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_
#"NC_045512.2:10099-23327",
"--min_windows_coverage",
"1",
"--threads",
str(threads)
],
cwd=dname_work,
)
Expand All @@ -62,4 +64,5 @@ def main(fname_bam, fname_reference, fname_insert_bed, fname_results_snv, dname_
Path(snakemake.output.dname_work),
snakemake.input.fname_bad_samples,
snakemake.params.sample,
snakemake.threads
)

0 comments on commit 2fe8abf

Please sign in to comment.