Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VEP annotation #44

Open
wants to merge 1 commit into
base: ReleaseBranch_2.4.1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions resources/analysisTools/indelCallingWorkflow/annotate_with_VEP.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

## Annotate the variants with VEP

## To run
## LOCAL: sh annotate_variant_with_VEP.sh snvs_{pid}_somatic_snvs_conf_8_to_10.vcf snvs_{pid}_somatic_snvs_conf_8_to_10.VEP.vcf

vep_species="homo_sapiens"
vep_assembly="GRCh37"
vep_out_format="vcf"

input_vcf=${1}
output_vcf=${2}
threads=${VEP_FORKS}

## Annotate the high confidence variants
## Parse for the functional consequences
${PERL_BINARY} ${VEP_SW_PATH} \
--input_file $input_vcf \
--species $vep_species \
--assembly $vep_assembly \
--output_file STDOUT \
--format $vep_out_format \
--fork $threads \
--fasta ${VEP_FA_INDEX} \
--everything \
--vcf \
--cache \
--offline \
--force_overwrite \
--no_stats \
--dir_cache ${VEP_CACHE_BASE} \
| ${PYTHON_BINARY} ${TOOL_PARSE_VEP} | ${BGZIP_BINARY} -f > $output_vcf
10 changes: 10 additions & 0 deletions resources/analysisTools/indelCallingWorkflow/filter_vcf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@
source ${TOOL_ANALYZE_BAM_HEADER}
getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME

########################################## VEP annotation #######################################
## Run VEP on the somatic high confidence SNVs
${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp
[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8

# Overwrite the original VCF file with the VEP annotated one
# NOTE: If there is an error here, one has to rerun the Annotation and DeepAnnotation steps
mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF}
[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9

########################################## Filter ###############################################

outputFilenamePrefix=${FILENAME_VCF%.vcf.gz}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ ${PLATYPUS_BINARY} callVariants \
--verbosity=1 \
--bufferSize=${PLATYPUS_BUFFER_SIZE} \
--maxReads=${PLATYPUS_MAX_READS} \
--minFlank=0 \
--minFlank=0 \
${PLATYPUS_PARAMS}

[[ $? -gt 0 ]] && echo "Error during platypus indel calling." && exit 1
Expand Down
147 changes: 147 additions & 0 deletions resources/analysisTools/indelCallingWorkflow/parse_VEP_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
parse_VEP_annotations.py

This script parses VEP annotations from the input and writes the parsed output to STDOUT.
It contains functions to parse VEP format, gene consequences and HGVSc annotations, and format transcript information.

Usage:
cat VEP_annotated.vcf | python parse_VEP_annotations.py > VEP_annotated_parsed.vcf

"""

import sys

def parse_vep_annotations():
"""
Parses VEP annotations from the input and writes the parsed output to STDOUT.

This function reads input from `sys.stdin` line by line and processes each line.
If a line starts with "#" and contains "##INFO=<ID=CSQ", it extracts the VEP format.
If a line starts with "#CHROM", it appends the header for the VEP gene consequence and HGVSc columns.
For all other lines, it splits the line by tab and extracts the info field.
The info field is then split by semicolon and parsed to extract gene consequence and HGVSc information.
Finally, the transcript information is formatted and written to STDOUT.

Args:
None

Returns:
None
"""
vep_format = {}

for line in sys.stdin:
line = line.strip()
gene_consequence_hgvsc_ds = {}

if line.startswith("#"):
if line.startswith("##INFO=<ID=CSQ"):
vep_format = parse_vep_format(line)
if line.startswith("#CHROM"):
line = line + "\tVEP_TRANSCRIPTS"
# write to STDOUT
sys.stdout.write("{0}\n".format(line))
else:
variant_infos = line.split("\t")
info = variant_infos[7]
info = info.split(";")
gene_consequence_hgvsc_ds = parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds)
line = format_transcript_info(line, gene_consequence_hgvsc_ds)

# write to STDOUT
sys.stdout.write(line)

def parse_vep_format(line):
"""
Parses the VEP format line and returns a dictionary mapping each field to its index.

Args:
line (str): The VEP format line.

Returns:
dict: A dictionary mapping each field to its index.
"""
vep_format = {}
vep_format_line = line.split("Format: ")[1]
vep_format_line = vep_format_line.split("|")
for i, field in enumerate(vep_format_line):
vep_format[field] = i
return vep_format

def parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds):
"""
Parses gene consequences and HGVSc annotations from VEP annotations.

Args:
info (list): List of VEP annotations.
vep_format (dict): Dictionary mapping VEP annotation fields to their indices.
gene_consequence_hgvsc_ds (dict): Dictionary to store gene consequences and HGVSc annotations.

Returns:
dict: Updated gene_consequence_hgvsc_ds dictionary with gene consequences and HGVSc annotations.

"""
for i in info:
if i.startswith("CSQ="):
i = i.replace("CSQ=", "")
i = i.split(",")
for j in i:
j = j.split("|")
gene_name = j[vep_format["SYMBOL"]]
consequence = j[vep_format["Consequence"]]
transcript = j[vep_format["Feature"]]
hgvsc = j[vep_format["HGVSc"]].split(":")[1] if j[vep_format["HGVSc"]] != "" else ""
hgvsp = j[vep_format["HGVSp"]].split(":")[1] if j[vep_format["HGVSp"]] != "" else ""
exon = j[vep_format["EXON"]].split("/")[0] if j[vep_format["EXON"]] != "" else ""
intron = j[vep_format["INTRON"]].split("/")[0] if j[vep_format["INTRON"]] != "" else ""

if exon != "" and intron == "":
gene_part = "exon" + exon
elif intron != "" and exon == "":
gene_part = "intron" + intron
else:
gene_part = "exon" + exon + "-intron" + intron

biotype = j[vep_format["BIOTYPE"]]
impact = j[vep_format["IMPACT"]]

# Format transcript information with exon number, HGVSc, and HGVSp
tran_exon_hgvsc_p = "{0}:{1}:{2}:{3}".format(transcript, gene_part, hgvsc, hgvsp)

if biotype == "protein_coding" and impact in ["HIGH", "MODERATE"]:
if gene_name in gene_consequence_hgvsc_ds:
if consequence in gene_consequence_hgvsc_ds[gene_name]:
gene_consequence_hgvsc_ds[gene_name][consequence].append(tran_exon_hgvsc_p)
else:
gene_consequence_hgvsc_ds[gene_name][consequence] = [tran_exon_hgvsc_p]
else:
gene_consequence_hgvsc_ds[gene_name] = {consequence: [tran_exon_hgvsc_p]}
return gene_consequence_hgvsc_ds

def format_transcript_info(line, gene_consequence_hgvsc_ds):
"""
Formats the transcript information based on the gene, consequence, and HGVSc data.

Args:
line (str): The input line to be formatted.
gene_consequence_hgvsc_ds (dict): A dictionary containing gene, consequence, and HGVSc data.

Returns:
str: The formatted line with transcript information.

"""
if len(gene_consequence_hgvsc_ds) > 0:
transcript_info = ""
for gene in gene_consequence_hgvsc_ds:
for consequence in gene_consequence_hgvsc_ds[gene]:
transcript_info += "{0}|{1}({2});".format(gene, consequence, ','.join(gene_consequence_hgvsc_ds[gene][consequence]))
line = line + "\t" + transcript_info + "\n"
else:
line = line + "\t.\n"
return line


if __name__ == "__main__":

# Parse VEP annotations and write to STDOUT
parse_vep_annotations()
14 changes: 10 additions & 4 deletions resources/configurationFiles/analysisIndelCalling.xml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,12 @@

-->

<cvalue name='VEP_VERSION' value='110.1' type='string' />
<cvalue name='VEP_SW_PATH' value='/software/vep/${VEP_VERSION}/vep' type='path' />
<cvalue name='VEP_FORKS' value='3' type='integer'/>
<cvalue name='VEP_FA_INDEX' value='${assembliesHG191000GenomesDirectory}/tools_data/germlineSmVs/VEP/reference/hs37d5_PhiX.fa' type ='path' />
<cvalue name='VEP_CACHE_BASE' value='/omics/odcf/reference_data/by-tool/vep' type='path' />

<configurationValueBundle name='PIPE_CONFIG:INDEL_RELIABILITY'>
<cvalue name='MAPABILITY' value='"${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz:::--breakPointMode --aEndOffset=1"' type="path"/>
<cvalue name='HISEQDEPTH' value='${hg19DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/>
Expand Down Expand Up @@ -270,10 +276,10 @@
</tool>
<tool name="indelVcfFilter" value="filter_vcf.sh" basepath="indelCallingWorkflow">
<resourcesets>
<rset size="s" memory="1" cores="1" nodes="1" walltime="1"/>
<rset size="m" memory="1" cores="1" nodes="1" walltime="2"/>
<rset size="l" memory="1" cores="1" nodes="1" walltime="10"/>
<rset size="xl" memory="1" cores="1" nodes="1" walltime="72"/>
<rset size="s" memory="2" cores="3" nodes="1" walltime="1"/>
<rset size="m" memory="3" cores="3" nodes="1" walltime="2"/>
<rset size="l" memory="3" cores="3" nodes="1" walltime="10"/>
<rset size="xl" memory="4" cores="3" nodes="1" walltime="72"/>
</resourcesets>
<input type="file" typeof="TumorBamFile" scriptparameter="FILENAME_TUMOR_BAM"/>
<input type="file" typeof="ControlBamFile" scriptparameter="FILENAME_CONTROL_BAM"/>
Expand Down