DKFZ-ODCF · NagaComBio · May 31, 2024
diff --git a/resources/analysisTools/indelCallingWorkflow/annotate_with_VEP.sh b/resources/analysisTools/indelCallingWorkflow/annotate_with_VEP.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+## Annotate the variants with VEP
+
+## To run
+## LOCAL: sh annotate_variant_with_VEP.sh snvs_{pid}_somatic_snvs_conf_8_to_10.vcf snvs_{pid}_somatic_snvs_conf_8_to_10.VEP.vcf
+
+vep_species="homo_sapiens"
+vep_assembly="GRCh37"
+vep_out_format="vcf"
+
+input_vcf=${1}
+output_vcf=${2}
+threads=${VEP_FORKS}
+
+## Annotate the high confidence variants
+## Parse for the functional consequences
+${PERL_BINARY} ${VEP_SW_PATH} \
+  --input_file $input_vcf \
+  --species $vep_species \
+  --assembly $vep_assembly \
+  --output_file STDOUT \
+  --format $vep_out_format \
+  --fork $threads \
+  --fasta ${VEP_FA_INDEX} \
+  --everything \
+  --vcf \
+  --cache \
+  --offline \
+  --force_overwrite \
+  --no_stats \
+  --dir_cache ${VEP_CACHE_BASE} \
+  | ${PYTHON_BINARY} ${TOOL_PARSE_VEP} | ${BGZIP_BINARY} -f > $output_vcf
diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh
@@ -22,6 +22,16 @@
 source ${TOOL_ANALYZE_BAM_HEADER}
 getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME
 
+########################################## VEP annotation #######################################
+## Run VEP on the somatic high confidence SNVs
+${TOOL_ANNOTATE_VEP} ${FILENAME_VCF} ${FILENAME_VCF}.tmp
+[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 8
+
+# Overwrite the original VCF file with the VEP annotated one
+# NOTE: If there is an error here, one has to rerun the Annotation and DeepAnnotation steps
+mv ${FILENAME_VCF}.tmp ${FILENAME_VCF} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF}
+[[ "$?" != 0 ]] && echo "There was a non-zero exit code in VEP annotation" && exit 9
+
 ########################################## Filter ###############################################
 
 outputFilenamePrefix=${FILENAME_VCF%.vcf.gz}

diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh
@@ -49,7 +49,7 @@ ${PLATYPUS_BINARY} callVariants \
 	--verbosity=1 \
 	--bufferSize=${PLATYPUS_BUFFER_SIZE} \
 	--maxReads=${PLATYPUS_MAX_READS} \
-        --minFlank=0 \
+	--minFlank=0 \
 	${PLATYPUS_PARAMS}
 
 [[ $? -gt 0 ]] && echo "Error during platypus indel calling." && exit 1

diff --git a/resources/analysisTools/indelCallingWorkflow/parse_VEP_annotations.py b/resources/analysisTools/indelCallingWorkflow/parse_VEP_annotations.py
@@ -0,0 +1,147 @@
+"""
+parse_VEP_annotations.py
+
+This script parses VEP annotations from the input and writes the parsed output to STDOUT.
+It contains functions to parse VEP format, gene consequences and HGVSc annotations, and format transcript information.
+
+Usage:
+    cat VEP_annotated.vcf | python parse_VEP_annotations.py > VEP_annotated_parsed.vcf
+
+"""
+
+import sys
+
+def parse_vep_annotations():
+    """
+    Parses VEP annotations from the input and writes the parsed output to STDOUT.
+
+    This function reads input from `sys.stdin` line by line and processes each line.
+    If a line starts with "#" and contains "##INFO=<ID=CSQ", it extracts the VEP format.
+    If a line starts with "#CHROM", it appends the header for the VEP gene consequence and HGVSc columns.
+    For all other lines, it splits the line by tab and extracts the info field.
+    The info field is then split by semicolon and parsed to extract gene consequence and HGVSc information.
+    Finally, the transcript information is formatted and written to STDOUT.
+
+    Args:
+        None
+
+    Returns:
+        None
+    """
+    vep_format = {}
+
+    for line in sys.stdin:
+        line = line.strip()
+        gene_consequence_hgvsc_ds = {}
+
+        if line.startswith("#"):
+            if line.startswith("##INFO=<ID=CSQ"):
+                vep_format = parse_vep_format(line)
+            if line.startswith("#CHROM"):
+                line = line + "\tVEP_TRANSCRIPTS"
+            # write to STDOUT
+            sys.stdout.write("{0}\n".format(line))
+        else:
+            variant_infos = line.split("\t")
+            info = variant_infos[7]
+            info = info.split(";")
+            gene_consequence_hgvsc_ds = parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds)
+            line = format_transcript_info(line, gene_consequence_hgvsc_ds)
+
+            # write to STDOUT
+            sys.stdout.write(line)
+
+def parse_vep_format(line):
+    """
+    Parses the VEP format line and returns a dictionary mapping each field to its index.
+
+    Args:
+        line (str): The VEP format line.
+
+    Returns:
+        dict: A dictionary mapping each field to its index.
+    """
+    vep_format = {}
+    vep_format_line = line.split("Format: ")[1]
+    vep_format_line = vep_format_line.split("|")
+    for i, field in enumerate(vep_format_line):
+        vep_format[field] = i
+    return vep_format
+
+def parse_gene_consequence_hgvsc(info, vep_format, gene_consequence_hgvsc_ds):
+    """
+    Parses gene consequences and HGVSc annotations from VEP annotations.
+
+    Args:
+        info (list): List of VEP annotations.
+        vep_format (dict): Dictionary mapping VEP annotation fields to their indices.
+        gene_consequence_hgvsc_ds (dict): Dictionary to store gene consequences and HGVSc annotations.
+
+    Returns:
+        dict: Updated gene_consequence_hgvsc_ds dictionary with gene consequences and HGVSc annotations.
+
+    """
+    for i in info:
+        if i.startswith("CSQ="):
+            i = i.replace("CSQ=", "")
+            i = i.split(",")
+            for j in i:
+                j = j.split("|")
+                gene_name = j[vep_format["SYMBOL"]]
+                consequence = j[vep_format["Consequence"]]
+                transcript = j[vep_format["Feature"]]
+                hgvsc = j[vep_format["HGVSc"]].split(":")[1] if j[vep_format["HGVSc"]] != "" else ""
+                hgvsp = j[vep_format["HGVSp"]].split(":")[1] if j[vep_format["HGVSp"]] != "" else ""
+                exon = j[vep_format["EXON"]].split("/")[0] if j[vep_format["EXON"]] != "" else ""
+                intron = j[vep_format["INTRON"]].split("/")[0] if j[vep_format["INTRON"]] != "" else ""
+
+                if exon != "" and intron == "":
+                    gene_part = "exon" + exon
+                elif intron != "" and exon == "":
+                    gene_part = "intron" + intron
+                else:
+                    gene_part = "exon" + exon + "-intron" + intron
+
+                biotype = j[vep_format["BIOTYPE"]]
+                impact = j[vep_format["IMPACT"]]
+
+                # Format transcript information with exon number, HGVSc, and HGVSp
+                tran_exon_hgvsc_p = "{0}:{1}:{2}:{3}".format(transcript, gene_part, hgvsc, hgvsp)
+
+                if biotype == "protein_coding" and impact in ["HIGH", "MODERATE"]:
+                    if gene_name in gene_consequence_hgvsc_ds:
+                        if consequence in gene_consequence_hgvsc_ds[gene_name]:
+                            gene_consequence_hgvsc_ds[gene_name][consequence].append(tran_exon_hgvsc_p)
+                        else:
+                            gene_consequence_hgvsc_ds[gene_name][consequence] = [tran_exon_hgvsc_p]
+                    else:
+                        gene_consequence_hgvsc_ds[gene_name] = {consequence: [tran_exon_hgvsc_p]}
+    return gene_consequence_hgvsc_ds
+
+def format_transcript_info(line, gene_consequence_hgvsc_ds):
+    """
+    Formats the transcript information based on the gene, consequence, and HGVSc data.
+
+    Args:
+        line (str): The input line to be formatted.
+        gene_consequence_hgvsc_ds (dict): A dictionary containing gene, consequence, and HGVSc data.
+
+    Returns:
+        str: The formatted line with transcript information.
+
+    """
+    if len(gene_consequence_hgvsc_ds) > 0:
+        transcript_info = ""
+        for gene in gene_consequence_hgvsc_ds:
+            for consequence in gene_consequence_hgvsc_ds[gene]:
+                transcript_info += "{0}|{1}({2});".format(gene, consequence, ','.join(gene_consequence_hgvsc_ds[gene][consequence]))
+        line = line + "\t" + transcript_info + "\n"
+    else:
+        line = line + "\t.\n"
+    return line
+
+
+if __name__ == "__main__":
+
+    # Parse VEP annotations and write to STDOUT
+    parse_vep_annotations()
diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml
@@ -160,6 +160,12 @@
 
              -->
 
+        <cvalue name='VEP_VERSION' value='110.1' type='string' />
+        <cvalue name='VEP_SW_PATH' value='/software/vep/${VEP_VERSION}/vep' type='path' />
+        <cvalue name='VEP_FORKS' value='3' type='integer'/>
+        <cvalue name='VEP_FA_INDEX' value='${assembliesHG191000GenomesDirectory}/tools_data/germlineSmVs/VEP/reference/hs37d5_PhiX.fa' type ='path' />
+        <cvalue name='VEP_CACHE_BASE' value='/omics/odcf/reference_data/by-tool/vep' type='path' />
+
         <configurationValueBundle name='PIPE_CONFIG:INDEL_RELIABILITY'>
             <cvalue name='MAPABILITY' value='"${hg19DatabaseUCSCDirectory}/wgEncodeCrgMapabilityAlign100mer_chr.bedGraph.gz:::--breakPointMode --aEndOffset=1"' type="path"/>
             <cvalue name='HISEQDEPTH' value='${hg19DatabaseUCSCDirectory}/HiSeqDepthTop10Pct_chr.bed.gz' type="path"/>
@@ -270,10 +276,10 @@
         </tool>
         <tool name="indelVcfFilter" value="filter_vcf.sh" basepath="indelCallingWorkflow">
             <resourcesets>
-                <rset size="s" memory="1" cores="1" nodes="1" walltime="1"/>
-                <rset size="m" memory="1" cores="1" nodes="1" walltime="2"/>
-                <rset size="l" memory="1" cores="1" nodes="1" walltime="10"/>
-                <rset size="xl" memory="1" cores="1" nodes="1" walltime="72"/>
+                <rset size="s" memory="2" cores="3" nodes="1" walltime="1"/>
+                <rset size="m" memory="3" cores="3" nodes="1" walltime="2"/>
+                <rset size="l" memory="3" cores="3" nodes="1" walltime="10"/>
+                <rset size="xl" memory="4" cores="3" nodes="1" walltime="72"/>
             </resourcesets>
             <input type="file" typeof="TumorBamFile" scriptparameter="FILENAME_TUMOR_BAM"/>
             <input type="file" typeof="ControlBamFile" scriptparameter="FILENAME_CONTROL_BAM"/>