
Commit ee311d9

Merge pull request #221 from jonasscheid/depricate_gsvar_and_variant_tsv
Deprecate GSvar and variant tsv input types
christopher-mohr authored Dec 11, 2023
2 parents 182fa7d + e2e3f80 commit ee311d9
Showing 11 changed files with 30 additions and 279 deletions.
10 changes: 1 addition & 9 deletions .github/workflows/ci.yml
@@ -55,15 +55,7 @@ jobs:
           - NXF_VER: ""
             NXF_EDGE: "1"
         tests:
-          [
-            "test_variant_tsv",
-            "test_grch38_variant_tsv",
-            "test_peptides",
-            "test_peptides_h2",
-            "test_proteins",
-            "test_mhcnuggets",
-            "test_mhcflurry",
-          ]
+          ["test_grch38", "test_peptides", "test_peptides_h2", "test_proteins", "test_mhcnuggets", "test_mhcflurry"]
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#219](https://github.com/nf-core/epitopeprediction/pull/219) - Fix `EXTERNAL_TOOLS_IMPORT` container registry and bump version
 
+### `Removed`
+
+- [#221](https://github.com/nf-core/epitopeprediction/pull/221) - Remove support of `GSvar` and variant `tsv` input files
+
 ## v2.2.1 - WaldhaeuserOst Hotfix - 2023-03-16
 
 ### `Fixed`
28 changes: 12 additions & 16 deletions bin/check_samplesheet.py
@@ -23,7 +23,7 @@ class RowChecker:
     """
 
-    VALID_FORMATS = (".tsv", ".fasta", ".vcf", "GSvar")
+    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz")
 
     def __init__(
         self,
@@ -138,34 +138,30 @@ def get_file_type(file):
     # check input file is empty
     # it needs to be distinguished if there's a given local file or internet address
     if str(file).startswith("http"):
-        with urllib.request.urlopen(file) as response:
-            file = response.read().decode("utf-8").split("\n")
-        if len(file) == 0:
-            raise AssertionError(f"Input file {file} is empty.")
+        # Temporarily skip checking gz files, samplesheet check will be replaced by nf-validation in the next PR
+        if not str(file).endswith("vcf.gz"):
+            with urllib.request.urlopen(file) as response:
+                file = response.read().decode("utf-8").split("\n")
+            if len(file) == 0:
+                raise AssertionError(f"Input file {file} is empty.")
     else:
         file = open(file, "r").readlines()
        if file == 0:
             raise AssertionError(f"Input file {file} is empty.")
 
     try:
-        if extension == "vcf.gz":
-            file_type = "compressed_variant"
+        if str(file).endswith("vcf.gz"):
+            file_type = "variant_compressed"
         elif extension == "vcf":
             file_type = "variant"
         elif extension == "fasta":
             file_type = "protein"
-        elif extension in ["tsv", "GSvar"]:
+        elif extension == "tsv":
             # Check if the file is a variant annotation file or a peptide file
             header_columns = [col.strip() for col in file[0].split("\t")]
 
-            required_variant_columns = ["#chr", "start", "end"]
-
-            file_type = "peptide"
-
-            if all(col in header_columns for col in required_variant_columns):
-                file_type = "variant"
-            elif "sequence" not in header_columns:
+            if "sequence" not in header_columns:
                 raise AssertionError("Peptide input file does not contain mandatory column 'sequence'")
+            file_type = "peptide"
 
     return file_type
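
To make the change concrete, here is a minimal sketch of the new `.tsv` branch (the function name and sample headers are invented for illustration): a tab-separated input is now only ever classified as a peptide file and must carry a `sequence` header column, whereas a header with `#chr`, `start`, and `end` previously marked it as a variant file.

    def classify_tsv(header_line: str) -> str:
        # Post-PR rule: a .tsv input is a peptide file; there is no variant-TSV fallback.
        header_columns = [col.strip() for col in header_line.split("\t")]
        if "sequence" not in header_columns:
            raise AssertionError("Peptide input file does not contain mandatory column 'sequence'")
        return "peptide"

    print(classify_tsv("sequence\tcounts"))  # -> peptide
    try:
        classify_tsv("#chr\tstart\tend\tref\tobs")  # pre-PR: variant TSV; now rejected
    except AssertionError as err:
        print(err)
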
175 changes: 5 additions & 170 deletions bin/epaa.py
@@ -70,26 +70,6 @@ def get_epytope_annotation(vt, p, r, alt):
     return position, reference, alternative
 
 
-def check_min_req_GSvar(row):
-    """
-    checking the presence of mandatory columns
-    :param row: dictionary of a GSvar row
-    :return: boolean, True if min req met
-    """
-    if (
-        "#chr" in row.keys()
-        and "start" in row.keys()
-        and "end" in row.keys()
-        and "ref" in row.keys()
-        and "obs" in row.keys()
-        and (
-            "coding_and_splicing_details" in row.keys() or "coding" in row.keys() or "coding_and_splicing" in row.keys()
-        )
-    ):
-        return True
-    return False
-
-
 def determine_variant_type(record, alternative):
     vt = VariationType.UNKNOWN
     if record.is_snp:
@@ -129,152 +109,6 @@ def determine_zygosity(record):
     return isHomozygous
 
 
-def read_GSvar(filename, pass_only=True):
-    """
-    reads GSvar and tsv files (tab sep files in context of genetic variants), omitting and warning about rows missing
-    mandatory columns
-    :param filename: /path/to/file
-    :return: list epytope variants
-    """
-    global ID_SYSTEM_USED
-    RE = re.compile("(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)")
-
-    # list of mandatory (meta)data
-    exclusion_list = [
-        "start",
-        "end",
-        "#chr",
-        "ref",
-        "obs",
-        "gene",
-        "tumour_genotype",
-        "coding_and_splicing_details",
-        "variant_details",
-        "variant_type",
-        "coding_and_splicing",
-    ]
-
-    list_vars = list()
-    lines = list()
-    transcript_ids = []
-    dict_vars = {}
-
-    cases = 0
-
-    with open(filename, "rt") as tsvfile:
-        tsvreader = csv.DictReader((row for row in tsvfile if not row.startswith("##")), delimiter="\t")
-        for row in tsvreader:
-            if not check_min_req_GSvar(row):
-                logger.warning("read_GSvar: Omitted row! Mandatory columns not present in: \n" + str(row) + ".")
-                continue
-            lines.append(row)
-
-    # get list of additional metadata
-    metadata_list = set(tsvreader.fieldnames) - set(exclusion_list)
-
-    for mut_id, line in enumerate(lines):
-        if "filter" in line and pass_only and line["filter"].strip():
-            continue
-        genome_start = int(line["start"]) - 1
-        genome_stop = int(line["end"]) - 1
-        chrom = line["#chr"]
-        ref = line["ref"]
-        alt = line["obs"]
-        gene = line.get("gene", "")
-
-        isHomozygous = (
-            True
-            if (
-                ("tumour_genotype" in line)
-                and (line["tumour_genotype"].split("/")[0] == line["tumour_genotype"].split("/")[1])
-            )
-            else False
-        )
-
-        # old GSvar version
-        if "coding_and_splicing_details" in line:
-            mut_type = line.get("variant_details", "")
-            annots = RE.findall(line["coding_and_splicing_details"])
-        else:
-            mut_type = line.get("variant_type", "")
-            # Gene, transcript number, type, impact, exon/intron number, HGVS.c, HGVS.p, Pfam
-            annots = RE.findall(line["coding_and_splicing"])
-        isyn = mut_type == "synonymous_variant"
-
-        """
-        Enum for variation types:
-        type.SNP, type.DEL, type.INS, type.FSDEL, type.FSINS, type.UNKNOWN
-        """
-        vt = VariationType.UNKNOWN
-        if mut_type == "missense_variant" or "missense_variant" in mut_type:
-            vt = VariationType.SNP
-        elif mut_type == "frameshift_variant":
-            if (ref == "-") or (len(ref) < len(alt)):
-                vt = VariationType.FSINS
-            else:
-                vt = VariationType.FSDEL
-        elif mut_type == "inframe_deletion":
-            vt = VariationType.DEL
-        elif mut_type == "inframe_insertion":
-            vt = VariationType.INS
-
-        coding = dict()
-
-        for annot in annots:
-            a_gene, transcript_id, a_mut_type, exon, trans_coding, trans_pos, prot_coding, prot_start = annot
-            if "NM" in transcript_id:
-                ID_SYSTEM_USED = EIdentifierTypes.REFSEQ
-            if "stop_gained" not in mut_type:
-                if not gene:
-                    gene = a_gene
-                if not mut_type:
-                    mut_type = a_mut_type
-
-                # with the latest epytope release (3.3.1), we can now handle full transcript IDs
-                coding[transcript_id] = MutationSyntax(
-                    transcript_id, int(trans_pos.split("_")[0]) - 1, int(prot_start) - 1, trans_coding, prot_coding
-                )
-                transcript_ids.append(transcript_id)
-        if coding:
-            var = Variant(
-                mut_id,
-                vt,
-                chrom.strip("chr"),
-                int(genome_start),
-                ref.upper(),
-                alt.upper(),
-                coding,
-                isHomozygous,
-                isSynonymous=isyn,
-            )
-            var.gene = gene
-
-            # metadata logging
-            for meta_name in metadata_list:
-                var.log_metadata(meta_name, line.get(meta_name, ""))
-
-            dict_vars[var] = var
-            list_vars.append(var)
-
-    transToVar = {}
-
-    # fix because of memory/timing issues due to combinatorial explosion
-    for variant in list_vars:
-        for trans_id in variant.coding.keys():
-            transToVar.setdefault(trans_id, []).append(variant)
-
-    for tId, vs in transToVar.items():
-        if len(vs) > 10:
-            cases += 1
-            for v in vs:
-                vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous)
-                vs_new.gene = v.gene
-                for m in metadata_list:
-                    vs_new.log_metadata(m, v.get_metadata(m)[0])
-                dict_vars[v] = vs_new
-    return dict_vars.values(), transcript_ids, metadata_list
-
-
 def read_vcf(filename, pass_only=True):
     """
     reads vcf files
@@ -1224,18 +1058,19 @@ def __main__():
         logger.info("Running epaa for peptides...")
         peptides, metadata = read_peptide_input(args.peptides)
     else:
-        if args.somatic_mutations.endswith(".GSvar") or args.somatic_mutations.endswith(".tsv"):
-            logger.info("Running epaa for variants...")
-            variant_list, transcripts, metadata = read_GSvar(args.somatic_mutations)
-        elif args.somatic_mutations.endswith(".vcf"):
+        logger.info("Running epaa for variants...")
+        if args.somatic_mutations.endswith(".vcf"):
             variant_list, transcripts, metadata = read_vcf(args.somatic_mutations)
+        else:
+            raise ValueError("File is not in VCF format. Please provide a VCF file.")
 
     transcripts = list(set(transcripts))
 
     # use function provided by epytope to retrieve protein IDs (different systems) for transcript IDs
     transcriptProteinTable = ma.get_protein_ids_from_transcripts(transcripts, type=ID_SYSTEM_USED)
 
     # get the alleles
+    # TODO: remove this in PR of nf-validation
     if args.alleles.startswith("http"):
         alleles = [Allele(a) for a in urllib.request.urlopen(args.alleles).read().decode("utf-8").splitlines()]
     elif args.alleles.endswith(".txt"):
7 changes: 0 additions & 7 deletions conf/modules.config
@@ -148,13 +148,6 @@ process {
         ]
     }
 
-    withName: CSVTK_SPLIT {
-        publishDir = [
-            path: { "${params.outdir}/split_input/${meta.sample}" },
-            mode: params.publish_dir_mode
-        ]
-    }
-
     withName: GET_PREDICTION_VERSIONS {
         publishDir = [
             path: { "${params.outdir}/reports" },
4 changes: 2 additions & 2 deletions conf/test_grch38.config

@@ -4,7 +4,7 @@
  * -------------------------------------------------
  * Defines bundled input files and everything required
  * to run a fast and simple test. Use as follows:
- *      nextflow run nf-core/epitopeprediction -profile test_grch38_variant_tsv,<docker/singularity> --outdir <OUTDIR>
+ *      nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>
  */
 
 params {
@@ -13,6 +13,6 @@ params {
     max_time = 48.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
     genome_version = 'GRCh38'
 }
17 changes: 0 additions & 17 deletions conf/test_variant_tsv.config

This file was deleted.

2 changes: 1 addition & 1 deletion docs/usage.md
@@ -83,7 +83,7 @@ GBM_2,alleles.txt,I,gbm_2_variants.vcf
 | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
 | `alleles` | A string that consists of the patient's alleles (separated by ";"), or a full path to an allele ".txt" file where each allele is saved on a row. |
 | `mhc_class` | Specifies the MHC class for which the prediction should be performed. Valid values are: `I`, `II` and `H-2` (mouse). |
-| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar"). |
+| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv" or "fasta"). |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

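To illustrate the tightened `filename` rules, here is a short sketch that checks made-up samplesheet rows against the new `VALID_FORMATS` tuple from the `bin/check_samplesheet.py` diff above (the row contents are hypothetical):

    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz")

    rows = [
        "GBM_1,gbm_1_alleles.txt,I,gbm_1_variants.vcf",     # still supported
        "GBM_2,gbm_2_alleles.txt,I,gbm_2_variants.vcf.gz",  # still supported
        "GBM_3,gbm_3_alleles.txt,I,gbm_3_variants.GSvar",   # dropped by this PR
    ]
    for row in rows:
        filename = row.split(",")[3]
        ok = any(filename.endswith(ext) for ext in VALID_FORMATS)
        print(f"{filename}: {'supported' if ok else 'no longer supported'}")

Note that `.tsv` remains a valid extension, but only for peptide files that contain a `sequence` column.
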
30 changes: 0 additions & 30 deletions modules/local/csvtk_split.nf

This file was deleted.

3 changes: 1 addition & 2 deletions nextflow.config
@@ -206,8 +206,7 @@ profiles {
         executor.memory = 8.GB
     }
     test { includeConfig 'conf/test.config' }
-    test_variant_tsv { includeConfig 'conf/test_variant_tsv.config' }
-    test_grch38_variant_tsv { includeConfig 'conf/test_grch38_variant_tsv.config' }
+    test_grch38 { includeConfig 'conf/test_grch38.config' }
     test_peptides { includeConfig 'conf/test_peptides.config' }
     test_peptides_h2 { includeConfig 'conf/test_peptides_h2.config' }
     test_proteins { includeConfig 'conf/test_proteins.config' }
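
With the `test_variant_tsv` and `test_grch38_variant_tsv` profiles gone, the renamed GRCh38 test is run as `nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>`, matching the header comment in conf/test_grch38.config above.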