
Commit ee311d9

Merge pull request #221 from jonasscheid/depricate_gsvar_and_variant_tsv
Deprecate GSvar and variant tsv input types
christopher-mohr authored Dec 11, 2023
2 parents 182fa7d + e2e3f80 commit ee311d9
Showing 11 changed files with 30 additions and 279 deletions.
10 changes: 1 addition & 9 deletions .github/workflows/ci.yml
@@ -55,15 +55,7 @@ jobs:
           - NXF_VER: ""
             NXF_EDGE: "1"
         tests:
-          [
-            "test_variant_tsv",
-            "test_grch38_variant_tsv",
-            "test_peptides",
-            "test_peptides_h2",
-            "test_proteins",
-            "test_mhcnuggets",
-            "test_mhcflurry",
-          ]
+          ["test_grch38", "test_peptides", "test_peptides_h2", "test_proteins", "test_mhcnuggets", "test_mhcflurry"]
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#219](https://github.com/nf-core/epitopeprediction/pull/219) - Fix `EXTERNAL_TOOLS_IMPORT` container registry and bump version
 
+### `Removed`
+
+- [#221](https://github.com/nf-core/epitopeprediction/pull/221) - Remove support of `GSvar` and variant `tsv` input files
+
 ## v2.2.1 - WaldhaeuserOst Hotfix - 2023-03-16
 
 ### `Fixed`
28 changes: 12 additions & 16 deletions bin/check_samplesheet.py
@@ -23,7 +23,7 @@ class RowChecker:
     """
 
-    VALID_FORMATS = (".tsv", ".fasta", ".vcf", "GSvar")
+    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz")
 
     def __init__(
         self,
@@ -138,34 +138,30 @@ def get_file_type(file):
     # check input file is empty
     # it needs to be distinguished if there's a given local file or internet address
     if str(file).startswith("http"):
-        with urllib.request.urlopen(file) as response:
-            file = response.read().decode("utf-8").split("\n")
-        if len(file) == 0:
-            raise AssertionError(f"Input file {file} is empty.")
+        # Temporarily skip checking gz files, samplesheet check will be replaced by nf-validation in the next PR
+        if not str(file).endswith("vcf.gz"):
+            with urllib.request.urlopen(file) as response:
+                file = response.read().decode("utf-8").split("\n")
+            if len(file) == 0:
+                raise AssertionError(f"Input file {file} is empty.")
     else:
         file = open(file, "r").readlines()
        if file == 0:
             raise AssertionError(f"Input file {file} is empty.")
 
     try:
-        if extension == "vcf.gz":
-            file_type = "compressed_variant"
+        if str(file).endswith("vcf.gz"):
+            file_type = "variant_compressed"
         elif extension == "vcf":
             file_type = "variant"
         elif extension == "fasta":
             file_type = "protein"
-        elif extension in ["tsv", "GSvar"]:
+        elif extension == "tsv":
             # Check if the file is a variant annotation file or a peptide file
             header_columns = [col.strip() for col in file[0].split("\t")]
 
-            required_variant_columns = ["#chr", "start", "end"]
-
-            file_type = "peptide"
-
-            if all(col in header_columns for col in required_variant_columns):
-                file_type = "variant"
-            elif "sequence" not in header_columns:
+            if "sequence" not in header_columns:
                 raise AssertionError("Peptide input file does not contain mandatory column 'sequence'")
+            file_type = "peptide"
 
     return file_type
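
To make the change concrete, here is a minimal sketch of the new `.tsv` branch (the function name and sample headers are invented for illustration): a tab-separated input is now only ever classified as a peptide file and must carry a `sequence` header column, whereas a header with `#chr`, `start`, and `end` previously marked it as a variant file.

    def classify_tsv(header_line: str) -> str:
        # Post-PR rule: a .tsv input is a peptide file; there is no variant-TSV fallback.
        header_columns = [col.strip() for col in header_line.split("\t")]
        if "sequence" not in header_columns:
            raise AssertionError("Peptide input file does not contain mandatory column 'sequence'")
        return "peptide"

    print(classify_tsv("sequence\tcounts"))  # -> peptide
    try:
        classify_tsv("#chr\tstart\tend\tref\tobs")  # pre-PR: variant TSV; now rejected
    except AssertionError as err:
        print(err)
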
175 changes: 5 additions & 170 deletions bin/epaa.py
@@ -70,26 +70,6 @@ def get_epytope_annotation(vt, p, r, alt):
     return position, reference, alternative
 
 
-def check_min_req_GSvar(row):
-    """
-    checking the presence of mandatory columns
-    :param row: dictionary of a GSvar row
-    :return: boolean, True if min req met
-    """
-    if (
-        "#chr" in row.keys()
-        and "start" in row.keys()
-        and "end" in row.keys()
-        and "ref" in row.keys()
-        and "obs" in row.keys()
-        and (
-            "coding_and_splicing_details" in row.keys() or "coding" in row.keys() or "coding_and_splicing" in row.keys()
-        )
-    ):
-        return True
-    return False
-
-
 def determine_variant_type(record, alternative):
     vt = VariationType.UNKNOWN
     if record.is_snp:
@@ -129,152 +109,6 @@ def determine_zygosity(record):
     return isHomozygous
 
 
-def read_GSvar(filename, pass_only=True):
-    """
-    reads GSvar and tsv files (tab sep files in context of genetic variants), omitting and warning about rows missing
-    mandatory columns
-    :param filename: /path/to/file
-    :return: list epytope variants
-    """
-    global ID_SYSTEM_USED
-    RE = re.compile("(\w+):([\w.]+):([&\w]+):\w*:exon(\d+)\D*\d*:(c.\D*([_\d]+)\D*):(p.\D*(\d+)\w*)")
-
-    # list of mandatory (meta)data
-    exclusion_list = [
-        "start",
-        "end",
-        "#chr",
-        "ref",
-        "obs",
-        "gene",
-        "tumour_genotype",
-        "coding_and_splicing_details",
-        "variant_details",
-        "variant_type",
-        "coding_and_splicing",
-    ]
-
-    list_vars = list()
-    lines = list()
-    transcript_ids = []
-    dict_vars = {}
-
-    cases = 0
-
-    with open(filename, "rt") as tsvfile:
-        tsvreader = csv.DictReader((row for row in tsvfile if not row.startswith("##")), delimiter="\t")
-        for row in tsvreader:
-            if not check_min_req_GSvar(row):
-                logger.warning("read_GSvar: Omitted row! Mandatory columns not present in: \n" + str(row) + ".")
-                continue
-            lines.append(row)
-
-    # get list of additional metadata
-    metadata_list = set(tsvreader.fieldnames) - set(exclusion_list)
-
-    for mut_id, line in enumerate(lines):
-        if "filter" in line and pass_only and line["filter"].strip():
-            continue
-        genome_start = int(line["start"]) - 1
-        genome_stop = int(line["end"]) - 1
-        chrom = line["#chr"]
-        ref = line["ref"]
-        alt = line["obs"]
-        gene = line.get("gene", "")
-
-        isHomozygous = (
-            True
-            if (
-                ("tumour_genotype" in line)
-                and (line["tumour_genotype"].split("/")[0] == line["tumour_genotype"].split("/")[1])
-            )
-            else False
-        )
-
-        # old GSvar version
-        if "coding_and_splicing_details" in line:
-            mut_type = line.get("variant_details", "")
-            annots = RE.findall(line["coding_and_splicing_details"])
-        else:
-            mut_type = line.get("variant_type", "")
-            # Gene, transcript number, type, impact, exon/intron number, HGVS.c, HGVS.p, Pfam
-            annots = RE.findall(line["coding_and_splicing"])
-        isyn = mut_type == "synonymous_variant"
-
-        """
-        Enum for variation types:
-        type.SNP, type.DEL, type.INS, type.FSDEL, type.FSINS, type.UNKNOWN
-        """
-        vt = VariationType.UNKNOWN
-        if mut_type == "missense_variant" or "missense_variant" in mut_type:
-            vt = VariationType.SNP
-        elif mut_type == "frameshift_variant":
-            if (ref == "-") or (len(ref) < len(alt)):
-                vt = VariationType.FSINS
-            else:
-                vt = VariationType.FSDEL
-        elif mut_type == "inframe_deletion":
-            vt = VariationType.DEL
-        elif mut_type == "inframe_insertion":
-            vt = VariationType.INS
-
-        coding = dict()
-
-        for annot in annots:
-            a_gene, transcript_id, a_mut_type, exon, trans_coding, trans_pos, prot_coding, prot_start = annot
-            if "NM" in transcript_id:
-                ID_SYSTEM_USED = EIdentifierTypes.REFSEQ
-            if "stop_gained" not in mut_type:
-                if not gene:
-                    gene = a_gene
-                if not mut_type:
-                    mut_type = a_mut_type
-
-                # with the latest epytope release (3.3.1), we can now handle full transcript IDs
-                coding[transcript_id] = MutationSyntax(
-                    transcript_id, int(trans_pos.split("_")[0]) - 1, int(prot_start) - 1, trans_coding, prot_coding
-                )
-                transcript_ids.append(transcript_id)
-        if coding:
-            var = Variant(
-                mut_id,
-                vt,
-                chrom.strip("chr"),
-                int(genome_start),
-                ref.upper(),
-                alt.upper(),
-                coding,
-                isHomozygous,
-                isSynonymous=isyn,
-            )
-            var.gene = gene
-
-            # metadata logging
-            for meta_name in metadata_list:
-                var.log_metadata(meta_name, line.get(meta_name, ""))
-
-            dict_vars[var] = var
-            list_vars.append(var)
-
-    transToVar = {}
-
-    # fix because of memory/timing issues due to combinatorial explosion
-    for variant in list_vars:
-        for trans_id in variant.coding.keys():
-            transToVar.setdefault(trans_id, []).append(variant)
-
-    for tId, vs in transToVar.items():
-        if len(vs) > 10:
-            cases += 1
-            for v in vs:
-                vs_new = Variant(v.id, v.type, v.chrom, v.genomePos, v.ref, v.obs, v.coding, True, v.isSynonymous)
-                vs_new.gene = v.gene
-                for m in metadata_list:
-                    vs_new.log_metadata(m, v.get_metadata(m)[0])
-                dict_vars[v] = vs_new
-    return dict_vars.values(), transcript_ids, metadata_list
-
-
 def read_vcf(filename, pass_only=True):
     """
     reads vcf files
@@ -1224,18 +1058,19 @@ def __main__():
         logger.info("Running epaa for peptides...")
         peptides, metadata = read_peptide_input(args.peptides)
     else:
-        if args.somatic_mutations.endswith(".GSvar") or args.somatic_mutations.endswith(".tsv"):
-            logger.info("Running epaa for variants...")
-            variant_list, transcripts, metadata = read_GSvar(args.somatic_mutations)
-        elif args.somatic_mutations.endswith(".vcf"):
+        logger.info("Running epaa for variants...")
+        if args.somatic_mutations.endswith(".vcf"):
             variant_list, transcripts, metadata = read_vcf(args.somatic_mutations)
+        else:
+            raise ValueError("File is not in VCF format. Please provide a VCF file.")
 
     transcripts = list(set(transcripts))
 
     # use function provided by epytope to retrieve protein IDs (different systems) for transcript IDs
     transcriptProteinTable = ma.get_protein_ids_from_transcripts(transcripts, type=ID_SYSTEM_USED)
 
     # get the alleles
+    # TODO: remove this in PR of nf-validation
     if args.alleles.startswith("http"):
         alleles = [Allele(a) for a in urllib.request.urlopen(args.alleles).read().decode("utf-8").splitlines()]
     elif args.alleles.endswith(".txt"):
7 changes: 0 additions & 7 deletions conf/modules.config
@@ -148,13 +148,6 @@ process {
         ]
     }
 
-    withName: CSVTK_SPLIT {
-        publishDir = [
-            path: { "${params.outdir}/split_input/${meta.sample}" },
-            mode: params.publish_dir_mode
-        ]
-    }
-
     withName: GET_PREDICTION_VERSIONS {
         publishDir = [
             path: { "${params.outdir}/reports" },
4 changes: 2 additions & 2 deletions conf/test_grch38.config

@@ -4,7 +4,7 @@
  * -------------------------------------------------
  * Defines bundled input files and everything required
  * to run a fast and simple test. Use as follows:
- *      nextflow run nf-core/epitopeprediction -profile test_grch38_variant_tsv,<docker/singularity> --outdir <OUTDIR>
+ *      nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>
  */
 
 params {
@@ -13,6 +13,6 @@ params {
     max_time = 48.h
 
     // Input data
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants_tab.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
     genome_version = 'GRCh38'
 }
17 changes: 0 additions & 17 deletions conf/test_variant_tsv.config

This file was deleted.

2 changes: 1 addition & 1 deletion docs/usage.md
@@ -83,7 +83,7 @@ GBM_2,alleles.txt,I,gbm_2_variants.vcf
 | `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
 | `alleles` | A string that consists of the patient's alleles (separated by ";"), or a full path to an allele ".txt" file where each allele is saved on a row. |
 | `mhc_class` | Specifies the MHC class for which the prediction should be performed. Valid values are: `I`, `II` and `H-2` (mouse). |
-| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv", "fasta", or "GSvar"). |
+| `filename` | Full path to a variant/peptide or protein file (".vcf", ".vcf.gz", "tsv" or "fasta"). |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

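To illustrate the tightened `filename` rules, here is a short sketch that checks made-up samplesheet rows against the new `VALID_FORMATS` tuple from the `bin/check_samplesheet.py` diff above (the row contents are hypothetical):

    VALID_FORMATS = (".tsv", ".fasta", ".vcf", ".vcf.gz")

    rows = [
        "GBM_1,gbm_1_alleles.txt,I,gbm_1_variants.vcf",     # still supported
        "GBM_2,gbm_2_alleles.txt,I,gbm_2_variants.vcf.gz",  # still supported
        "GBM_3,gbm_3_alleles.txt,I,gbm_3_variants.GSvar",   # dropped by this PR
    ]
    for row in rows:
        filename = row.split(",")[3]
        ok = any(filename.endswith(ext) for ext in VALID_FORMATS)
        print(f"{filename}: {'supported' if ok else 'no longer supported'}")

Note that `.tsv` remains a valid extension, but only for peptide files that contain a `sequence` column.
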
30 changes: 0 additions & 30 deletions modules/local/csvtk_split.nf

This file was deleted.

3 changes: 1 addition & 2 deletions nextflow.config
@@ -206,8 +206,7 @@ profiles {
         executor.memory = 8.GB
     }
     test { includeConfig 'conf/test.config' }
-    test_variant_tsv { includeConfig 'conf/test_variant_tsv.config' }
-    test_grch38_variant_tsv { includeConfig 'conf/test_grch38_variant_tsv.config' }
+    test_grch38 { includeConfig 'conf/test_grch38.config' }
     test_peptides { includeConfig 'conf/test_peptides.config' }
     test_peptides_h2 { includeConfig 'conf/test_peptides_h2.config' }
     test_proteins { includeConfig 'conf/test_proteins.config' }
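
With the `test_variant_tsv` and `test_grch38_variant_tsv` profiles gone, the renamed GRCh38 test is run as `nextflow run nf-core/epitopeprediction -profile test_grch38,<docker/singularity> --outdir <OUTDIR>`, matching the header comment in conf/test_grch38.config above.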