Skip to content

Commit

Permalink
Add initial netmhciipan module with new version and parsing around it…
Browse files Browse the repository at this point in the history
…. Breaks the current main workflow structure, which needs to be adjusted in future PRs
  • Loading branch information
jonasscheid committed Dec 16, 2024
1 parent 0bf7764 commit bf6dc84
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 214 deletions.
44 changes: 37 additions & 7 deletions modules/local/merge_predictions/templates/merge_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import mhcgnomes
import pandas as pd

# Create logger object with date and time
import logging

logging.basicConfig(
Expand All @@ -18,6 +17,10 @@
datefmt="%Y-%m-%d %H:%M:%S",
)

class PredictorBindingThreshold(Enum):
    """%Rank cutoff per predictor: peptides at or below it are called binders."""
    NETMHCPAN = 2
    NETMHCIIPAN = 5

class Arguments:
"""
Parses the arguments, including the ones coming from $task.ext.args.
Expand Down Expand Up @@ -83,10 +86,9 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
return yaml_str

class PredictionResult:
def __init__(self, file_path, alleles, threshold=2):
def __init__(self, file_path, alleles):
self.file_path = file_path
self.alleles = alleles
self.threshold = threshold
self.predictor = None
self.prediction_df = self._format_prediction_result()

Expand Down Expand Up @@ -120,7 +122,7 @@ def _format_mhcnuggets_prediction(self):
pass

def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Map with allele index to allele name
# Map with allele index to allele name. NetMHCpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
Expand All @@ -142,20 +144,48 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['EL_Rank'] <= self.threshold
df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCPAN.value
df_pivot['predictor'] = 'netmhcpan'
df_pivot.index.name = ''

return df_pivot

def _format_netmhciipan_prediction(self, threshold=None):
pass
def _format_netmhciipan_prediction(self) -> pd.DataFrame:
    """Parse a NetMHCIIpan xls-style output file into a long-format DataFrame.

    Returns one row per (sequence, allele) pair carrying the rank metrics,
    a boolean 'binder' call and a 'predictor' label.
    """
    # Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically,
    # so self.alleles is assumed to arrive in the same order -- TODO confirm against caller
    alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
    # Read the file into a DataFrame with no headers initially
    # (first line is an extra header row, hence skiprows=1)
    df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
    # Keep only the peptide column and the per-allele rank columns
    df = df[df.columns[df.columns.str.contains('Peptide|Rank|Rank_BA')]]
    # TODO: Naming needs to be harmonized down the line once all predictors are implemented
    # Give the first allele's columns an explicit '.0' suffix so melting is uniform
    df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Rank_BA':'Rank_BA.0'})
    # to longformat based on .0|1|2..
    df_long = pd.melt(
        df,
        id_vars=["sequence"],
        value_vars=[col for col in df.columns if col != "sequence"],
        var_name="metric",
        value_name="value",
    )
    # Extract the allele information (e.g., .0, .1, etc.)
    df_long["allele"] = df_long["metric"].str.split('.').str[1]
    df_long["metric"] = df_long["metric"].str.split('.').str[0]

    # Pivot table to organize columns properly
    df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
    # Translate the numeric allele index back to the real allele name
    df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
    # Binder call: %Rank at or below the class II threshold (5)
    df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
    df_pivot['predictor'] = 'netmhciipan'
    df_pivot.index.name = ''

    return df_pivot

def main():
args = Arguments()

for file in args.input:
result = PredictionResult(file, args.alleles)

logging.info(f"Writing {len(result.prediction_df)} {result.predictor} predictions to file..")
result.prediction_df.to_csv(f"{args.prefix}_{result.predictor}.tsv", sep="\t", index=False)

# Parse versions
Expand Down
26 changes: 22 additions & 4 deletions modules/local/netmhciipan.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,46 @@ process NETMHCIIPAN {
tuple val(meta), path(peptide_file), path(software)

output:
tuple val(meta), path("*.tsv"), emit: predicted
tuple val(meta), path("*.xls"), emit: predicted
path "versions.yml", emit: versions

script:
if (meta.mhc_class != "II") {
error "NETMHCIIPAN only supports MHC class II. Use NETMHCPAN for MHC class I."
error "NETMHCIIPAN only supports MHC class II. Use NETMHCPAN for MHC class I."
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
// Adjust for netMHCIIpan allele format
def alleles = meta.alleles.tokenize(';').collect {
it.contains('DRB') ?
it.replace('*', '_').replace(':', '') :
('HLA-' + it.replace('*', '').replace(':', ''))
}.join(',')

"""
netmhciipan/netMHCIIpan \
-f $peptide_file \
-inptype 1 \
-a $alleles \
-xls \
-xlsfile ${prefix}_predicted_netmhciipan.xls \
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
"""
touch ${prefix}_predicted_netmhciipan.tsv
touch ${prefix}_predicted_netmhciipan.xls
cat <<-END_VERSIONS > versions.yml
"${task.process}":
netmhciipan \$(cat data/version | sed -s 's/ version/:/g')
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,27 @@
import argparse
import shlex
from enum import Enum
import logging

import pandas as pd


logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
class MinLength(Enum):
    """Minimum peptide length (in residues) each prediction tool accepts."""
    NETMHCPAN = 8
    NETMHCIIPAN = 8

class MaxLength(Enum):
    """Maximum peptide length (in residues) each prediction tool accepts."""
    NETMHCPAN = 14
    NETMHCIIPAN = 25

# TODO: Implement
class MaxNumberOfAlleles(Enum):
NETMHCPAN = 50
NETMHCPAN = 50
NETMHCIIPAN = 50

class Arguments:
"""
Expand Down Expand Up @@ -90,16 +98,31 @@ def main():
args = Arguments()

df = pd.read_csv(args.input, sep="\t")
len_df = len(df)
logging.info(f"Reading in file with {len(df)} peptides..")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

# Filter peptides based on tool length boundaries and adjust input format
if "netmhcpan" in args.tools and args.mhc_class == "I":
logging.info("Input for NetMHCpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
# We only need a file containing the sequences underneath each other
df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

if "netmhciipan" in args.tools and args.mhc_class == "II":
logging.info("Input for NetMHCIIpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")
else:
logging.info(f"{len(df)} peptides post-filtering will be predicted..")

# Parse versions
versions_this_module = {}
versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
Expand Down
6 changes: 3 additions & 3 deletions subworkflows/local/mhc_binding_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
return [meta, file]
}
.set{ ch_prediction_input }
ch_prediction_input.netmhcpan.view()
ch_prediction_input.netmhciipan.view()
SYFPEITHI ( ch_prediction_input.syfpeithi )
ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
Expand All @@ -67,12 +67,12 @@ workflow MHC_BINDING_PREDICTION {
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCPAN.out.predicted)
}

if ( "netmhciipan" in params.tools.tokenize(",") )
if ( "netmhciipan-4.3" in params.tools.tokenize(",") )
{
// TODO: External tools import for netmhciipan
NETMHCIIPAN_IMPORT( parse_netmhc_params("netmhciipan", "4.3") )
//TODO: Update netmhc container
NETMHCIIPAN ( ch_prediction_input.netmhcpan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
NETMHCIIPAN ( ch_prediction_input.netmhciipan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
ch_versions = ch_versions.mix(NETMHCIIPAN.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCIIPAN.out.predicted)
}
Expand Down
Loading

0 comments on commit bf6dc84

Please sign in to comment.