From 1083bae3acc7b23f80812ec8e44d251c6274d12f Mon Sep 17 00:00:00 2001
From: jonasscheid <jonas.scheid@uni-tuebingen.de>
Date: Mon, 16 Dec 2024 21:04:18 +0000
Subject: [PATCH] parse in and output of mhcflurry

---
 .../templates/prepare_prediction_input.py     | 47 +++++++++++--------
 subworkflows/local/mhc_binding_prediction.nf  |  2 +-
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py
index 894b9b1..9748e86 100644
--- a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py
+++ b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py
@@ -13,10 +13,12 @@
     datefmt="%Y-%m-%d %H:%M:%S",
 )
 class MinLength(Enum):
+    MHCFLURRY = 5  # lower bound of the peptide-length filter applied to MHCflurry input
     NETMHCPAN = 8
     NETMHCIIPAN = 8
 
 class MaxLength(Enum):
+    MHCFLURRY = 15  # upper bound of the peptide-length filter applied to MHCflurry input
     NETMHCPAN = 14
     NETMHCIIPAN = 25
 
@@ -27,13 +29,14 @@ class MaxNumberOfAlleles(Enum):
 
 class Arguments:
     """
-    Parses the argments, including the ones coming from $task.ext.args.
+    Parses the arguments, including the ones coming from $task.ext.args.
     """
 
     def __init__(self) -> None:
         self.input = "$tsv"
         self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
         self.mhc_class = "$meta.mhc_class"
+        self.alleles = "$meta.alleles".split(";")
         self.tools = "$params.tools"
         self.min_peptide_length_classI = int("$params.min_peptide_length_classI")
         self.max_peptide_length_classI = int("$params.max_peptide_length_classI")
@@ -97,31 +100,37 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
 def main():
     args = Arguments()
 
-    df = pd.read_csv(args.input, sep="\t")
-    len_df = len(df)
-    logging.info(f"Reading in file with {len(df)} peptides..")
+    df_input = pd.read_csv(args.input, sep="\t")
+    logging.info(f"Reading in file with {len(df_input)} peptides..")
 
     # Filter peptides based on user-defined length
     if args.mhc_class == "I":
-        df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
+        df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
     else:
-        df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]
-
-	# Filter peptides based on tool length boundaries and adjust input format
-    if "netmhcpan" in args.tools and args.mhc_class == "I":
-        logging.info("Input for NetMHCpan detected. Parsing input..")
-        df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
-        df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)
-
-    if "netmhciipan" in args.tools and args.mhc_class == "II":
-        logging.info("Input for NetMHCIIpan detected. Parsing input..")
-        df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
-        df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)
+        df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]
 
     if len(df) == 0:
         raise ValueError("No peptides left after applying length filters! Aborting..")
-    else:
-        logging.info(f"{len(df)} peptides post-filtering will be predicted..")
+
+    # Filter peptides based on tool length boundaries and adjust input format
+    if "mhcflurry" in args.tools and args.mhc_class == "I":
+        # Copy the filtered slice: a column is added below and must not be written into a view of df
+        df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)].copy()
+        logging.info(f"Input for MHCflurry detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
+        # Pair every peptide with every allele and write them to csv with columns peptide and allele
+        df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry)
+        df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True).rename(columns={"sequence": "peptide"})
+        df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False)
+
+    if "netmhcpan" in args.tools and args.mhc_class == "I":
+        df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
+        logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..")
+        df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)
+
+    elif "netmhciipan" in args.tools and args.mhc_class == "II":
+        df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
+        logging.info(f"Input for NetMHCIIpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
+        df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)
 
     # Parse versions
     versions_this_module = {}
diff --git a/subworkflows/local/mhc_binding_prediction.nf b/subworkflows/local/mhc_binding_prediction.nf
index 69dd51c..cdcae8f 100755
--- a/subworkflows/local/mhc_binding_prediction.nf
+++ b/subworkflows/local/mhc_binding_prediction.nf
@@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
                         return [meta, file]
                     }
             .set{ ch_prediction_input }
-        ch_prediction_input.netmhciipan.view()
+        ch_prediction_input.mhcflurry.view()
         SYFPEITHI ( ch_prediction_input.syfpeithi )
         ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
         ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)