Skip to content

Commit

Permalink
Add initial netmhciipan module with new version and parsing around it…
Browse files Browse the repository at this point in the history
…. Breaks the current main workflow structure, which needs to be adjusted in future PRs
  • Loading branch information
jonasscheid committed Dec 16, 2024
1 parent 0bf7764 commit bf6dc84
Show file tree
Hide file tree
Showing 5 changed files with 246 additions and 214 deletions.
44 changes: 37 additions & 7 deletions modules/local/merge_predictions/templates/merge_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import mhcgnomes
import pandas as pd

# Create logger object with date and time
import logging

logging.basicConfig(
Expand All @@ -18,6 +17,10 @@
datefmt="%Y-%m-%d %H:%M:%S",
)

class PredictorBindingThreshold(Enum):
    """%Rank cutoff per predictor: peptides at or below it are called binders."""
    NETMHCPAN = 2
    NETMHCIIPAN = 5

class Arguments:
"""
Parses the arguments, including the ones coming from $task.ext.args.
Expand Down Expand Up @@ -83,10 +86,9 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
return yaml_str

class PredictionResult:
def __init__(self, file_path, alleles, threshold=2):
def __init__(self, file_path, alleles):
self.file_path = file_path
self.alleles = alleles
self.threshold = threshold
self.predictor = None
self.prediction_df = self._format_prediction_result()

Expand Down Expand Up @@ -120,7 +122,7 @@ def _format_mhcnuggets_prediction(self):
pass

def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Map with allele index to allele name
# Map with allele index to allele name. NetMHCpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
Expand All @@ -142,20 +144,48 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['EL_Rank'] <= self.threshold
df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCPAN.value
df_pivot['predictor'] = 'netmhcpan'
df_pivot.index.name = ''

return df_pivot

def _format_netmhciipan_prediction(self, threshold=None):
pass
def _format_netmhciipan_prediction(self) -> pd.DataFrame:
    """Parse a NetMHCIIpan xls-style output file into a long-format DataFrame.

    Returns one row per (sequence, allele) pair carrying the rank metrics,
    a boolean 'binder' call and a 'predictor' label.
    """
    # Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically,
    # so self.alleles is assumed to arrive in the same order -- TODO confirm against caller
    alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
    # Read the file into a DataFrame with no headers initially
    # (first line is an extra header row, hence skiprows=1)
    df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
    # Keep only the peptide column and the per-allele rank columns
    df = df[df.columns[df.columns.str.contains('Peptide|Rank|Rank_BA')]]
    # TODO: Naming needs to be harmonized down the line once all predictors are implemented
    # Give the first allele's columns an explicit '.0' suffix so melting is uniform
    df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Rank_BA':'Rank_BA.0'})
    # to longformat based on .0|1|2..
    df_long = pd.melt(
        df,
        id_vars=["sequence"],
        value_vars=[col for col in df.columns if col != "sequence"],
        var_name="metric",
        value_name="value",
    )
    # Extract the allele information (e.g., .0, .1, etc.)
    df_long["allele"] = df_long["metric"].str.split('.').str[1]
    df_long["metric"] = df_long["metric"].str.split('.').str[0]

    # Pivot table to organize columns properly
    df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
    # Translate the numeric allele index back to the real allele name
    df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
    # Binder call: %Rank at or below the class II threshold (5)
    df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
    df_pivot['predictor'] = 'netmhciipan'
    df_pivot.index.name = ''

    return df_pivot

def main():
args = Arguments()

for file in args.input:
result = PredictionResult(file, args.alleles)

logging.info(f"Writing {len(result.prediction_df)} {result.predictor} predictions to file..")
result.prediction_df.to_csv(f"{args.prefix}_{result.predictor}.tsv", sep="\t", index=False)

# Parse versions
Expand Down
26 changes: 22 additions & 4 deletions modules/local/netmhciipan.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,46 @@ process NETMHCIIPAN {
tuple val(meta), path(peptide_file), path(software)

output:
tuple val(meta), path("*.tsv"), emit: predicted
tuple val(meta), path("*.xls"), emit: predicted
path "versions.yml", emit: versions

script:
if (meta.mhc_class != "II") {
error "NETMHCIIPAN only supports MHC class II. Use NETMHCPAN for MHC class I."
error "NETMHCIIPAN only supports MHC class II. Use NETMHCPAN for MHC class I."
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
// Adjust for netMHCIIpan allele format
def alleles = meta.alleles.tokenize(';').collect {
it.contains('DRB') ?
it.replace('*', '_').replace(':', '') :
('HLA-' + it.replace('*', '').replace(':', ''))
}.join(',')

"""
netmhciipan/netMHCIIpan \
-f $peptide_file \
-inptype 1 \
-a $alleles \
-xls \
-xlsfile ${prefix}_predicted_netmhciipan.xls \
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
"""
touch ${prefix}_predicted_netmhciipan.tsv
touch ${prefix}_predicted_netmhciipan.xls
cat <<-END_VERSIONS > versions.yml
"${task.process}":
netmhciipan \$(cat data/version | sed -s 's/ version/:/g')
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,27 @@
import argparse
import shlex
from enum import Enum
import logging

import pandas as pd


logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
class MinLength(Enum):
    """Minimum peptide length (in residues) each prediction tool accepts."""
    NETMHCPAN = 8
    NETMHCIIPAN = 8

class MaxLength(Enum):
    """Maximum peptide length (in residues) each prediction tool accepts."""
    NETMHCPAN = 14
    NETMHCIIPAN = 25

# TODO: Implement
class MaxNumberOfAlleles(Enum):
NETMHCPAN = 50
NETMHCPAN = 50
NETMHCIIPAN = 50

class Arguments:
"""
Expand Down Expand Up @@ -90,16 +98,31 @@ def main():
args = Arguments()

df = pd.read_csv(args.input, sep="\t")
len_df = len(df)
logging.info(f"Reading in file with {len(df)} peptides..")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

# Filter peptides based on tool length boundaries and adjust input format
if "netmhcpan" in args.tools and args.mhc_class == "I":
logging.info("Input for NetMHCpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
# We only need a file containing the sequences underneath each other
df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

if "netmhciipan" in args.tools and args.mhc_class == "II":
logging.info("Input for NetMHCIIpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")
else:
logging.info(f"{len(df)} peptides post-filtering will be predicted..")

# Parse versions
versions_this_module = {}
versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
Expand Down
6 changes: 3 additions & 3 deletions subworkflows/local/mhc_binding_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
return [meta, file]
}
.set{ ch_prediction_input }
ch_prediction_input.netmhcpan.view()
ch_prediction_input.netmhciipan.view()
SYFPEITHI ( ch_prediction_input.syfpeithi )
ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
Expand All @@ -67,12 +67,12 @@ workflow MHC_BINDING_PREDICTION {
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCPAN.out.predicted)
}

if ( "netmhciipan" in params.tools.tokenize(",") )
if ( "netmhciipan-4.3" in params.tools.tokenize(",") )
{
// TODO: External tools import for netmhciipan
NETMHCIIPAN_IMPORT( parse_netmhc_params("netmhciipan", "4.3") )
//TODO: Update netmhc container
NETMHCIIPAN ( ch_prediction_input.netmhcpan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
NETMHCIIPAN ( ch_prediction_input.netmhciipan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
ch_versions = ch_versions.mix(NETMHCIIPAN.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCIIPAN.out.predicted)
}
Expand Down
Loading

0 comments on commit bf6dc84

Please sign in to comment.