From 71a83881716ff3d81e610dff4f6ad4422df63461 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 16 Dec 2024 21:03:05 +0000 Subject: [PATCH 1/5] parse input for mhcflurry --- conf/modules.config | 11 ++++-- .../templates/merge_predictions.py | 35 +++++++++++++++++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 30ff2d0..6056952 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -74,7 +74,7 @@ process { ] } - withName: NETMHCPAN{ + withName: NETMHCPAN { ext.args = "-BA" publishDir = [ path: { "${params.outdir}/netmhcpan" }, @@ -82,7 +82,7 @@ process { ] } - withName: NETMHCIIPAN{ + withName: NETMHCIIPAN { ext.args = "-BA" publishDir = [ path: { "${params.outdir}/netmhciipan" }, @@ -90,6 +90,13 @@ process { ] } + withName: MERGE_PREDICTIONS { + publishDir = [ + path: { "${params.outdir}/merged_predictions" }, + mode: params.publish_dir_mode + ] + } + withName: EPYTOPE_GENERATE_PEPTIDES { publishDir = [ path: { "${params.outdir}/generated_peptides/${meta.sample}" }, diff --git a/modules/local/merge_predictions/templates/merge_predictions.py b/modules/local/merge_predictions/templates/merge_predictions.py index 59fb9d4..42fcb3a 100644 --- a/modules/local/merge_predictions/templates/merge_predictions.py +++ b/modules/local/merge_predictions/templates/merge_predictions.py @@ -18,6 +18,7 @@ ) class PredictorBindingThreshold(Enum): + MHCFLURRY = 2 NETMHCPAN = 2 NETMHCIIPAN = 5 @@ -86,6 +87,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str: return yaml_str class PredictionResult: + """ Class to handle prediction results from different predictors. """ def __init__(self, file_path, alleles): self.file_path = file_path self.alleles = alleles @@ -115,13 +117,33 @@ def _format_prediction_result(self): def _format_syfpeithi_prediction(self): pass - def _format_mhcflurry_prediction(self): - pass + def _format_mhcflurry_prediction(self) -> pd.DataFrame: + """ + Read in mhcflurry prediction output comprising the columns + `peptide,allele,mhcflurry_affinity,mhcflurry_affinity_percentile,mhcflurry_processing_score, + mhcflurry_presentation_score,mhcflurry_presentation_percentile` + + Returns: pd.DataFrame with columns `sequence,allele,PS_Rank,BA_Rank,binder,predictor` + """ + df = pd.read_csv(self.file_path) + df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "PS_Rank", "mhcflurry_affinity_percentile": "BA_Rank"}, inplace=True) + df = df[['sequence', 'allele', 'PS_Rank', 'BA_Rank']] + df["binder"] = df["PS_Rank"] <= PredictorBindingThreshold.MHCFLURRY.value + df["predictor"] = "mhcflurry" + + return df + def _format_mhcnuggets_prediction(self): pass def _format_netmhcpan_prediction(self) -> pd.DataFrame: + """ + Read in netmhcpan prediction output comprising the columns + `Peptide,Rank,EL_Rank,BA_Rank` for multiple alleles. + + Returns: pd.DataFrame with columns `sequence,allele,EL_Rank,BA_Rank,binder,predictor` + """ # Map with allele index to allele name.NetMHCpan sorts alleles alphabetically alleles_dict = {i: allele for i, allele in enumerate(self.alleles)} # Read the file into a DataFrame with no headers initially @@ -151,6 +173,12 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame: return df_pivot def _format_netmhciipan_prediction(self) -> pd.DataFrame: + """ + Read in netmhciipan prediction output comprising the columns + `Peptide,Rank,Rank_BA` for multiple alleles. + + Returns: pd.DataFrame with columns `sequence,allele,Rank,Rank_BA,binder,predictor` + """ # Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically alleles_dict = {i: allele for i, allele in enumerate(self.alleles)} # Read the file into a DataFrame with no headers initially @@ -173,7 +201,8 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame: # Pivot table to organize columns properly df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index() df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']] - df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value + df_pivot.rename(columns={"Rank": "EL_Rank", "Rank_BA": "BA_Rank"}, inplace=True) + df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value df_pivot['predictor'] = 'netmhciipan' df_pivot.index.name = '' From 79f692fd7c8f8359f1185a7ef76d142ed0fcf419 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 16 Dec 2024 21:03:30 +0000 Subject: [PATCH 2/5] add local mhcflurry module --- modules/local/mhcflurry.nf | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/modules/local/mhcflurry.nf b/modules/local/mhcflurry.nf index 61ed356..db9120f 100644 --- a/modules/local/mhcflurry.nf +++ b/modules/local/mhcflurry.nf @@ -2,33 +2,47 @@ process MHCFLURRY { label 'process_single' tag "${meta.sample}" - conda "bioconda::mhcflurry=2.0.6" + conda "bioconda::mhcflurry=2.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_0' : - 'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_0' }" + 'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.1--pyh7cba7a3_0' : + 'quay.io/biocontainers/mhcflurry:2.1.1--pyh7cba7a3_0' }" + + // userEmulation settings when docker is specified + containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' input: tuple val(meta), path(peptide_file) output: - tuple val(meta), path("*.tsv"), emit: predicted + tuple val(meta), path("*.csv"), emit: predicted path "versions.yml", emit: versions script: if (meta.mhc_class == "II") { error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!") } - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: meta.sample """ + mhcflurry-downloads fetch models_class1_presentation + mhcflurry-predict \\ + $peptide_file \\ + --out ${prefix}_predicted_mhcflurry.csv \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + \$(mhcflurry-predict --version) + END_VERSIONS """ + stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: meta.sample """ - touch ${prefix}_predicted_mhcflurry.tsv + touch ${prefix}_predicted_mhcflurry.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From 1083bae3acc7b23f80812ec8e44d251c6274d12f Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Mon, 16 Dec 2024 21:04:18 +0000 Subject: [PATCH 3/5] parse in and output of mhcflurry --- .../templates/prepare_prediction_input.py | 47 +++++++++++-------- subworkflows/local/mhc_binding_prediction.nf | 2 +- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py index 894b9b1..9748e86 100644 --- a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py +++ b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py @@ -13,10 +13,12 @@ datefmt="%Y-%m-%d %H:%M:%S", ) class MinLength(Enum): + MHCFLURRY = 5 NETMHCPAN = 8 NETMHCIIPAN = 8 class MaxLength(Enum): + MHCFLURRY = 15 NETMHCPAN = 14 NETMHCIIPAN = 25 @@ -27,13 +29,14 @@ class MaxNumberOfAlleles(Enum): class Arguments: """ - Parses the argments, including the ones coming from $task.ext.args. + Parses the arguments, including the ones coming from $task.ext.args. """ def __init__(self) -> None: self.input = "$tsv" self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id" self.mhc_class = "$meta.mhc_class" + self.alleles = "$meta.alleles".split(";") self.tools = "$params.tools" self.min_peptide_length_classI = int("$params.min_peptide_length_classI") self.max_peptide_length_classI = int("$params.max_peptide_length_classI") @@ -97,31 +100,37 @@ def format_yaml_like(data: dict, indent: int = 0) -> str: def main(): args = Arguments() - df = pd.read_csv(args.input, sep="\t") - len_df = len(df) - logging.info(f"Reading in file with {len(df)} peptides..") + df_input = pd.read_csv(args.input, sep="\t") + logging.info(f"Reading in file with {len(df_input)} peptides..") # Filter peptides based on user-defined length if args.mhc_class == "I": - df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)] + df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)] else: - df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)] - - # Filter peptides based on tool length boundaries and adjust input format - if "netmhcpan" in args.tools and args.mhc_class == "I": - logging.info("Input for NetMHCpan detected. Parsing input..") - df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)] - df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False) - - if "netmhciipan" in args.tools and args.mhc_class == "II": - logging.info("Input for NetMHCIIpan detected. Parsing input..") - df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)] - df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False) + df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)] if len(df) == 0: raise ValueError("No peptides left after applying length filters! Aborting..") - else: - logging.info(f"{len(df)} peptides post-filtering will be predicted..") + + # Filter peptides based on tool length boundaries and adjust input format + if "mhcflurry" in args.tools and args.mhc_class == "I": + df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)] + logging.info(f"Input for NetMHCpan detected. Preparing {len(df_mhcflurry)} peptides for prediction..") + # Get every combination of sequence and allele and write them to csv with columns sequence and allele + df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry) + df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True) + df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True) + df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False) + + if "netmhcpan" in args.tools and args.mhc_class == "I": + df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)] + logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..") + df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False) + + elif "netmhciipan" in args.tools and args.mhc_class == "II": + df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)] + logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..") + df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False) # Parse versions versions_this_module = {} diff --git a/subworkflows/local/mhc_binding_prediction.nf b/subworkflows/local/mhc_binding_prediction.nf index 69dd51c..cdcae8f 100755 --- a/subworkflows/local/mhc_binding_prediction.nf +++ b/subworkflows/local/mhc_binding_prediction.nf @@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION { return [meta, file] } .set{ ch_prediction_input } - ch_prediction_input.netmhciipan.view() + ch_prediction_input.mhcflurry.view() SYFPEITHI ( ch_prediction_input.syfpeithi ) ch_versions = ch_versions.mix(SYFPEITHI.out.versions) ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted) From 0140bc59ac68230058ea44e42d5dc321a2159e9d Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 17 Dec 2024 06:58:51 +0000 Subject: [PATCH 4/5] bump mhcflurry version --- modules/local/mhcflurry.nf | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/local/mhcflurry.nf b/modules/local/mhcflurry.nf index db9120f..50f5db9 100644 --- a/modules/local/mhcflurry.nf +++ b/modules/local/mhcflurry.nf @@ -2,12 +2,12 @@ process MHCFLURRY { label 'process_single' tag "${meta.sample}" - conda "bioconda::mhcflurry=2.1.1" + conda "bioconda::mhcflurry=2.1.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.1--pyh7cba7a3_0' : - 'quay.io/biocontainers/mhcflurry:2.1.1--pyh7cba7a3_0' }" + 'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_1' : + 'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_1' }" - // userEmulation settings when docker is specified + // MHCflurry downloads models always to ~/.local/share/mhcflurry containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' input: @@ -37,7 +37,6 @@ process MHCFLURRY { END_VERSIONS """ - stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: meta.sample From 0610c87f27ef50b7475c4b33d85a7f968847ab44 Mon Sep 17 00:00:00 2001 From: jonasscheid Date: Tue, 17 Dec 2024 07:01:45 +0000 Subject: [PATCH 5/5] align config and schema params --- nextflow_schema.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 4001745..eb9df96 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -79,18 +79,18 @@ }, "max_peptide_length_classI": { "type": "integer", - "default": 11, + "default": 14, "description": "Specifies the maximum peptide length.", "help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa" }, "min_peptide_length_classII": { "type": "integer", - "default": 15, + "default": 8, "description": "Specifies the minimum peptide length for MHC class II peptides." }, "max_peptide_length_classII": { "type": "integer", - "default": 16, + "default": 25, "description": "Specifies the maximum peptide length for MHC class II peptides." }, "tools": {