Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial mhcflurry module #253

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,29 @@ process {
]
}

withName: NETMHCPAN{
withName: NETMHCPAN {
ext.args = "-BA"
publishDir = [
path: { "${params.outdir}/netmhcpan" },
mode: params.publish_dir_mode
]
}

withName: NETMHCIIPAN{
withName: NETMHCIIPAN {
ext.args = "-BA"
publishDir = [
path: { "${params.outdir}/netmhciipan" },
mode: params.publish_dir_mode
]
}

withName: MERGE_PREDICTIONS {
publishDir = [
path: { "${params.outdir}/merged_predictions" },
mode: params.publish_dir_mode
]
}

withName: EPYTOPE_GENERATE_PEPTIDES {
publishDir = [
path: { "${params.outdir}/generated_peptides/${meta.sample}" },
Expand Down
35 changes: 32 additions & 3 deletions modules/local/merge_predictions/templates/merge_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)

class PredictorBindingThreshold(Enum):
    """Per-predictor rank thresholds used to call a peptide a binder.

    A peptide is classified as a binder when its predicted percentile rank
    is less than or equal to the predictor's threshold value
    (see the `binder` column assignments in `PredictionResult`).
    """
    MHCFLURRY = 2
    NETMHCPAN = 2
    NETMHCIIPAN = 5

Expand Down Expand Up @@ -86,6 +87,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
return yaml_str

class PredictionResult:
""" Class to handle prediction results from different predictors. """
def __init__(self, file_path, alleles):
self.file_path = file_path
self.alleles = alleles
Expand Down Expand Up @@ -115,13 +117,33 @@ def _format_prediction_result(self):
def _format_syfpeithi_prediction(self):
    """Format SYFPEITHI prediction output. Not implemented yet."""
    pass

def _format_mhcflurry_prediction(self):
pass
def _format_mhcflurry_prediction(self) -> pd.DataFrame:
    """
    Read in MHCflurry prediction output comprising the columns
    `peptide,allele,mhcflurry_affinity,mhcflurry_affinity_percentile,
    mhcflurry_processing_score,mhcflurry_presentation_score,
    mhcflurry_presentation_percentile`.

    Returns:
        pd.DataFrame with columns `sequence,allele,PS_Rank,BA_Rank,binder,predictor`.
    """
    df = pd.read_csv(self.file_path)
    df = df.rename(
        columns={
            "peptide": "sequence",
            "mhcflurry_presentation_percentile": "PS_Rank",
            "mhcflurry_affinity_percentile": "BA_Rank",
        }
    )
    # Select the output columns and copy explicitly so the assignments below
    # never operate on a view of the parsed frame (avoids SettingWithCopyWarning).
    df = df[["sequence", "allele", "PS_Rank", "BA_Rank"]].copy()
    # Binder call: presentation percentile rank at or below the
    # predictor-specific threshold.
    df["binder"] = df["PS_Rank"] <= PredictorBindingThreshold.MHCFLURRY.value
    df["predictor"] = "mhcflurry"

    return df


def _format_mhcnuggets_prediction(self):
    """Format MHCnuggets prediction output. Not implemented yet."""
    pass

def _format_netmhcpan_prediction(self) -> pd.DataFrame:
"""
Read in netmhcpan prediction output comprising the columns
`Peptide,Rank,EL_Rank,BA_Rank` for multiple alleles.

Returns: pd.DataFrame with columns `sequence,allele,EL_Rank,BA_Rank,binder,predictor`
"""
# Map with allele index to allele name.NetMHCpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
Expand Down Expand Up @@ -151,6 +173,12 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
return df_pivot

def _format_netmhciipan_prediction(self) -> pd.DataFrame:
"""
Read in netmhciipan prediction output comprising the columns
`Peptide,Rank,Rank_BA` for multiple alleles.

Returns: pd.DataFrame with columns `sequence,allele,Rank,Rank_BA,binder,predictor`
"""
# Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
Expand All @@ -173,7 +201,8 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
df_pivot.rename(columns={"Rank": "EL_Rank", "Rank_BA": "BA_Rank"}, inplace=True)
df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
df_pivot['predictor'] = 'netmhciipan'
df_pivot.index.name = ''

Expand Down
31 changes: 22 additions & 9 deletions modules/local/mhcflurry.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,46 @@ process MHCFLURRY {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcflurry=2.0.6"
conda "bioconda::mhcflurry=2.1.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_0' :
'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_0' }"
'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_1' :
'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_1' }"

// MHCflurry downloads models always to ~/.local/share/mhcflurry
containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : ''

input:
tuple val(meta), path(peptide_file)

output:
tuple val(meta), path("*.tsv"), emit: predicted
tuple val(meta), path("*.csv"), emit: predicted
path "versions.yml", emit: versions

script:
if (meta.mhc_class == "II") {
error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!")
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample

"""
mhcflurry-downloads fetch models_class1_presentation
mhcflurry-predict \\
$peptide_file \\
--out ${prefix}_predicted_mhcflurry.csv \\
$args

cat <<-END_VERSIONS > versions.yml
"${task.process}":
\$(mhcflurry-predict --version)
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
"""
touch ${prefix}_predicted_mhcflurry.tsv
touch ${prefix}_predicted_mhcflurry.csv

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
datefmt="%Y-%m-%d %H:%M:%S",
)
class MinLength(Enum):
    """Minimum peptide length (in amino acids) supported by each predictor.

    Peptides shorter than a tool's value are filtered out before its
    input file is written.
    """
    MHCFLURRY = 5
    NETMHCPAN = 8
    NETMHCIIPAN = 8

class MaxLength(Enum):
    """Maximum peptide length (in amino acids) supported by each predictor.

    Peptides longer than a tool's value are filtered out before its
    input file is written.
    """
    MHCFLURRY = 15
    NETMHCPAN = 14
    NETMHCIIPAN = 25

Expand All @@ -27,13 +29,14 @@ class MaxNumberOfAlleles(Enum):

class Arguments:
"""
Parses the argments, including the ones coming from $task.ext.args.
Parses the arguments, including the ones coming from $task.ext.args.
"""

def __init__(self) -> None:
self.input = "$tsv"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
self.alleles = "$meta.alleles".split(";")
self.tools = "$params.tools"
self.min_peptide_length_classI = int("$params.min_peptide_length_classI")
self.max_peptide_length_classI = int("$params.max_peptide_length_classI")
Expand Down Expand Up @@ -97,31 +100,37 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
def main():
args = Arguments()

df = pd.read_csv(args.input, sep="\t")
len_df = len(df)
logging.info(f"Reading in file with {len(df)} peptides..")
df_input = pd.read_csv(args.input, sep="\t")
logging.info(f"Reading in file with {len(df_input)} peptides..")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

# Filter peptides based on tool length boundaries and adjust input format
if "netmhcpan" in args.tools and args.mhc_class == "I":
logging.info("Input for NetMHCpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

if "netmhciipan" in args.tools and args.mhc_class == "II":
logging.info("Input for NetMHCIIpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)
df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")
else:
logging.info(f"{len(df)} peptides post-filtering will be predicted..")

# Filter peptides based on tool length boundaries and adjust input format
if "mhcflurry" in args.tools and args.mhc_class == "I":
df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
# Get every combination of sequence and allele and write them to csv with columns sequence and allele
df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry)
df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True)
df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True)
df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False)

if "netmhcpan" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..")
df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

elif "netmhciipan" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

# Parse versions
versions_this_module = {}
Expand Down
6 changes: 3 additions & 3 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,18 @@
},
"max_peptide_length_classI": {
"type": "integer",
"default": 11,
"default": 14,
"description": "Specifies the maximum peptide length.",
"help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa"
},
"min_peptide_length_classII": {
"type": "integer",
"default": 15,
"default": 8,
"description": "Specifies the minimum peptide length for MHC class II peptides."
},
"max_peptide_length_classII": {
"type": "integer",
"default": 16,
"default": 25,
"description": "Specifies the maximum peptide length for MHC class II peptides."
},
"tools": {
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/mhc_binding_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
return [meta, file]
}
.set{ ch_prediction_input }
ch_prediction_input.netmhciipan.view()
ch_prediction_input.mhcflurry.view()
SYFPEITHI ( ch_prediction_input.syfpeithi )
ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
Expand Down
Loading