From 0170cbf9393457894d936b88428f80ac127a888d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 21 Aug 2024 15:44:23 -0700 Subject: [PATCH 1/2] psm data class --- casanovo/data/ms_io.py | 65 ++++++++++++++++++++++++++++----- casanovo/denovo/model.py | 20 +++++----- casanovo/denovo/model_runner.py | 2 +- casanovo/utils.py | 10 ++--- 4 files changed, 72 insertions(+), 25 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index da8f603c..eba9fe63 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -2,11 +2,12 @@ import collections import csv +import dataclasses import operator import os import re from pathlib import Path -from typing import List +from typing import List, Tuple import natsort @@ -14,6 +15,43 @@ from ..config import Config +@dataclasses.dataclass +class PepSpecMatch: + """ + Peptide Spectrum Match (PSM) dataclass + + Parameters + ---------- + sequence : str + The amino acid sequence of the peptide. + spectrum_id : Tuple[str, str] + A tuple containing the spectrum identifier in the form + (spectrum file name, spectrum file idx) + peptide_score : float + Confidence score of the match between the full peptide sequence and the + spectrum. + charge : int + The charge state of the peptide ion observed in the spectrum. + expected_mz : float + The expected mass-to-charge ratio (m/z) of the peptide based on its + sequence and charge state. + actual_mz : float + The observed mass-to-charge ratio (m/z) of the peptide as detected in + the spectrum. + aa_scores : List[float] + A list of confidence scores for individual amino acids in the peptide + sequence, where len(aa_scores) == len(sequence) + """ + + sequence: str + spectrum_id: Tuple[str, str] + peptide_score: float + charge: int + expected_mz: float + actual_mz: float + aa_scores: List[float] + + class MztabWriter: """ Export spectrum identifications to an mzTab file. 
@@ -42,7 +80,7 @@ def __init__(self, filename: str): ), ] self._run_map = {} - self.psms = [] + self.psms: List[PepSpecMatch] = [] def set_metadata(self, config: Config, **kwargs) -> None: """ @@ -178,34 +216,41 @@ def save(self) -> None: ] ) for i, psm in enumerate( - natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 + natsort.natsorted( + self.psms, key=operator.attrgetter("spectrum_id") + ), + 1, ): - filename, idx = os.path.abspath(psm[1][0]), psm[1][1] + filename, idx = ( + os.path.abspath(psm.spectrum_id[0]), + psm.spectrum_id[1], + ) writer.writerow( [ "PSM", - psm[0], # sequence + psm.sequence, # sequence i, # PSM_ID "null", # accession "null", # unique "null", # database "null", # database_version f"[MS, MS:1003281, Casanovo, {__version__}]", - psm[2], # search_engine_score[1] + psm.peptide_score, # search_engine_score[1] # FIXME: Modifications should be specified as # controlled vocabulary terms. "null", # modifications # FIXME: Can we get the retention time from the data # loader? 
"null", # retention_time - int(psm[3]), # charge - psm[4], # exp_mass_to_charge - psm[5], # calc_mass_to_charge + psm.charge, # charge + psm.expected_mz, # exp_mass_to_charge + psm.actual_mz, # calc_mass_to_charge f"ms_run[{self._run_map[filename]}]:{idx}", "null", # pre "null", # post "null", # start "null", # end - psm[6], # opt_ms_run[1]_aa_scores + # opt_ms_run[1]_aa_scores + ",".join(list(map("{:.5f}".format, psm.aa_scores))), ] ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 77df6df5..da76da27 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -914,15 +914,17 @@ def on_predict_batch_end( if len(peptide) == 0: continue self.out_writer.psms.append( - ( - peptide, - tuple(spectrum_i), - peptide_score, - charge, - precursor_mz, - self.peptide_mass_calculator.mass(peptide, charge), - ",".join(list(map("{:.5f}".format, aa_scores))), - ), + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(charge), + expected_mz=precursor_mz, + actual_mz=self.peptide_mass_calculator.mass( + peptide, charge + ), + aa_scores=aa_scores, + ) ) def _log_history(self) -> None: diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b88c5542..0f39fe46 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -132,7 +132,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ - model_output = [psm[0] for psm in self.writer.psms] + model_output = [psm.sequence for psm in self.writer.psms] spectrum_annotations = [ test_index[i][4] for i in range(test_index.n_spectra) ] diff --git a/casanovo/utils.py b/casanovo/utils.py index fde6cd05..0d2698ce 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,13 +1,11 @@ """Small utility functions""" -import heapq import logging import os import platform import re import socket import sys 
-import time from datetime import datetime from typing import Tuple, Dict, List, Optional @@ -16,6 +14,8 @@ import psutil import torch +from .data.ms_io import PepSpecMatch + SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] @@ -195,7 +195,7 @@ def log_run_report( def log_sequencing_report( - predictions: Tuple[str, Tuple[str, str], float, float, float, float, str], + predictions: List[PepSpecMatch], start_time: Optional[int] = None, end_time: Optional[int] = None, score_bins: List[float] = SCORE_BINS, @@ -219,8 +219,8 @@ def log_sequencing_report( run_report = get_report_dict( pd.DataFrame( { - "sequence": [psm[0] for psm in predictions], - "score": [psm[2] for psm in predictions], + "sequence": [psm.sequence for psm in predictions], + "score": [psm.peptide_score for psm in predictions], } ), score_bins=score_bins, From 9cf9b65e519ccac2673face19b28e2c269c8c11d Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 22 Aug 2024 12:51:41 -0700 Subject: [PATCH 2/2] PepSpecMatch field naming and documentation --- casanovo/data/ms_io.py | 36 +++++++++++++++++------------------- casanovo/denovo/model.py | 6 ++---- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index eba9fe63..d1e937f9 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -7,7 +7,7 @@ import os import re from pathlib import Path -from typing import List, Tuple +from typing import List, Tuple, Iterable import natsort @@ -28,18 +28,18 @@ class PepSpecMatch: A tuple containing the spectrum identifier in the form (spectrum file name, spectrum file idx) peptide_score : float - Confidence score of the match between the full peptide sequence and the + Score of the match between the full peptide sequence and the spectrum. charge : int - The charge state of the peptide ion observed in the spectrum. 
- expected_mz : float - The expected mass-to-charge ratio (m/z) of the peptide based on its + The precursor charge state of the peptide ion observed in the spectrum. + calc_mz : float + The calculated mass-to-charge ratio (m/z) of the peptide based on its sequence and charge state. - actual_mz : float - The observed mass-to-charge ratio (m/z) of the peptide as detected in - the spectrum. - aa_scores : List[float] - A list of confidence scores for individual amino acids in the peptide + exp_mz : float + The observed (experimental) precursor mass-to-charge ratio (m/z) of the + peptide as detected in the spectrum. + aa_scores : Iterable[float] + A list of scores for individual amino acids in the peptide sequence, where len(aa_scores) == len(sequence) """ @@ -47,9 +47,9 @@ class PepSpecMatch: spectrum_id: Tuple[str, str] peptide_score: float charge: int - expected_mz: float - actual_mz: float - aa_scores: List[float] + calc_mz: float + exp_mz: float + aa_scores: Iterable[float] class MztabWriter: @@ -221,10 +221,8 @@ def save(self) -> None: ), 1, ): - filename, idx = ( - os.path.abspath(psm.spectrum_id[0]), - psm.spectrum_id[1], - ) + filename = os.path.abspath(psm.spectrum_id[0]) + idx = psm.spectrum_id[1] writer.writerow( [ "PSM", @@ -243,8 +241,8 @@ def save(self) -> None: # loader?
"null", # retention_time psm.charge, # charge - psm.expected_mz, # exp_mass_to_charge - psm.actual_mz, # calc_mass_to_charge + psm.exp_mz, # exp_mass_to_charge + psm.calc_mz, # calc_mass_to_charge f"ms_run[{self._run_map[filename]}]:{idx}", "null", # pre "null", # post diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index da76da27..6e984a1d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -919,10 +919,8 @@ def on_predict_batch_end( spectrum_id=tuple(spectrum_i), peptide_score=peptide_score, charge=int(charge), - expected_mz=precursor_mz, - actual_mz=self.peptide_mass_calculator.mass( - peptide, charge - ), + exp_mz=precursor_mz, + calc_mz=self.peptide_mass_calculator.mass(peptide, charge), aa_scores=aa_scores, ) )