diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.md b/.github/ISSUE_TEMPLATE/bug_report_template.md new file mode 100644 index 00000000..cf4c487c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report_template.md @@ -0,0 +1,55 @@ +--- +name: Bug Report +about: Submit a Casanovo Bug Report +labels: bug +--- + +## Describe the Issue +A clear and concise description of what the issue/bug is. + +## Steps To Reproduce +Steps to reproduce the incorrect behavior. + +## Expected Behavior +A clear and concise description of what you expected to happen. + +## Terminal Output (If Applicable) +Provide any applicable console output in between the tick marks below. + +``` + +``` + +## Environment: +- OS: [e.g. Windows 11, Windows 10, macOS 14, Ubuntu 24.04] +- Casanovo Version: [e.g. 4.2.1] +- Hardware Used (CPU or GPU, if GPU also GPU model and CUDA version): [e.g. GPU: NVIDIA GeForce RTX 2070, CUDA Version: 12.5] + +### Checking GPU Version + +The GPU model can be checked by typing `nvidia-smi` into a terminal/console window. +An example of how to use this command is shown below. +In this case, the CUDA version is 12.5 and the GPU model is GeForce RTX 2070. + + +``` +(casanovo_env) C:\Users\\OneDrive\Documents\casanovo>nvidia-smi +Fri Aug 2 12:34:57 2024 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 555.99 Driver Version: 555.99 CUDA Version: 12.5 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA GeForce RTX 2070 ... 
WDDM | 00000000:01:00.0 On | N/A | +| N/A 60C P8 16W / 90W | 1059MiB / 8192MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ +``` + +## Additional Context +Add any other context about the problem here. + +## Attach Files +Please attach all input files used and the full Casanovo log file. diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bd936a4..12a92e38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,10 +11,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run. - Besides as a local file, model weights can be specified from a URL. Upon initial download, the weights file is cached for future re-use. +### Changed + +- Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag. + ### Fixed - Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification. +### Removed + +- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `.best.ckpt`. + ## [4.2.1] - 2024-06-25 ### Fixed diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index cd2274a0..0c220ddb 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -128,64 +128,50 @@ def main() -> None: nargs=-1, type=click.Path(exists=True, dir_okay=False), ) +@click.option( + "--evaluate", + "-e", + is_flag=True, + default=False, + help=""" + Run in evaluation mode. When this flag is set the peptide and amino + acid precision will be calculated and logged at the end of the sequencing + run. All input files must be annotated MGF files if running in evaluation + mode. 
+ """, +) def sequence( peak_path: Tuple[str], model: Optional[str], config: Optional[str], output: Optional[str], verbosity: str, + evaluate: bool, ) -> None: """De novo sequence peptides from tandem mass spectra. - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which - to sequence peptides. + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which + to sequence peptides. If evaluate is set to True PEAK_PATH must be + one or more annotated MGF file. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) start_time = time.time() with ModelRunner(config, model) as runner: - logger.info("Sequencing peptides from:") + logger.info( + "Sequencing %speptides from:", + "and evaluating " if evaluate else "", + ) for peak_file in peak_path: logger.info(" %s", peak_file) - runner.predict(peak_path, output) + runner.predict(peak_path, output, evaluate=evaluate) psms = runner.writer.psms utils.log_sequencing_report( psms, start_time=start_time, end_time=time.time() ) -@main.command(cls=_SharedParams) -@click.argument( - "annotated_peak_path", - required=True, - nargs=-1, - type=click.Path(exists=True, dir_okay=False), -) -def evaluate( - annotated_peak_path: Tuple[str], - model: Optional[str], - config: Optional[str], - output: Optional[str], - verbosity: str, -) -> None: - """Evaluate de novo peptide sequencing performance. - - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, - such as those provided by MassIVE-KB. 
- """ - output = setup_logging(output, verbosity) - config, model = setup_model(model, config, output, False) - start_time = time.time() - with ModelRunner(config, model) as runner: - logger.info("Sequencing and evaluating peptides from:") - for peak_file in annotated_peak_path: - logger.info(" %s", peak_file) - - runner.evaluate(annotated_peak_path) - utils.log_run_report(start_time=start_time, end_time=time.time()) - - @main.command(cls=_SharedParams) @click.argument( "train_peak_path", diff --git a/casanovo/config.py b/casanovo/config.py index 792da35a..453f7b15 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -18,6 +18,7 @@ _config_deprecated = dict( every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", + save_top_k=None, ) @@ -74,7 +75,6 @@ class Config: top_match=int, max_epochs=int, num_sanity_val_steps=int, - save_top_k=int, model_save_folder_path=str, val_check_interval=int, calculate_precision=bool, @@ -96,12 +96,20 @@ def __init__(self, config_file: Optional[str] = None): # Remap deprecated config entries. for old, new in _config_deprecated.items(): if old in self._user_config: - self._user_config[new] = self._user_config.pop(old) - warnings.warn( - f"Deprecated config option '{old}' remapped to " - f"'{new}'", - DeprecationWarning, - ) + if new is not None: + self._user_config[new] = self._user_config.pop(old) + warning_msg = ( + f"Deprecated config option '{old}' " + f"remapped to '{new}'" + ) + else: + del self._user_config[old] + warning_msg = ( + f"Deprecated config option '{old}' " + "is no longer in use" + ) + + warnings.warn(warning_msg, DeprecationWarning) # Check for missing entries in config file. 
config_missing = self._params.keys() - self._user_config.keys() if len(config_missing) > 0: diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c7186ff7..3beb5f30 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -42,9 +42,6 @@ random_seed: 454 n_log: 1 # Tensorboard directory to use for keeping track of training metrics. tb_summarywriter: -# Save the top k model checkpoints during training. -1 saves all, and leaving -# this field empty saves none. -save_top_k: 5 # Path to saved checkpoints. model_save_folder_path: "" # Model validation and checkpointing frequency in training steps. diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 6244e88f..3917a2c8 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -83,7 +83,9 @@ def __getitem__( The unique spectrum identifier, formed by its original peak file and identifier (index or scan number) therein. """ - mz_array, int_array, precursor_mz, precursor_charge = self.index[idx] + mz_array, int_array, precursor_mz, precursor_charge = self.index[idx][ + :4 + ] spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index d5acacb3..b88c5542 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -10,6 +10,7 @@ from pathlib import Path from typing import Iterable, List, Optional, Union +import depthcharge.masses import lightning.pytorch as pl import numpy as np import torch @@ -20,6 +21,7 @@ from ..config import Config from ..data import ms_io from ..denovo.dataloaders import DeNovoDataModule +from ..denovo.evaluate import aa_match_batch, aa_match_metrics from ..denovo.model import Spec2Pep @@ -36,12 +38,15 @@ class ModelRunner: model_filename : str, optional The model filename is required for eval and de novo modes, but not for training a model from scratch. 
+ output_rootname : str, optional + The rootname for all output files (e.g. checkpoints or results) """ def __init__( self, config: Config, model_filename: Optional[str] = None, + output_rootname: Optional[str] = None, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -54,24 +59,23 @@ def __init__( self.loaders = None self.writer = None + best_filename = "best" + if output_rootname is not None: + best_filename = f"{output_rootname}.{best_filename}" + # Configure checkpoints. self.callbacks = [ ModelCheckpoint( dirpath=config.model_save_folder_path, save_on_train_epoch_end=True, - ) + ), + ModelCheckpoint( + dirpath=config.model_save_folder_path, + monitor="valid_CELoss", + filename=best_filename, + ), ] - if config.save_top_k is not None: - self.callbacks.append( - ModelCheckpoint( - dirpath=config.model_save_folder_path, - monitor="valid_CELoss", - mode="min", - save_top_k=config.save_top_k, - ) - ) - def __enter__(self): """Enter the context manager""" self.tmp_dir = tempfile.TemporaryDirectory() @@ -116,36 +120,52 @@ def train( self.loaders.val_dataloader(), ) - def evaluate(self, peak_path: Iterable[str]) -> None: - """Evaluate peptide sequence preditions from a trained Casanovo model. + def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: + """Log peptide precision and amino acid precision + + Calculate and log peptide precision and amino acid precision + based off of model predictions and spectrum annotations Parameters ---------- - peak_path : iterable of str - The path with MS data files for predicting peptide sequences. 
- - Returns - ------- - self + test_index : AnnotatedSpectrumIndex + Index containing the annotated spectra used to generate model + predictions """ - self.initialize_trainer(train=False) - self.initialize_model(train=False) - - test_index = self._get_index(peak_path, True, "evaluation") - self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=True) + model_output = [psm[0] for psm in self.writer.psms] + spectrum_annotations = [ + test_index[i][4] for i in range(test_index.n_spectra) + ] + aa_precision, _, pep_precision = aa_match_metrics( + *aa_match_batch( + spectrum_annotations, + model_output, + depthcharge.masses.PeptideMass().masses, + ) + ) - self.trainer.validate(self.model, self.loaders.test_dataloader()) + logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) + logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) - def predict(self, peak_path: Iterable[str], output: str) -> None: + def predict( + self, peak_path: Iterable[str], output: str, evaluate: bool = False + ) -> None: """Predict peptide sequences with a trained Casanovo model. + Can also evaluate model during prediction if provided with annotated + peak files. + Parameters ---------- peak_path : iterable of str The path with the MS data files for predicting peptide sequences. output : str Where should the output be saved? + evaluate: bool + whether to run model evaluation in addition to inference + Note: peak_path must point to annotated MS data files when + running model evaluation. Files that are not in an annotated + peak file format will be ignored if evaluate is set to true. 
Returns ------- @@ -162,12 +182,15 @@ def predict(self, peak_path: Iterable[str], output: str) -> None: self.initialize_model(train=False) self.model.out_writer = self.writer - test_index = self._get_index(peak_path, False, "") + test_index = self._get_index(peak_path, evaluate, "") self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.test_dataloader()) + if evaluate: + self.log_metrics(test_index) + def initialize_trainer(self, train: bool) -> None: """Initialize the lightning Trainer. @@ -396,7 +419,22 @@ def _get_index( Index = AnnotatedSpectrumIndex if annotated else SpectrumIndex valid_charge = np.arange(1, self.config.max_charge + 1) - return Index(index_fname, filenames, valid_charge=valid_charge) + + try: + return Index(index_fname, filenames, valid_charge=valid_charge) + except TypeError as e: + if Index == AnnotatedSpectrumIndex: + error_msg = ( + "Error creating annotated spectrum index. " + "This may be the result of having an unannotated MGF file " + "present in the validation peak file path list.\n" + f"Original error message: {e}" + ) + + logger.error(error_msg) + raise TypeError(error_msg) + + raise e def _get_strategy(self) -> Union[str, DDPStrategy]: """Get the strategy for the Trainer. 
@@ -449,5 +487,15 @@ def _get_peak_filenames( for fname in glob.glob(path, recursive=True): if Path(fname).suffix.lower() in supported_ext: found_files.add(fname) + else: + warnings.warn( + f"Ignoring unsupported peak file: {fname}", RuntimeWarning + ) + + if len(found_files) == 0: + warnings.warn( + f"No supported peak files found under path(s): {list(paths)}", + RuntimeWarning, + ) return sorted(list(found_files)) diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index bd8b258f..661f0efe 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - - $ casanovo evaluate --help - -Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       - - Evaluate de novo peptide sequencing performance.                                - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       - provided by MassIVE-KB.                                                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  ANNOTATED_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mTEXT                        Either the model weights (.ckpt  -                                              file) or a URL pointing to the   -                                              model weights file. If not       -                                              provided, Casanovo will try to   -                                              download the latest release      -                                              automatically.                   
---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo evaluate --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + +Try'casanovo -h'for help +╭─ Error ──────────────────────────────────────────────────────────────────────╮ + No such command 'evaluate'.                                                   +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/help.svg b/docs/images/help.svg index 80d63c7e..dbdc05e0 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - - + - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.             
                                               - - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -evaluate  Evaluate de novo peptide sequencing performance.                   -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. 
Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io                               + • Official code repository: https://github.com/Noble-Lab/casanovo              + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           
+version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index 5e75dfe4..70570e2a 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  - peptides.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mTEXT                        Either the model weights (.ckpt  -                                              file) or a URL pointing to the   -                                              model weights file. If not       -                                              provided, Casanovo will try to   -                                              download the latest release      -                                              automatically.                   ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  
---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  + peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    + MGF file.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--evaluate-e  Run in evaluation mode. When     +                                              this flag is set the peptide     +                                              and amino acid precision will    +                                              be calculated and logged at the  +                                              end of the sequencing run. All   +                                              input files must be annotated    +                                              MGF files if running in          +                                              evaluation mode.                 +--model-mTEXT                        Either the model weights (.ckpt  +                                              file) or a URL pointing to the   +                                              model weights file. 
If not       +                                              provided, Casanovo will try to   +                                              download the latest release      +                                              automatically.                   +--output-oFILE                        The mzTab file to which results  +                                              will be written.                 +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/tests/conftest.py b/tests/conftest.py index 02a6d0f2..bff6b011 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,15 @@ def mgf_small(tmp_path): return _create_mgf(peptides, mgf_file) -def _create_mgf(peptides, mgf_file, random_state=42): +@pytest.fixture +def mgf_small_unannotated(tmp_path): + """An MGF file with 2 unannotated spectra.""" + peptides = ["LESLIEK", "PEPTIDEK"] + mgf_file = tmp_path / "small_unannotated.mgf" + return _create_mgf(peptides, mgf_file, annotate=False) + + +def _create_mgf(peptides, mgf_file, random_state=42, annotate=True): """ Create a fake MGF file from one or more peptides. @@ -27,20 +35,25 @@ def _create_mgf(peptides, mgf_file, random_state=42): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. 
+ annotate: bool, optional + Whether to add peptide annotations to mgf file Returns ------- mgf_file : Path """ rng = np.random.default_rng(random_state) - entries = [_create_mgf_entry(p, rng.choice([2, 3])) for p in peptides] + entries = [ + _create_mgf_entry(p, rng.choice([2, 3]), annotate=annotate) + for p in peptides + ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) return mgf_file -def _create_mgf_entry(peptide, charge=2): +def _create_mgf_entry(peptide, charge=2, annotate=True): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -50,6 +63,8 @@ def _create_mgf_entry(peptide, charge=2): A peptide sequence. charge : int, optional The peptide charge state. + annotate: bool, optional + Whether to add peptide annotation to entry Returns ------- @@ -62,12 +77,15 @@ def _create_mgf_entry(peptide, charge=2): mgf = [ "BEGIN IONS", - f"SEQ={peptide}", f"PEPMASS={precursor_mz}", f"CHARGE={charge}+", f"{frags}", "END IONS", ] + + if annotate: + mgf.insert(1, f"SEQ={peptide}") + return "\n".join(mgf) @@ -208,7 +226,6 @@ def tiny_config(tmp_path): "random_seed": 454, "n_log": 1, "tb_summarywriter": None, - "save_top_k": 5, "n_peaks": 150, "min_mz": 50.0, "max_mz": 2500.0, diff --git a/tests/test_integration.py b/tests/test_integration.py index a622b188..47cee936 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -36,25 +36,12 @@ def test_train_and_run( result = run(train_args) model_file = tmp_path / "epoch=19-step=20.ckpt" + best_model = tmp_path / "best.ckpt" assert result.exit_code == 0 assert model_file.exists() + assert best_model.exists() - # Try evaluating: - eval_args = [ - "evaluate", - "--model", - str(model_file), - "--config", - str(tiny_config), - "--output", - str(tmp_path / "eval"), - str(mgf_small), - ] - - result = run(eval_args) - assert result.exit_code == 0 - - # Try predicting: + # Finally try predicting: output_filename = tmp_path / "test.mztab" predict_args = [ "sequence", @@ -90,6 +77,39 @@ 
def test_train_and_run( assert psms.loc[4, "sequence"] == "PEPTLDEK" assert psms.loc[4, "spectra_ref"] == "ms_run[2]:scan=111" + # Finally, try evaluating: + output_filename = tmp_path / "test-eval.mztab" + eval_args = [ + "sequence", + "--model", + str(model_file), + "--config", + tiny_config, + "--output", + str(output_filename), + str(mgf_small), + str(mzml_small), + "--evaluate", + ] + + result = run(eval_args) + assert result.exit_code == 0 + assert output_filename.is_file() + + mztab = pyteomics.mztab.MzTab(str(output_filename)) + filename = "small.mgf" + # Verify that the input annotated peak file is listed in the metadata. + assert f"ms_run[1]-location" in mztab.metadata + assert mztab.metadata[f"ms_run[1]-location"].endswith(filename) + + # Verify that the spectrum predictions are correct + # and indexed according to the peak input file type. + psms = mztab.spectrum_match_table + assert psms.loc[1, "sequence"] == "LESLLEK" + assert psms.loc[1, "spectra_ref"] == "ms_run[1]:index=0" + assert psms.loc[2, "sequence"] == "PEPTLDEK" + assert psms.loc[2, "spectra_ref"] == "ms_run[1]:index=1" + # Validate mztab output validate_args = [ "java", diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py index fbe10eee..14d2abe5 100644 --- a/tests/unit_tests/test_config.py +++ b/tests/unit_tests/test_config.py @@ -43,9 +43,18 @@ def test_deprecated(tmp_path, tiny_config): filename = str(tmp_path / "config_deprecated.yml") with open(tiny_config, "r") as f_in, open(filename, "w") as f_out: cfg = yaml.safe_load(f_in) - # Insert deprecated config option. + # Insert remapped deprecated config option. cfg["max_iters"] = 1 yaml.safe_dump(cfg, f_out) with pytest.warns(DeprecationWarning): Config(filename) + + with open(tiny_config, "r") as f_in, open(filename, "w") as f_out: + cfg = yaml.safe_load(f_in) + # Insert non-remapped deprecated config option. 
+ cfg["save_top_k"] = 5 + yaml.safe_dump(cfg, f_out) + + with pytest.warns(DeprecationWarning): + Config(filename) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 2d0513bd..db21725a 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -56,6 +56,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): config.max_epochs = 1 config.n_layers = 1 ckpt = tmp_path / "test.ckpt" + mztab = tmp_path / "test.mztab" with ModelRunner(config=config) as runner: runner.train([mgf_small], [mgf_small]) @@ -83,7 +84,7 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): with torch.device("meta"): with ModelRunner(other_config, model_filename=str(ckpt)) as runner: with pytest.raises(NotImplementedError) as err: - runner.evaluate([mgf_small]) + runner.predict([mgf_small], mztab) assert "meta tensor; no data!" in str(err.value) @@ -95,11 +96,11 @@ def test_save_and_load_weights(tmp_path, mgf_small, tiny_config): # Shouldn't work: with ModelRunner(other_config, model_filename=str(ckpt)) as runner: with pytest.raises(RuntimeError): - runner.evaluate([mgf_small]) + runner.predict([mgf_small], mztab) # Should work: with ModelRunner(config=config, model_filename=str(ckpt)) as runner: - runner.evaluate([mgf_small]) + runner.predict([mgf_small], mztab) def test_save_and_load_weights_deprecated(tmp_path, mgf_small, tiny_config): @@ -168,9 +169,83 @@ def test_save_final_model(tmp_path, mgf_small, tiny_config): # Test checkpoint saving when val_check_interval is not a factor of training steps config.val_check_interval = 15 - validation_file = tmp_path / "epoch=14-step=15.ckpt" - with ModelRunner(config) as runner: + validation_file = tmp_path / "foobar.best.ckpt" + with ModelRunner(config, output_rootname="foobar") as runner: runner.train([mgf_small], [mgf_small]) assert model_file.exists() assert validation_file.exists() + + +def test_evaluate( + tmp_path, mgf_small, mzml_small, 
mgf_small_unannotated, tiny_config +): + """Test model evaluation during sequencing""" + # Train tiny model + config = Config(tiny_config) + config.max_epochs = 1 + model_file = tmp_path / "epoch=0-step=1.ckpt" + with ModelRunner(config) as runner: + runner.train([mgf_small], [mgf_small]) + + assert model_file.is_file() + + # Test evaluation with annotated peak file + result_file = tmp_path / "result.mztab" + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict([mgf_small], result_file, evaluate=True) + + assert result_file.is_file() + result_file.unlink() + + exception_string = ( + "Error creating annotated spectrum index. " + "This may be the result of having an unannotated MGF file " + "present in the validation peak file path list.\n" + ) + + with pytest.raises(FileNotFoundError): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict([mzml_small], result_file, evaluate=True) + + with pytest.raises(TypeError, match=exception_string): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict([mgf_small_unannotated], result_file, evaluate=True) + + with pytest.raises(TypeError, match=exception_string): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict( + [mgf_small_unannotated, mzml_small], result_file, evaluate=True + ) + + # MzTab with just metadata is written in the case of FileNotFound + # or TypeError early exit + assert result_file.is_file() + result_file.unlink() + + # Test mix of annotated and unannotated peak files + with pytest.warns(RuntimeWarning): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict([mgf_small, mzml_small], result_file, evaluate=True) + + assert result_file.is_file() + result_file.unlink() + + with pytest.raises(TypeError, match=exception_string): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict( + [mgf_small, mgf_small_unannotated], 
result_file, evaluate=True + ) + + assert result_file.is_file() + result_file.unlink() + + with pytest.raises(TypeError, match=exception_string): + with ModelRunner(config, model_filename=str(model_file)) as runner: + runner.predict( + [mgf_small, mgf_small_unannotated, mzml_small], + result_file, + evaluate=True, + ) + + result_file.unlink()