diff --git a/CHANGELOG.md b/CHANGELOG.md
index bbc9284e..a9f5f939 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [4.1.0] - 2024-02-16
+
+### Changed
+
+- Instead of having to specify `train_from_scratch` in the config file, training will proceed from an existing model weights file if this is given as an argument to `casanovo train`.
+
+### Fixed
+
+- Fixed beam search decoding error due to non-deterministic selection of beams with equal scores.
+
+## [4.0.1] - 2023-12-25
+
+### Fixed
+
+- Fix automatic PyPI upload.
+
 ## [4.0.0] - 2023-12-22
 
 ### Added
@@ -217,7 +233,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - Initial Casanovo version.
 
-[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...HEAD
+[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.1.0...HEAD
+[4.1.0]: https://github.com/Noble-Lab/casanovo/compare/v4.0.1...v4.1.0
+[4.0.1]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...v4.0.1
 [4.0.0]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...v4.0.0
 [3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
 [3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0
diff --git a/casanovo/config.py b/casanovo/config.py
index 2a420de9..817766ac 100644
--- a/casanovo/config.py
+++ b/casanovo/config.py
@@ -65,7 +65,6 @@ class Config:
         top_match=int,
         max_epochs=int,
         num_sanity_val_steps=int,
-        train_from_scratch=bool,
         save_top_k=int,
         model_save_folder_path=str,
         val_check_interval=int,
diff --git a/casanovo/config.yaml b/casanovo/config.yaml
index 896f67bc..24bf4623 100644
--- a/casanovo/config.yaml
+++ b/casanovo/config.yaml
@@ -99,8 +99,6 @@ train_batch_size: 32
 max_epochs: 30
 # Number of validation steps to run before training begins
 num_sanity_val_steps: 0
-# Set to "False" to further train a pre-trained Casanovo model
-train_from_scratch: True
 # Calculate peptide and amino acid precision during training. this
 # is expensive, so we recommend against it.
 calculate_precision: False
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index b1d51e9c..9ea9cb23 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -607,21 +607,17 @@ def _get_topk_beams(
             scores[:, step, :, :], "B V S -> B (V S)"
         )
 
-        # Mask out terminated beams. Include precursor m/z tolerance induced
-        # termination.
-        # TODO: `clone()` is necessary to get the correct output with n_beams=1.
-        #   An alternative implementation using base PyTorch instead of einops
-        #   might be more efficient.
-        finished_mask = einops.repeat(
-            finished_beams, "(B S) -> B (V S)", S=beam, V=vocab
-        ).clone()
+        # Find all still active beams by masking out terminated beams.
+        active_mask = (
+            ~finished_beams.reshape(batch, beam).repeat(1, vocab)
+        ).float()
         # Mask out the index '0', i.e. padding token, by default.
-        finished_mask[:, :beam] = True
+        # FIXME: Set this to a very small, yet non-zero value, to only
+        #   get padding after stop token.
+        active_mask[:, :beam] = 1e-8
 
         # Figure out the top K decodings.
-        _, top_idx = torch.topk(
-            step_scores.nanmean(dim=1) * (~finished_mask).float(), beam
-        )
+        _, top_idx = torch.topk(step_scores.nanmean(dim=1) * active_mask, beam)
         v_idx, s_idx = np.unravel_index(top_idx.cpu(), (vocab, beam))
         s_idx = einops.rearrange(s_idx, "B S -> (B S)")
         b_idx = einops.repeat(torch.arange(batch), "B -> (B S)", S=beam)
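Note on the `_get_topk_beams` change above: the new `active_mask` replaces the boolean `finished_mask`, and the tiny `1e-8` weight on the padding column keeps finished beams pinned to padding instead of leaving an all-zero row for `topk` to break ties on arbitrarily. A standalone sketch of the idea, with made-up sizes and scores (not taken from the diff):

```python
import torch

# Made-up sizes: 2 spectra (batch), 1 beam each, a 5-token toy vocabulary.
batch, beam, vocab = 2, 1, 5

# Non-negative per-candidate scores, standing in for the averaged amino acid scores.
step_scores = torch.rand(batch, vocab * beam)

# The first spectrum's beam has already terminated; the second is still active.
finished_beams = torch.tensor([True, False])

# Same masking as in the diff: zero out finished beams, but leave a tiny
# non-zero weight on the padding column (index 0) so a finished beam is
# always extended with padding rather than an arbitrary token.
active_mask = (~finished_beams.reshape(batch, beam).repeat(1, vocab)).float()
active_mask[:, :beam] = 1e-8

_, top_idx = torch.topk(step_scores * active_mask, beam)
print(top_idx[0])  # tensor([0]): the finished beam keeps emitting padding
print(top_idx[1])  # the highest-scoring vocabulary entry for the active beam
```

Presumably this deterministic routing of finished beams to padding is the fix for the "non-deterministic selection of beams with equal scores" mentioned in the changelog entry.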
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index 85446118..3253419a 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -252,16 +252,16 @@ def initialize_model(self, train: bool) -> None:
             calculate_precision=self.config.calculate_precision,
         )
 
-        from_scratch = (
-            self.config.train_from_scratch,
-            self.model_filename is None,
-        )
-        if train and any(from_scratch):
-            self.model = Spec2Pep(**model_params)
-            return
-        elif self.model_filename is None:
-            logger.error("A model file must be provided")
-            raise ValueError("A model file must be provided")
+        if self.model_filename is None:
+            # Train a model from scratch if no model file is provided.
+            if train:
+                self.model = Spec2Pep(**model_params)
+                return
+            # Else we're not training, so a model file must be provided.
+            else:
+                logger.error("A model file must be provided")
+                raise ValueError("A model file must be provided")
+        # Else a model file is provided (to continue training or for inference).
 
         if not Path(self.model_filename).exists():
             logger.error(
diff --git a/casanovo/utils.py b/casanovo/utils.py
index 4125cd54..71e5962b 100644
--- a/casanovo/utils.py
+++ b/casanovo/utils.py
@@ -1,4 +1,4 @@
-"""Small utility functions."""
+"""Small utility functions"""
 
 import logging
 import os
diff --git a/docs/images/help.svg b/docs/images/help.svg
index 42180a3f..baf2e237 100644
--- a/docs/images/help.svg
+++ b/docs/images/help.svg
[Regenerated SVG terminal screenshot of the `casanovo --help` output (usage, description, documentation and repository links, citation, the -h/--help option, and the configure, evaluate, sequence, train, and version commands); raw SVG markup omitted.]
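The rewritten `initialize_model` reduces to a single decision on `model_filename`, which is what lets `casanovo train` resume from a supplied weights file instead of relying on the removed `train_from_scratch` option. A hedged usage sketch; the checkpoint path is hypothetical and the last two calls only succeed if it points to an existing weights file:

```python
from casanovo.config import Config
from casanovo.denovo.model_runner import ModelRunner

config = Config()

# No weights file: training starts from scratch ...
ModelRunner(config=config).initialize_model(train=True)

# ... but inference without a weights file raises ValueError:
# ModelRunner(config=config).initialize_model(train=False)

# With a weights file (hypothetical path), training continues from the
# checkpoint and inference loads it; a missing file raises FileNotFoundError.
runner = ModelRunner(config=config, model_filename="path/to/casanovo.ckpt")
runner.initialize_model(train=True)   # resume training from the checkpoint
runner.initialize_model(train=False)  # use the checkpoint for inference
```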
diff --git a/tests/conftest.py b/tests/conftest.py
index d4e81e36..3345824e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -223,7 +223,6 @@ def tiny_config(tmp_path):
         "weight_decay": 1e-5,
         "train_batch_size": 32,
         "num_sanity_val_steps": 0,
-        "train_from_scratch": True,
         "calculate_precision": False,
         "residues": {
             "G": 57.021464,
diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py
index 89d32569..1924d122 100644
--- a/tests/unit_tests/test_config.py
+++ b/tests/unit_tests/test_config.py
@@ -1,4 +1,4 @@
-"""Test configuration loading."""
+"""Test configuration loading"""
 
 import pytest
 import yaml
diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py
index d1e88e49..efaceb6b 100644
--- a/tests/unit_tests/test_runner.py
+++ b/tests/unit_tests/test_runner.py
@@ -7,35 +7,39 @@
 from casanovo.denovo.model_runner import ModelRunner
 
 
-def test_initialize_model(tmp_path):
-    """Test that"""
+def test_initialize_model(tmp_path, mgf_small):
+    """Test initializing a new or existing model."""
     config = Config()
-    config.train_from_scratch = False
+    # No model filename given, so train from scratch.
     ModelRunner(config=config).initialize_model(train=True)
 
+    # No model filename given during inference = error.
     with pytest.raises(ValueError):
         ModelRunner(config=config).initialize_model(train=False)
 
-    with pytest.raises(FileNotFoundError):
-        runner = ModelRunner(config=config, model_filename="blah")
-        runner.initialize_model(train=True)
-
+    # Non-existing model filename given during inference = error.
     with pytest.raises(FileNotFoundError):
         runner = ModelRunner(config=config, model_filename="blah")
         runner.initialize_model(train=False)
 
-    # This should work now:
-    config.train_from_scratch = True
-    runner = ModelRunner(config=config, model_filename="blah")
+    # Train a quick model.
+    config.max_epochs = 1
+    config.n_layers = 1
+    ckpt = tmp_path / "existing.ckpt"
+    with ModelRunner(config=config) as runner:
+        runner.train([mgf_small], [mgf_small])
+        runner.trainer.save_checkpoint(ckpt)
+
+    # Resume training from previous model.
+    runner = ModelRunner(config=config, model_filename=str(ckpt))
     runner.initialize_model(train=True)
 
-    # But this should still fail:
-    with pytest.raises(FileNotFoundError):
-        runner = ModelRunner(config=config, model_filename="blah")
-        runner.initialize_model(train=False)
+    # Inference with previous model.
+    runner = ModelRunner(config=config, model_filename=str(ckpt))
+    runner.initialize_model(train=False)
 
     # If the model initialization throws and EOFError, then the Spec2Pep model
-    # has tried to load the weights:
+    # has tried to load the weights.
     weights = tmp_path / "blah"
     weights.touch()
     with pytest.raises(EOFError):
@@ -44,7 +48,7 @@
 
 
 def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
-    """Test saving aloading weights"""
+    """Test saving and loading weights"""
     config = Config(tiny_config)
     config.max_epochs = 1
     config.n_layers = 1
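The rewritten `test_initialize_model` doubles as a recipe for resuming training: train briefly, save a checkpoint through the underlying Lightning trainer, then hand that file to a new `ModelRunner`. Roughly the same flow outside the test suite might look as follows; the MGF file names and checkpoint path are placeholders for your own data:

```python
from casanovo.config import Config
from casanovo.denovo.model_runner import ModelRunner

config = Config()
config.max_epochs = 1  # keep the demo run short

# Train briefly and save a checkpoint, mirroring the test above.
# "train.mgf" / "validation.mgf" are placeholder annotated MGF files.
with ModelRunner(config=config) as runner:
    runner.train(["train.mgf"], ["validation.mgf"])
    runner.trainer.save_checkpoint("existing.ckpt")

# Resume training (or run inference) from the saved checkpoint.
resumed = ModelRunner(config=config, model_filename="existing.ckpt")
resumed.initialize_model(train=True)
```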
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index 6b840d20..61d61efa 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -424,6 +424,49 @@ def test_beam_search_decode():
     )
     assert torch.equal(discarded_beams, torch.tensor([False, True, True]))
 
+    # Test _get_topk_beams() with finished beams in the batch.
+    model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=3)
+
+    # Sizes and other variables.
+    batch = 2  # B
+    beam = model.n_beams  # S
+    model.decoder.reverse = True
+    length = model.max_length + 1  # L
+    vocab = model.decoder.vocab_size + 1  # V
+    step = 4
+
+    # Initialize dummy scores and tokens.
+    scores = torch.full(
+        size=(batch, length, vocab, beam), fill_value=torch.nan
+    )
+    scores = einops.rearrange(scores, "B L V S -> (B S) L V")
+    tokens = torch.zeros(batch * beam, length, dtype=torch.int64)
+
+    # Simulate non-zero amino acid-level probability scores.
+    scores[:, : step + 1, :] = torch.rand(batch, step + 1, vocab)
+    scores[:, step, range(1, 4)] = torch.tensor([1.0, 2.0, 3.0])
+
+    # Simulate one finished and one unfinished beam in the same batch.
+    tokens[0, :step] = torch.tensor([4, 14, 4, 28])
+    tokens[1, :step] = torch.tensor([4, 14, 4, 1])
+
+    # Set finished beams array to allow decoding from only one beam.
+    test_finished_beams = torch.tensor([True, False])
+
+    new_tokens, new_scores = model._get_topk_beams(
+        tokens, scores, test_finished_beams, batch, step
+    )
+
+    # Only the second peptide should have a new token predicted.
+    expected_tokens = torch.tensor(
+        [
+            [4, 14, 4, 28, 0],
+            [4, 14, 4, 1, 3],
+        ]
+    )
+
+    assert torch.equal(new_tokens[:, : step + 1], expected_tokens)
+
 
 def test_eval_metrics():
     """
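On the expected tokens in the new test: at the decoding step only vocabulary indices 1-3 carry scores above the random baseline (1.0, 2.0, 3.0), so the still-active beam is extended with index 3, while the finished beam is forced to the padding token 0 by the active mask. A quick standalone check of that argmax reasoning, using a toy vocabulary size (not part of the diff):

```python
import torch

# All vocabulary entries get random scores below 1, except indices 1-3.
vocab = 5  # toy vocabulary size; the real test uses the full residue vocabulary
step_scores = torch.rand(vocab)
step_scores[range(1, 4)] = torch.tensor([1.0, 2.0, 3.0])

# Index 3 wins, matching the trailing 3 in the second expected peptide; the
# finished beam's trailing 0 comes from the padding-only masking instead.
print(int(step_scores.argmax()))  # 3
```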