Prepare for release v4.1.0 (#296)

* Remove `train_from_scratch` config option (#275) Instead of having to specify `train_from_scratch` in the config file, training will proceed from an existing model weights file if this is given as an argument to `casanovo train`. Fixes #263. * Stabilize torch.topk() behavior (#290) * Add epsilon to index zero * Fix typo * Use base PyTorch for repeating along the vocabulary size * Combine masking steps * Lint with updated black version * Lint test files * Add topk unit test * Fix lint * Add fixme comment for future * Update changelog * Generate new screengrabs with rich-codex --------- Co-authored-by: Wout Bittremieux <[email protected]> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> * Update changelog --------- Co-authored-by: Melih Yilmaz <[email protected]> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Noble-Lab · Feb 16, 2024 · b7f8ff9 · b7f8ff9
1 parent dae8392
commit b7f8ff9
Show file tree

Hide file tree

Showing 11 changed files with 113 additions and 198 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [4.1.0] - 2024-02-16
+
+### Changed
+
+- Instead of having to specify `train_from_scratch` in the config file, training will proceed from an existing model weights file if this is given as an argument to `casanovo train`.
+
+### Fixed
+
+- Fixed beam search decoding error due to non-deterministic selection of beams with equal scores.
+
+## [4.0.1] - 2023-12-25
+
+### Fixed
+
+- Fix automatic PyPI upload.
+
 ## [4.0.0] - 2023-12-22
 
 ### Added
@@ -217,7 +233,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 - Initial Casanovo version.
 
-[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...HEAD
+[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.1.0...HEAD
+[4.1.0]: https://github.com/Noble-Lab/casanovo/compare/v4.0.1...v4.1.0
+[4.0.1]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...v4.0.1
 [4.0.0]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...v4.0.0
 [3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
 [3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0

diff --git a/casanovo/config.py b/casanovo/config.py
@@ -65,7 +65,6 @@ class Config:
         top_match=int,
         max_epochs=int,
         num_sanity_val_steps=int,
-        train_from_scratch=bool,
         save_top_k=int,
         model_save_folder_path=str,
         val_check_interval=int,

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
@@ -99,8 +99,6 @@ train_batch_size: 32
 max_epochs: 30
 # Number of validation steps to run before training begins
 num_sanity_val_steps: 0
-# Set to "False" to further train a pre-trained Casanovo model
-train_from_scratch: True
 # Calculate peptide and amino acid precision during training. This
 # is expensive, so we recommend against it.
 calculate_precision: False

diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
@@ -607,21 +607,17 @@ def _get_topk_beams(
             scores[:, step, :, :], "B V S -> B (V S)"
         )
 
-        # Mask out terminated beams. Include precursor m/z tolerance induced
-        # termination.
-        # TODO: `clone()` is necessary to get the correct output with n_beams=1.
-        #   An alternative implementation using base PyTorch instead of einops
-        #   might be more efficient.
-        finished_mask = einops.repeat(
-            finished_beams, "(B S) -> B (V S)", S=beam, V=vocab
-        ).clone()
+        # Find all still active beams by masking out terminated beams.
+        active_mask = (
+            ~finished_beams.reshape(batch, beam).repeat(1, vocab)
+        ).float()
         # Mask out the index '0', i.e. padding token, by default.
-        finished_mask[:, :beam] = True
+        # FIXME: Set this to a very small, yet non-zero value, to only
+        # get padding after stop token.
+        active_mask[:, :beam] = 1e-8
 
         # Figure out the top K decodings.
-        _, top_idx = torch.topk(
-            step_scores.nanmean(dim=1) * (~finished_mask).float(), beam
-        )
+        _, top_idx = torch.topk(step_scores.nanmean(dim=1) * active_mask, beam)
         v_idx, s_idx = np.unravel_index(top_idx.cpu(), (vocab, beam))
         s_idx = einops.rearrange(s_idx, "B S -> (B S)")
         b_idx = einops.repeat(torch.arange(batch), "B -> (B S)", S=beam)

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
@@ -252,16 +252,16 @@ def initialize_model(self, train: bool) -> None:
             calculate_precision=self.config.calculate_precision,
         )
 
-        from_scratch = (
-            self.config.train_from_scratch,
-            self.model_filename is None,
-        )
-        if train and any(from_scratch):
-            self.model = Spec2Pep(**model_params)
-            return
-        elif self.model_filename is None:
-            logger.error("A model file must be provided")
-            raise ValueError("A model file must be provided")
+        if self.model_filename is None:
+            # Train a model from scratch if no model file is provided.
+            if train:
+                self.model = Spec2Pep(**model_params)
+                return
+            # Else we're not training, so a model file must be provided.
+            else:
+                logger.error("A model file must be provided")
+                raise ValueError("A model file must be provided")
+        # Else a model file is provided (to continue training or for inference).
 
         if not Path(self.model_filename).exists():
             logger.error(

diff --git a/casanovo/utils.py b/casanovo/utils.py
@@ -1,4 +1,4 @@
-"""Small utility functions."""
+"""Small utility functions"""
 
 import logging
 import os

diff --git a/docs/images/help.svg b/docs/images/help.svg
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -223,7 +223,6 @@ def tiny_config(tmp_path):
         "weight_decay": 1e-5,
         "train_batch_size": 32,
         "num_sanity_val_steps": 0,
-        "train_from_scratch": True,
         "calculate_precision": False,
         "residues": {
             "G": 57.021464,

diff --git a/tests/unit_tests/test_config.py b/tests/unit_tests/test_config.py
@@ -1,4 +1,4 @@
-"""Test configuration loading."""
+"""Test configuration loading"""
 
 import pytest
 import yaml

diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py
@@ -7,35 +7,39 @@
 from casanovo.denovo.model_runner import ModelRunner
 
 
-def test_initialize_model(tmp_path):
-    """Test that"""
+def test_initialize_model(tmp_path, mgf_small):
+    """Test initializing a new or existing model."""
     config = Config()
-    config.train_from_scratch = False
+    # No model filename given, so train from scratch.
     ModelRunner(config=config).initialize_model(train=True)
 
+    # No model filename given during inference = error.
     with pytest.raises(ValueError):
         ModelRunner(config=config).initialize_model(train=False)
 
-    with pytest.raises(FileNotFoundError):
-        runner = ModelRunner(config=config, model_filename="blah")
-        runner.initialize_model(train=True)
-
+    # Non-existing model filename given during inference = error.
     with pytest.raises(FileNotFoundError):
         runner = ModelRunner(config=config, model_filename="blah")
         runner.initialize_model(train=False)
 
-    # This should work now:
-    config.train_from_scratch = True
-    runner = ModelRunner(config=config, model_filename="blah")
+    # Train a quick model.
+    config.max_epochs = 1
+    config.n_layers = 1
+    ckpt = tmp_path / "existing.ckpt"
+    with ModelRunner(config=config) as runner:
+        runner.train([mgf_small], [mgf_small])
+        runner.trainer.save_checkpoint(ckpt)
+
+    # Resume training from previous model.
+    runner = ModelRunner(config=config, model_filename=str(ckpt))
     runner.initialize_model(train=True)
 
-    # But this should still fail:
-    with pytest.raises(FileNotFoundError):
-        runner = ModelRunner(config=config, model_filename="blah")
-        runner.initialize_model(train=False)
+    # Inference with previous model.
+    runner = ModelRunner(config=config, model_filename=str(ckpt))
+    runner.initialize_model(train=False)
 
     # If the model initialization throws an EOFError, then the Spec2Pep model
-    # has tried to load the weights:
+    # has tried to load the weights.
     weights = tmp_path / "blah"
     weights.touch()
     with pytest.raises(EOFError):
@@ -44,7 +48,7 @@ def test_initialize_model(tmp_path):
 
 
 def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
-    """Test saving aloading weights"""
+    """Test saving and loading weights"""
     config = Config(tiny_config)
     config.max_epochs = 1
     config.n_layers = 1

diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
@@ -424,6 +424,49 @@ def test_beam_search_decode():
     )
     assert torch.equal(discarded_beams, torch.tensor([False, True, True]))
 
+    # Test _get_topk_beams() with finished beams in the batch.
+    model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=3)
+
+    # Sizes and other variables.
+    batch = 2  # B
+    beam = model.n_beams  # S
+    model.decoder.reverse = True
+    length = model.max_length + 1  # L
+    vocab = model.decoder.vocab_size + 1  # V
+    step = 4
+
+    # Initialize dummy scores and tokens.
+    scores = torch.full(
+        size=(batch, length, vocab, beam), fill_value=torch.nan
+    )
+    scores = einops.rearrange(scores, "B L V S -> (B S) L V")
+    tokens = torch.zeros(batch * beam, length, dtype=torch.int64)
+
+    # Simulate non-zero amino acid-level probability scores.
+    scores[:, : step + 1, :] = torch.rand(batch, step + 1, vocab)
+    scores[:, step, range(1, 4)] = torch.tensor([1.0, 2.0, 3.0])
+
+    # Simulate one finished and one unfinished beam in the same batch.
+    tokens[0, :step] = torch.tensor([4, 14, 4, 28])
+    tokens[1, :step] = torch.tensor([4, 14, 4, 1])
+
+    # Set finished beams array to allow decoding from only one beam.
+    test_finished_beams = torch.tensor([True, False])
+
+    new_tokens, new_scores = model._get_topk_beams(
+        tokens, scores, test_finished_beams, batch, step
+    )
+
+    # Only the second peptide should have a new token predicted.
+    expected_tokens = torch.tensor(
+        [
+            [4, 14, 4, 28, 0],
+            [4, 14, 4, 1, 3],
+        ]
+    )
+
+    assert torch.equal(new_tokens[:, : step + 1], expected_tokens)
+
 
 def test_eval_metrics():
     """