Skip to content

Commit

Permalink
Prepare for release v4.1.0 (#296)
Browse files Browse the repository at this point in the history
* Remove `train_from_scratch` config option (#275)

Instead of having to specify `train_from_scratch` in the config file, training will proceed from an existing model weights file if this is given as an argument to `casanovo train`.

Fixes #263.

* Stabilize torch.topk() behavior (#290)

* Add epsilon to index zero

* Fix typo

* Use base PyTorch for repeating along the vocabulary size

* Combine masking steps

* Lint with updated black version

* Lint test files

* Add topk unit test

* Fix lint

* Add fixme comment for future

* Update changelog

* Generate new screengrabs with rich-codex

---------

Co-authored-by: Wout Bittremieux <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* Update changelog

---------

Co-authored-by: Melih Yilmaz <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Feb 16, 2024
1 parent dae8392 commit b7f8ff9
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 198 deletions.
20 changes: 19 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased]

## [4.1.0] - 2024-02-16

### Changed

- Instead of having to specify `train_from_scratch` in the config file, training will proceed from an existing model weights file if this is given as an argument to `casanovo train`.

### Fixed

- Fixed beam search decoding error due to non-deterministic selection of beams with equal scores.

## [4.0.1] - 2023-12-25

### Fixed

- Fix automatic PyPI upload.

## [4.0.0] - 2023-12-22

### Added
Expand Down Expand Up @@ -217,7 +233,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- Initial Casanovo version.

[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...HEAD
[Unreleased]: https://github.com/Noble-Lab/casanovo/compare/v4.1.0...HEAD
[4.1.0]: https://github.com/Noble-Lab/casanovo/compare/v4.0.1...v4.1.0
[4.0.1]: https://github.com/Noble-Lab/casanovo/compare/v4.0.0...v4.0.1
[4.0.0]: https://github.com/Noble-Lab/casanovo/compare/v3.5.0...v4.0.0
[3.5.0]: https://github.com/Noble-Lab/casanovo/compare/v3.4.0...v3.5.0
[3.4.0]: https://github.com/Noble-Lab/casanovo/compare/v3.3.0...v3.4.0
Expand Down
1 change: 0 additions & 1 deletion casanovo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ class Config:
top_match=int,
max_epochs=int,
num_sanity_val_steps=int,
train_from_scratch=bool,
save_top_k=int,
model_save_folder_path=str,
val_check_interval=int,
Expand Down
2 changes: 0 additions & 2 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,6 @@ train_batch_size: 32
max_epochs: 30
# Number of validation steps to run before training begins
num_sanity_val_steps: 0
# Set to "False" to further train a pre-trained Casanovo model
train_from_scratch: True
# Calculate peptide and amino acid precision during training. This
# is expensive, so we recommend against it.
calculate_precision: False
Expand Down
20 changes: 8 additions & 12 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,21 +607,17 @@ def _get_topk_beams(
scores[:, step, :, :], "B V S -> B (V S)"
)

# Mask out terminated beams. Include precursor m/z tolerance induced
# termination.
# TODO: `clone()` is necessary to get the correct output with n_beams=1.
# An alternative implementation using base PyTorch instead of einops
# might be more efficient.
finished_mask = einops.repeat(
finished_beams, "(B S) -> B (V S)", S=beam, V=vocab
).clone()
# Find all still active beams by masking out terminated beams.
active_mask = (
~finished_beams.reshape(batch, beam).repeat(1, vocab)
).float()
# Mask out the index '0', i.e. padding token, by default.
finished_mask[:, :beam] = True
# FIXME: Set this to a very small, yet non-zero value, to only
# get padding after stop token.
active_mask[:, :beam] = 1e-8

# Figure out the top K decodings.
_, top_idx = torch.topk(
step_scores.nanmean(dim=1) * (~finished_mask).float(), beam
)
_, top_idx = torch.topk(step_scores.nanmean(dim=1) * active_mask, beam)
v_idx, s_idx = np.unravel_index(top_idx.cpu(), (vocab, beam))
s_idx = einops.rearrange(s_idx, "B S -> (B S)")
b_idx = einops.repeat(torch.arange(batch), "B -> (B S)", S=beam)
Expand Down
20 changes: 10 additions & 10 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,16 +252,16 @@ def initialize_model(self, train: bool) -> None:
calculate_precision=self.config.calculate_precision,
)

from_scratch = (
self.config.train_from_scratch,
self.model_filename is None,
)
if train and any(from_scratch):
self.model = Spec2Pep(**model_params)
return
elif self.model_filename is None:
logger.error("A model file must be provided")
raise ValueError("A model file must be provided")
if self.model_filename is None:
# Train a model from scratch if no model file is provided.
if train:
self.model = Spec2Pep(**model_params)
return
# Else we're not training, so a model file must be provided.
else:
logger.error("A model file must be provided")
raise ValueError("A model file must be provided")
# Else a model file is provided (to continue training or for inference).

if not Path(self.model_filename).exists():
logger.error(
Expand Down
2 changes: 1 addition & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Small utility functions."""
"""Small utility functions"""

import logging
import os
Expand Down
164 changes: 11 additions & 153 deletions docs/images/help.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 0 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ def tiny_config(tmp_path):
"weight_decay": 1e-5,
"train_batch_size": 32,
"num_sanity_val_steps": 0,
"train_from_scratch": True,
"calculate_precision": False,
"residues": {
"G": 57.021464,
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Test configuration loading."""
"""Test configuration loading"""

import pytest
import yaml
Expand Down
36 changes: 20 additions & 16 deletions tests/unit_tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,39 @@
from casanovo.denovo.model_runner import ModelRunner


def test_initialize_model(tmp_path):
"""Test that"""
def test_initialize_model(tmp_path, mgf_small):
"""Test initializing a new or existing model."""
config = Config()
config.train_from_scratch = False
# No model filename given, so train from scratch.
ModelRunner(config=config).initialize_model(train=True)

# No model filename given during inference = error.
with pytest.raises(ValueError):
ModelRunner(config=config).initialize_model(train=False)

with pytest.raises(FileNotFoundError):
runner = ModelRunner(config=config, model_filename="blah")
runner.initialize_model(train=True)

# Non-existing model filename given during inference = error.
with pytest.raises(FileNotFoundError):
runner = ModelRunner(config=config, model_filename="blah")
runner.initialize_model(train=False)

# This should work now:
config.train_from_scratch = True
runner = ModelRunner(config=config, model_filename="blah")
# Train a quick model.
config.max_epochs = 1
config.n_layers = 1
ckpt = tmp_path / "existing.ckpt"
with ModelRunner(config=config) as runner:
runner.train([mgf_small], [mgf_small])
runner.trainer.save_checkpoint(ckpt)

# Resume training from previous model.
runner = ModelRunner(config=config, model_filename=str(ckpt))
runner.initialize_model(train=True)

# But this should still fail:
with pytest.raises(FileNotFoundError):
runner = ModelRunner(config=config, model_filename="blah")
runner.initialize_model(train=False)
# Inference with previous model.
runner = ModelRunner(config=config, model_filename=str(ckpt))
runner.initialize_model(train=False)

# If the model initialization throws an EOFError, then the Spec2Pep model
# has tried to load the weights:
# has tried to load the weights.
weights = tmp_path / "blah"
weights.touch()
with pytest.raises(EOFError):
Expand All @@ -44,7 +48,7 @@ def test_initialize_model(tmp_path):


def test_save_and_load_weights(tmp_path, mgf_small, tiny_config):
"""Test saving aloading weights"""
"""Test saving and loading weights"""
config = Config(tiny_config)
config.max_epochs = 1
config.n_layers = 1
Expand Down
43 changes: 43 additions & 0 deletions tests/unit_tests/test_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,49 @@ def test_beam_search_decode():
)
assert torch.equal(discarded_beams, torch.tensor([False, True, True]))

# Test _get_topk_beams() with finished beams in the batch.
model = Spec2Pep(n_beams=1, residues="massivekb", min_peptide_len=3)

# Sizes and other variables.
batch = 2 # B
beam = model.n_beams # S
model.decoder.reverse = True
length = model.max_length + 1 # L
vocab = model.decoder.vocab_size + 1 # V
step = 4

# Initialize dummy scores and tokens.
scores = torch.full(
size=(batch, length, vocab, beam), fill_value=torch.nan
)
scores = einops.rearrange(scores, "B L V S -> (B S) L V")
tokens = torch.zeros(batch * beam, length, dtype=torch.int64)

# Simulate non-zero amino acid-level probability scores.
scores[:, : step + 1, :] = torch.rand(batch, step + 1, vocab)
scores[:, step, range(1, 4)] = torch.tensor([1.0, 2.0, 3.0])

# Simulate one finished and one unfinished beam in the same batch.
tokens[0, :step] = torch.tensor([4, 14, 4, 28])
tokens[1, :step] = torch.tensor([4, 14, 4, 1])

# Set finished beams array to allow decoding from only one beam.
test_finished_beams = torch.tensor([True, False])

new_tokens, new_scores = model._get_topk_beams(
tokens, scores, test_finished_beams, batch, step
)

# Only the second peptide should have a new token predicted.
expected_tokens = torch.tensor(
[
[4, 14, 4, 28, 0],
[4, 14, 4, 1, 3],
]
)

assert torch.equal(new_tokens[:, : step + 1], expected_tokens)


def test_eval_metrics():
"""
Expand Down

0 comments on commit b7f8ff9

Please sign in to comment.