Commit

Merge branch 'dev' into fix-modelrunner-inputs
melihyilmaz committed Nov 28, 2023
2 parents d906679 + 235420f commit b6b374f
Showing 8 changed files with 119 additions and 68 deletions.
15 changes: 6 additions & 9 deletions CHANGELOG.md
@@ -16,31 +16,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Changed

- The CLI has been overhauled to use subcommands.
- Upgraded to Lightning >=2.0
- Upgraded to Lightning >=2.0.
- Checkpointing is configured to save the top-k models instead of all.
- Log steps rather than epochs as units of progress during training.
- Validation performance metrics are logged (and added to tensorboard) at the validation epoch, and training loss is logged at the end of training epoch, i.e. training and validation metrics are logged asynchronously.
- Irrelevant warning messages on the console output and in the log file are no longer shown.
- Nicely format logged warnings.
- `every_n_train_steps` has been renamed to `val_check_interval` in accordance to the corresponding Pytorch Lightning parameter.
- Training batches are randomly shuffled.

### Fixed

- Casanovo runs on CPU and can passes all tests.
- Enable gradients during prediction and validation to avoid NaNs from occuring as a temporary workaround until a new Pytorch version is available.
- Upgrade to depthcharge v0.2.3 for `PeptideTransformerDecoder` hotfix.
- Upgraded to Torch >=2.1.

### Removed

- Remove config option for a custom Pytorch Lightning logger.
- Remove superfluous `custom_encoder` config option.

### Fixed

- Casanovo now runs on CPU and can passes all tests.
- Upgrade to Depthcharge v0.2.0 to fix sinusoidal encoding.
- Casanovo runs on CPU and can pass all tests.
- Correctly refer to input peak files by their full file path.
- Specifying custom residues to retrain Casanovo is now possible.
- Upgrade to depthcharge v0.2.3 to fix sinusoidal encoding and for the `PeptideTransformerDecoder` hotfix.
- Correctly report amino acid precision and recall during validation.

## [3.3.0] - 2023-04-04

12 changes: 11 additions & 1 deletion casanovo/config.py
@@ -83,7 +83,17 @@ def __init__(self, config_file: Optional[str] = None):
else:
with Path(config_file).open() as f_in:
self._user_config = yaml.safe_load(f_in)

# check for missing entries in config file
if len(self._user_config.keys()) < len(self._params.keys()):
keys_set = set(self._params.keys())
users_set = set(self._user_config.keys())
missing = list(keys_set - users_set)
raise KeyError(f"Missing expected entry {missing}")
# detect unrecognized config file entries
keys = list(self._params.keys())
for key, val in self._user_config.items():
if key not in keys:
raise KeyError(f"Unrecognized config file entry {key}")
# Validate:
for key, val in self._config_types.items():
self.validate_param(key, val)
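For context, the two checks added above amount to a symmetric comparison between the keys in the user's YAML file and the keys of the packaged defaults. A minimal standalone sketch of the same idea, where `defaults` stands in for `self._params` and `user_cfg` for the parsed user config (illustrative names, not the actual Config internals):

def check_config_keys(defaults: dict, user_cfg: dict) -> None:
    """Raise a KeyError if the user config omits or misspells any entry."""
    # Mirrors the "missing expected entry" check above.
    missing = sorted(set(defaults) - set(user_cfg))
    if missing:
        raise KeyError(f"Missing expected entry {missing}")
    # Mirrors the "unrecognized config file entry" check above.
    unrecognized = sorted(set(user_cfg) - set(defaults))
    if unrecognized:
        raise KeyError(f"Unrecognized config file entry {unrecognized}")

Either failure mode surfaces as a KeyError when Config is constructed, which is what the updated tests/unit_tests/test_config.py below asserts via pytest.raises(KeyError).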
3 changes: 0 additions & 3 deletions casanovo/config.yaml
@@ -79,9 +79,6 @@ dropout: 0.0
# Number of dimensions to use for encoding peak intensity
# Projected up to ``dim_model`` by default and summed with the peak m/z encoding
dim_intensity:
# Option to provide a pre-trained spectrum encoder when training
# Trained from scratch by default
custom_encoder:
# Max decoded peptide length
max_length: 100
# Number of warmup iterations for learning rate scheduler
73 changes: 31 additions & 42 deletions casanovo/denovo/model.py
@@ -43,9 +43,6 @@ class Spec2Pep(pl.LightningModule, ModelMixin):
(``dim_model - dim_intensity``) are reserved for encoding the m/z value.
If ``None``, the intensity will be projected up to ``dim_model`` using a
linear layer, then summed with the m/z encoding for each peak.
custom_encoder : Optional[Union[SpectrumEncoder, PairedSpectrumEncoder]]
A pretrained encoder to use. The ``dim_model`` of the encoder must be
the same as that specified by the ``dim_model`` parameter here.
max_length : int
The maximum peptide length to decode.
residues: Union[Dict[str, float], str]
@@ -97,7 +94,6 @@ def __init__(
n_layers: int = 9,
dropout: float = 0.0,
dim_intensity: Optional[int] = None,
custom_encoder: Optional[SpectrumEncoder] = None,
max_length: int = 100,
residues: Union[Dict[str, float], str] = "canonical",
max_charge: int = 5,
@@ -120,17 +116,14 @@ def __init__(
self.save_hyperparameters()

# Build the model.
if custom_encoder is not None:
self.encoder = custom_encoder
else:
self.encoder = SpectrumEncoder(
dim_model=dim_model,
n_head=n_head,
dim_feedforward=dim_feedforward,
n_layers=n_layers,
dropout=dropout,
dim_intensity=dim_intensity,
)
self.encoder = SpectrumEncoder(
dim_model=dim_model,
n_head=n_head,
dim_feedforward=dim_feedforward,
n_layers=n_layers,
dropout=dropout,
dim_intensity=dim_intensity,
)
self.decoder = PeptideDecoder(
dim_model=dim_model,
n_head=n_head,
@@ -758,9 +751,7 @@ def validation_step(
The loss of the validation step.
"""
# Record the loss.
# FIXME: Temporary workaround to avoid the NaN bug.
with torch.set_grad_enabled(True):
loss = self.training_step(batch, mode="valid")
loss = self.training_step(batch, mode="valid")
if not self.calculate_precision:
return loss

@@ -773,8 +764,8 @@ def validation_step(

aa_precision, _, pep_precision = evaluate.aa_match_metrics(
*evaluate.aa_match_batch(
peptides_pred,
peptides_true,
peptides_pred,
self.decoder._peptide_mass.masses,
)
)
@@ -811,30 +802,28 @@ def predict_step(
and amino acid-level confidence scores.
"""
predictions = []
# FIXME: Temporary workaround to avoid the NaN bug.
with torch.set_grad_enabled(True):
for (
precursor_charge,
precursor_mz,
spectrum_i,
spectrum_preds,
) in zip(
batch[1][:, 1].cpu().detach().numpy(),
batch[1][:, 2].cpu().detach().numpy(),
batch[2],
self.forward(batch[0], batch[1]),
):
for peptide_score, aa_scores, peptide in spectrum_preds:
predictions.append(
(
spectrum_i,
precursor_charge,
precursor_mz,
peptide,
peptide_score,
aa_scores,
)
for (
precursor_charge,
precursor_mz,
spectrum_i,
spectrum_preds,
) in zip(
batch[1][:, 1].cpu().detach().numpy(),
batch[1][:, 2].cpu().detach().numpy(),
batch[2],
self.forward(batch[0], batch[1]),
):
for peptide_score, aa_scores, peptide in spectrum_preds:
predictions.append(
(
spectrum_i,
precursor_charge,
precursor_mz,
peptide,
peptide_score,
aa_scores,
)
)

return predictions
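
Each tuple appended above has a fixed field order, so the list returned by predict_step can be unpacked directly by downstream code. A hedged consumer sketch (the tab-separated print formatting is illustrative only):

# Field order follows the predictions.append(...) call above.
for spectrum_i, precursor_charge, precursor_mz, peptide, peptide_score, aa_scores in predictions:
    print(f"{spectrum_i}\t{precursor_charge}\t{precursor_mz}\t{peptide}\t{peptide_score:.3f}")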

1 change: 0 additions & 1 deletion casanovo/denovo/model_runner.py
@@ -212,7 +212,6 @@ def initialize_model(self, train: bool) -> None:
n_layers=self.config.n_layers,
dropout=self.config.dropout,
dim_intensity=self.config.dim_intensity,
custom_encoder=self.config.custom_encoder,
max_length=self.config.max_length,
residues=self.config.residues,
max_charge=self.config.max_charge,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
"scikit-learn",
"spectrum_utils",
"tensorboard",
"torch>=2.0",
"torch>=2.1",
"tqdm",
]
dynamic = ["version"]
58 changes: 57 additions & 1 deletion tests/conftest.py
@@ -188,14 +188,70 @@ def tiny_config(tmp_path):
"""A config file for a tiny model."""
cfg = {
"n_head": 2,
"dim_feedfoward": 10,
"dim_feedforward": 10,
"n_layers": 1,
"warmup_iters": 1,
"max_iters": 1,
"max_epochs": 20,
"val_check_interval": 1,
"model_save_folder_path": str(tmp_path),
"accelerator": "cpu",
"precursor_mass_tol": 5,
"isotope_error_range": [0, 1],
"min_peptide_len": 6,
"predict_batch_size": 1024,
"n_beams": 1,
"top_match": 1,
"devices": None,
"random_seed": 454,
"n_log": 1,
"tb_summarywriter": None,
"save_top_k": 5,
"n_peaks": 150,
"min_mz": 50.0,
"max_mz": 2500.0,
"min_intensity": 0.01,
"remove_precursor_tol": 2.0,
"max_charge": 10,
"dim_model": 512,
"dropout": 0.0,
"dim_intensity": None,
"max_length": 100,
"learning_rate": 5e-4,
"weight_decay": 1e-5,
"train_batch_size": 32,
"num_sanity_val_steps": 0,
"train_from_scratch": True,
"calculate_precision": False,
"residues": {
"G": 57.021464,
"A": 71.037114,
"S": 87.032028,
"P": 97.052764,
"V": 99.068414,
"T": 101.047670,
"C+57.021": 160.030649,
"L": 113.084064,
"I": 113.084064,
"N": 114.042927,
"D": 115.026943,
"Q": 128.058578,
"K": 128.094963,
"E": 129.042593,
"M": 131.040485,
"H": 137.058912,
"F": 147.068414,
"R": 156.101111,
"Y": 163.063329,
"W": 186.079313,
"M+15.995": 147.035400,
"N+0.984": 115.026943,
"Q+0.984": 129.042594,
"+42.011": 42.010565,
"+43.006": 43.005814,
"-17.027": -17.026549,
"+43.006-17.027": 25.980265,
},
}

cfg_file = tmp_path / "config.yml"
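The diff context truncates the fixture here. A fixture of this shape would typically finish by serializing cfg to the temporary file and returning its path for tests to load through Config; the continuation below is an assumption-labeled sketch, not necessarily the repository's actual code:

    # Assumed continuation (illustrative); presumes `import yaml` at the top
    # of conftest.py. Write the dict to disk and hand the path to the tests.
    with cfg_file.open("w+") as out_file:
        yaml.dump(cfg, out_file)
    return str(cfg_file)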
23 changes: 13 additions & 10 deletions tests/unit_tests/test_config.py
@@ -1,5 +1,7 @@
"""Test configuration loading"""
from casanovo.config import Config
import pytest
import yaml


def test_default():
@@ -11,7 +13,7 @@ def test_default():
assert config.file == "default"


def test_override(tmp_path):
def test_override(tmp_path, tiny_config):
"""Test overriding the default"""
yml = tmp_path / "test.yml"
with yml.open("w+") as f_out:
@@ -26,12 +28,13 @@ def test_override(tmp_path):
"""
)

config = Config(yml)
assert config.random_seed == 42
assert config["random_seed"] == 42
assert config.accelerator == "auto"
assert config.top_match == 3
assert len(config.residues) == 4
for i, residue in enumerate("WOUT", 1):
assert config["residues"][residue] == i
assert config.file == str(yml)
with open(tiny_config, "r") as read_file:
contents = yaml.safe_load(read_file)
contents["random_seed_"] = 354

with open("output.yml", "w") as write_file:
yaml.safe_dump(contents, write_file)
with pytest.raises(KeyError):
config = Config("output.yml")
with pytest.raises(KeyError):
config = Config(yml)
