Migration to depthcharge v0.4.8 #350

Open · wants to merge 49 commits into base: dev

Commits (49)
6826a1c  migration to depthcharge v0.4.8 (Jul 1, 2024)
8c8dc61  shuffling training set by default (Jul 1, 2024)
70cdea6  Reformat with Black (wfondrie, Jul 27, 2024)
8771d78  Fix formatting again after merge (wfondrie, Jul 27, 2024)
7984bdc  Resolve requested changes (Jul 29, 2024)
f4b6ec6  Reformat with Black (wfondrie, Jul 29, 2024)
4ec36b3  removed invalid imports (Lilferrit, Sep 10, 2024)
355edc6  removed to be added functionality (for now) (Lilferrit, Sep 18, 2024)
d224011  tensorboard logger (Lilferrit, Sep 18, 2024)
e6ac94e  circular import bug (Lilferrit, Sep 18, 2024)
39de098  removed tensorboard unit tests (Lilferrit, Sep 18, 2024)
97b8de7  beam search decode unit tests (IP) (Lilferrit, Sep 18, 2024)
2ee2845  teast_beam_search decode test update (Lilferrit, Sep 19, 2024)
9b9349d  test_eval_metrics test update (Lilferrit, Sep 19, 2024)
0295493  unit tests updates (Lilferrit, Sep 20, 2024)
3d1c20f  spectrum id unit tests (Lilferrit, Sep 24, 2024)
3ec8d7c  integration test fix (Lilferrit, Sep 24, 2024)
9b8efea  model prediction io flow fixes (Lilferrit, Sep 25, 2024)
47df27e  PyLightning logging refactor (Lilferrit, Sep 25, 2024)
45b3e26  mgf file reader title field formatting (Lilferrit, Sep 27, 2024)
a1b42af  integration tests fix (Lilferrit, Sep 30, 2024)
261f63c  integration tests (Lilferrit, Sep 30, 2024)
e3e8456  test_initialize_model fix (Lilferrit, Oct 1, 2024)
0fb6692  test_save_and_load_weights fix (Lilferrit, Oct 1, 2024)
5594bf8  test_save_and_load_weights_deprecated fix (Lilferrit, Oct 1, 2024)
7bd2b5e  test_evaluate fix, evaluate unnanotated peak file error handling (Lilferrit, Oct 1, 2024)
d178860  test_evaluate fix, evaluate unnanotated peak file error handling (Lilferrit, Oct 1, 2024)
340695a  test_eval_metrics fix (Lilferrit, Oct 1, 2024)
e4d93f9  test_spectrum_id tests fix (Lilferrit, Oct 1, 2024)
eb4af71  unit tests fixes (Lilferrit, Oct 2, 2024)
2a946c2  teast_beam_search_decode fix (Lilferrit, Oct 2, 2024)
17bc3a2  negative residue work around (Lilferrit, Oct 2, 2024)
7d789a7  depthcharge upgrade - all unit tests pass (Lilferrit, Oct 7, 2024)
c1ca436  pylance depthcharge compatability fix (Lilferrit, Oct 8, 2024)
2d539fd  removed scans field from dataloaders (Lilferrit, Oct 14, 2024)
6ab3397  non db functionality working (Lilferrit, Nov 21, 2024)
9dc293f  import orders, CasanovoDB psm batching (Lilferrit, Nov 26, 2024)
051a82a  CasanovoDB unit tests (Lilferrit, Nov 26, 2024)
8ebb55a  no batch made edge case (Lilferrit, Nov 26, 2024)
a6a2db8  mass caclulation (Lilferrit, Nov 26, 2024)
d3cd392  CasanovoDB mass mod fixes (Lilferrit, Nov 27, 2024)
113c879  remove unsqueeze batch method (Lilferrit, Nov 27, 2024)
54366a5  reduced test epochs from 20 to 15 (Lilferrit, Nov 28, 2024)
3028cd2  integration test fix (Lilferrit, Dec 2, 2024)
ec20013  integration test fix (Lilferrit, Dec 2, 2024)
2233839  psm batch generator unit test (Lilferrit, Dec 2, 2024)
c612785  cleanup debug code (Lilferrit, Dec 2, 2024)
c43c515  disable multi threading on linux (Lilferrit, Dec 6, 2024)
2123894  skip n_threads unit test (Lilferrit, Dec 6, 2024)
1 change: 0 additions & 1 deletion casanovo/__init__.py
@@ -1,4 +1,3 @@
 from .version import _get_version
 
-
 __version__ = _get_version()
13 changes: 6 additions & 7 deletions casanovo/casanovo.py
@@ -41,10 +41,9 @@
 import tqdm
 from lightning.pytorch import seed_everything
 
-from . import __version__
-from . import utils
-from .denovo import ModelRunner
+from . import __version__, utils
 from .config import Config
+from .denovo import ModelRunner
 
 logger = logging.getLogger("casanovo")
 click.rich_click.USE_MARKDOWN = True
@@ -139,7 +138,7 @@ def main() -> None:
     "peak_path",
     required=True,
     nargs=-1,
-    type=click.Path(exists=True, dir_okay=False),
+    type=click.Path(exists=True, dir_okay=True),
 )
 @click.option(
     "--evaluate",
@@ -206,7 +205,7 @@ def sequence(
     "peak_path",
     required=True,
     nargs=-1,
-    type=click.Path(exists=True, dir_okay=False),
+    type=click.Path(exists=True, dir_okay=True),
 )
 @click.argument(
     "fasta_path",
@@ -266,7 +265,7 @@ def db_search(
     "train_peak_path",
     required=True,
     nargs=-1,
-    type=click.Path(exists=True, dir_okay=False),
+    type=click.Path(exists=True, dir_okay=True),
 )
 @click.option(
     "-p",
@@ -277,7 +276,7 @@ def db_search(
     """,
     required=False,
     multiple=True,
-    type=click.Path(exists=True, dir_okay=False),
+    type=click.Path(exists=True, dir_okay=True),
 )
 def train(
     train_peak_path: Tuple[str],
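Note: aside from the import reshuffle, the only behavioral change in casanovo.py is dir_okay=True on every peak-path argument, so the CLI now accepts directories of peak files as well as individual files. A minimal sketch of how directory inputs could be expanded into file lists downstream; the helper name and suffix set are illustrative assumptions, not Casanovo's actual code:

    from pathlib import Path
    from typing import Iterable, List

    PEAK_SUFFIXES = {".mgf", ".mzml", ".mzxml"}  # assumed peak file extensions

    def expand_peak_paths(peak_paths: Iterable[str]) -> List[str]:
        """Expand directories into the peak files they contain; pass files through."""
        expanded: List[str] = []
        for path in map(Path, peak_paths):
            if path.is_dir():
                expanded.extend(
                    str(child)
                    for child in sorted(path.iterdir())
                    if child.suffix.lower() in PEAK_SUFFIXES
                )
            else:
                expanded.append(str(path))
        return expanded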
18 changes: 17 additions & 1 deletion casanovo/config.py
@@ -4,7 +4,7 @@
 import shutil
 import warnings
 from pathlib import Path
-from typing import Optional, Dict, Callable, Tuple, Union
+from typing import Callable, Dict, Optional, Tuple, Union
 
 import yaml
 
@@ -55,6 +55,12 @@ class Config:
         max_charge=int,
         precursor_mass_tol=float,
         isotope_error_range=lambda min_max: (int(min_max[0]), int(min_max[1])),
+        enzyme=str,
+        digestion=str,
+        missed_cleavages=int,
+        max_mods=int,
+        allowed_fixed_mods=str,
+        allowed_var_mods=str,
         min_peptide_len=int,
         dim_model=int,
         n_head=int,
@@ -83,6 +89,16 @@ class Config:
         calculate_precision=bool,
         accelerator=str,
         devices=int,
+        lance_dir=str,
+        shuffle=bool,
+        buffer_size=int,
+        reverse_peptides=bool,
+        replace_isoleucine_with_leucine=bool,
+        accumulate_grad_batches=int,
+        gradient_clip_val=float,
+        gradient_clip_algorithm=str,
+        precision=str,
+        mskb_tokenizer=bool,
     )
 
     def __init__(self, config_file: Optional[str] = None):
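Each entry in this map pairs a config key with a callable used to coerce the raw YAML value, which is why isotope_error_range uses a lambda rather than a plain type. A small sketch of the idea under that assumption; how Config applies the map internally is not shown in this diff:

    # Sketch: coercing raw YAML values through a type map like Config._config_types.
    config_types = dict(
        precursor_mass_tol=float,
        isotope_error_range=lambda min_max: (int(min_max[0]), int(min_max[1])),
        buffer_size=int,
    )

    raw = {"precursor_mass_tol": "50", "isotope_error_range": [0, 1], "buffer_size": "100000"}
    coerced = {key: config_types[key](val) for key, val in raw.items() if val is not None}
    assert coerced == {"precursor_mass_tol": 50.0, "isotope_error_range": (0, 1), "buffer_size": 100000}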
39 changes: 29 additions & 10 deletions casanovo/config.yaml
@@ -63,8 +63,8 @@ max_mods: 1
 # where aa is a standard amino acid (or "nterm" for an N-terminal mod)
 # and mod_residue is a key from the "residues" dictionary.
 # Example: "M:M+15.995,nterm:+43.006"
-allowed_fixed_mods: "C:C+57.021"
-allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027"
+allowed_fixed_mods: "C:C[Carbamidomethyl]"
+allowed_var_mods: "M:M[Oxidation],N:N[Deamidated],Q:Q[Deamidated],nterm:[Acetyl]-,nterm:[Carbamyl]-,nterm:[Ammonia-loss]-,nterm:[+25.980265]-"
 
 
 ###
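The allowed-mods strings keep the comma-separated "aa:mod_residue" syntax described in the comments; only the mod tokens move from delta-mass notation to bracketed names. A throwaway parser to illustrate the syntax; the helper is hypothetical, not Casanovo code:

    def parse_mod_spec(spec: str) -> list[tuple[str, str]]:
        """Split "M:M[Oxidation],nterm:[Acetyl]-" into (aa, mod_residue) pairs."""
        pairs = []
        for entry in spec.split(","):
            aa, mod_residue = entry.split(":", 1)
            pairs.append((aa, mod_residue))
        return pairs

    assert parse_mod_spec("M:M[Oxidation],nterm:[Acetyl]-") == [
        ("M", "M[Oxidation]"),
        ("nterm", "[Acetyl]-"),
    ]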
@@ -84,6 +84,8 @@ tb_summarywriter: false
 log_metrics: false
 # How often to log optimizer parameters in steps
 log_every_n_steps: 50
+# Path to save lance instances
+lance_dir:
 # Model validation and checkpointing frequency in training steps.
 val_check_interval: 50_000
 
@@ -125,6 +127,10 @@ learning_rate: 5e-4
 weight_decay: 1e-5
 # Amount of label smoothing when computing the training loss.
 train_label_smoothing: 0.01
+# Shuffle dataset during training.
+# A buffer of size buffer_size is filled and examples from this buffer are randomly sampled.
+shuffle: True
+buffer_size: 100_000
 
 # TRAINING/INFERENCE OPTIONS
 # Number of spectra in one training batch.
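The new shuffle/buffer_size options describe buffered shuffling: a fixed-size buffer is filled from the lazily read dataset and examples are drawn from it at random, trading full-dataset shuffling for O(buffer_size) memory. A minimal sketch of that idea, assuming nothing about the depthcharge implementation:

    import random
    from typing import Iterable, Iterator, TypeVar

    T = TypeVar("T")

    def buffered_shuffle(items: Iterable[T], buffer_size: int, seed: int = 0) -> Iterator[T]:
        """Yield items in approximately random order using a fixed-size buffer."""
        rng = random.Random(seed)
        buffer: list[T] = []
        for item in items:
            if len(buffer) < buffer_size:
                buffer.append(item)       # fill the buffer first
            else:
                idx = rng.randrange(buffer_size)
                yield buffer[idx]         # emit a random buffered element
                buffer[idx] = item        # replace it with the incoming one
        rng.shuffle(buffer)               # drain the remainder in random order
        yield from buffer

A larger buffer_size approaches a full shuffle at the cost of memory, which matters for spectra streamed from on-disk lance datasets.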
@@ -137,6 +143,19 @@ num_sanity_val_steps: 0
 # This is expensive, so we recommend against it.
 calculate_precision: False
 
+# Additional PyTorch Lightning trainer flags
+accumulate_grad_batches: 1
+gradient_clip_val:
+gradient_clip_algorithm:
+precision: "32-true" # '16-true', '16-mixed', 'bf16-true', 'bf16-mixed', '32-true', '64-true', '64', '32', '16', 'bf16'
+
+# Replace I with L in peptide sequences
+replace_isoleucine_with_leucine: True
+# Reverse peptide sequences
+reverse_peptides: True
+# Use the mskb tokenizer, otherwise ProForma syntax
+mskb_tokenizer: True
+
 # AMINO ACID AND MODIFICATION VOCABULARY
 residues:
   "G": 57.021464
@@ -145,7 +164,7 @@ residues:
   "P": 97.052764
   "V": 99.068414
   "T": 101.047670
-  "C+57.021": 160.030649 # 103.009185 + 57.021464
+  "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464
   "L": 113.084064
   "I": 113.084064
   "N": 114.042927
@@ -160,11 +179,11 @@ residues:
   "Y": 163.063329
   "W": 186.079313
   # Amino acid modifications.
-  "M+15.995": 147.035400 # Met oxidation: 131.040485 + 15.994915
-  "N+0.984": 115.026943 # Asn deamidation: 114.042927 + 0.984016
-  "Q+0.984": 129.042594 # Gln deamidation: 128.058578 + 0.984016
+  "M[Oxidation]": 147.035400 # Met oxidation: 131.040485 + 15.994915
+  "N[Deamidated]": 115.026943 # Asn deamidation: 114.042927 + 0.984016
+  "Q[Deamidated]": 129.042594 # Gln deamidation: 128.058578 + 0.984016
   # N-terminal modifications.
-  "+42.011": 42.010565 # Acetylation
-  "+43.006": 43.005814 # Carbamylation
-  "-17.027": -17.026549 # NH3 loss
-  "+43.006-17.027": 25.980265 # Carbamylation and NH3 loss
+  "[Acetyl]-": 42.010565 # Acetylation
+  "[Carbamyl]-": 43.005814 # Carbamylation "+43.006"
+  "[Ammonia-loss]-": -17.026549 # NH3 loss
+  "[+25.980265]-": 25.980265 # Carbamylation and NH3 loss