diff --git a/CHANGELOG.md b/CHANGELOG.md index 240185d1..c73eec8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Added +- Casanovo-DB mode (`casanovo db_search`) to use Casanovo as a learned score function for sequence database searching (given a FASTA protein database). - During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run. - Besides as a local file, model weights can be specified from a URL. Upon initial download, the weights file is cached for future re-use. - Training and optimizer metrics can now be logged to a CSV file by setting the `log_metrics` config file option to true - the CSV file will be written to under a sub-directory of the output directory named `csv_logs`. diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 419021dd..fef73a9b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -12,7 +12,7 @@ import urllib.parse import warnings from pathlib import Path -from typing import Optional, Tuple, List +from typing import Optional, Tuple warnings.formatwarning = lambda message, category, *args, **kwargs: ( f"{category.__name__}: {message}" @@ -62,18 +62,19 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-m", "--model"), help=""" - Either the model weights (.ckpt file) or a URL pointing to the model weights - file. If not provided, Casanovo will try to download the latest release automatically. + Either the model weights (.ckpt file) or a URL pointing to the + model weights file. If not provided, Casanovo will try to + download the latest release automatically. """, ), click.Option( ("-d", "--output_dir"), - help="The destination directory for output files", + help="The destination directory for output files.", type=click.Path(dir_okay=True), ), click.Option( ("-o", "--output_root"), - help="The root name for all output files", + help="The root name for all output files.", type=click.Path(dir_okay=False), ), click.Option( @@ -112,9 +113,9 @@ def main() -> None: ======== Casanovo de novo sequences peptides from tandem mass spectra using a - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files - for de novo sequencing and annotated MGF files, such as those from - MassIVE-KB, for training new models. + Transformer model. Casanovo currently supports mzML, mzXML, and MGF + files for de novo sequencing and annotated MGF files, such as those + from MassIVE-KB, for training new models. Links: @@ -123,10 +124,10 @@ def main() -> None: If you use Casanovo in your work, please cite: - - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo - mass spectrometry peptide sequencing with a transformer model. Proceedings - of the 39th International Conference on Machine Learning - ICML '22 (2022) - doi:10.1101/2022.02.07.479481. + - Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. + De novo mass spectrometry peptide sequencing with a transformer + model. Proceedings of the 39th International Conference on Machine + Learning - ICML '22 (2022) doi:10.1101/2022.02.07.479481. """ @@ -146,9 +147,9 @@ def main() -> None: is_flag=True, default=False, help=""" - Run in evaluation mode. When this flag is set the peptide and amino - acid precision will be calculated and logged at the end of the sequencing - run. All input files must be annotated MGF files if running in evaluation + Run in evaluation mode. 
When this flag is set the peptide and amino acid
+    precision will be calculated and logged at the end of the sequencing run.
+    All input files must be annotated MGF files if running in evaluation
    mode.
    """,
)
@@ -195,9 +196,68 @@ def sequence(
        str((output_path / output_root_name).with_suffix(".mztab")),
        evaluate=evaluate,
    )
-    psms = runner.writer.psms
-    utils.log_sequencing_report(
-        psms, start_time=start_time, end_time=time.time()
+    utils.log_annotate_report(
+        runner.writer.psms, start_time=start_time, end_time=time.time()
+    )
+
+
+@main.command(cls=_SharedParams)
+@click.argument(
+    "peak_path",
+    required=True,
+    nargs=-1,
+    type=click.Path(exists=True, dir_okay=False),
+)
+@click.argument(
+    "fasta_path",
+    required=True,
+    nargs=1,
+    type=click.Path(exists=True, dir_okay=False),
+)
+def db_search(
+    peak_path: Tuple[str],
+    fasta_path: str,
+    model: Optional[str],
+    config: Optional[str],
+    output_dir: Optional[str],
+    output_root: Optional[str],
+    verbosity: str,
+    force_overwrite: bool,
+) -> None:
+    """Perform a database search on MS/MS data using Casanovo-DB.
+
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files.
+    FASTA_PATH must be one FASTA file.
+    """
+    output_path, output_root_name = _setup_output(
+        output_dir, output_root, force_overwrite, verbosity
+    )
+    utils.check_dir_file_exists(output_path, f"{output_root}.mztab")
+    config, model = setup_model(
+        model, config, output_path, output_root_name, False
+    )
+    start_time = time.time()
+    with ModelRunner(
+        config,
+        model,
+        output_path,
+        output_root_name if output_root is not None else None,
+        False,
+    ) as runner:
+        logger.info("Performing database search on:")
+        for peak_file in peak_path:
+            logger.info("  %s", peak_file)
+
+        logger.info("Using the following FASTA file:")
+        logger.info("  %s", fasta_path)
+
+        runner.db_search(
+            peak_path,
+            fasta_path,
+            str((output_path / output_root_name).with_suffix(".mztab")),
+        )
+    utils.log_annotate_report(
+        runner.writer.psms, start_time=start_time, end_time=time.time()
    )
@@ -231,8 +291,9 @@ def train(
) -> None:
    """Train a Casanovo model on your own data.

-    TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those
-    provided by MassIVE-KB, from which to train a new Casnovo model.
+    TRAIN_PEAK_PATH must be one or more annotated MGF files, such as
+    those provided by MassIVE-KB, from which to train a new Casanovo
+    model.
    """
    output_path, output_root_name = _setup_output(
        output_dir, output_root, force_overwrite, verbosity
@@ -265,7 +326,7 @@ def train(

@main.command()
def version() -> None:
-    """Get the Casanovo version information"""
+    """Get the Casanovo version information."""
    versions = [
        f"Casanovo: {__version__}",
        f"Depthcharge: {depthcharge.__version__}",
@@ -283,20 +344,20 @@ def version() -> None:
    default="casanovo.yaml",
    type=click.Path(dir_okay=False),
)
-def configure(output: str) -> None:
+def configure(output: Path) -> None:
    """Generate a Casanovo configuration file to customize.

    The casanovo configuration file is in the YAML format.
    """
-    Config.copy_default(output)
-    output = setup_logging(output, "info")
+    Config.copy_default(str(output))
+    setup_logging(output, "info")
    logger.info(f"Wrote {output}\n")


def setup_logging(
    log_file_path: Path,
    verbosity: str,
-) -> Path:
+) -> None:
    """Set up the logger.

    Logging occurs to the command-line and to the given log file.
@@ -364,10 +425,11 @@ def setup_model(
    Parameters
    ----------
    model : str | None
-        May be a file system path, a URL pointing to a .ckpt file, or None.
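
The new command is invoked as, e.g., `casanovo db_search spectra.mzML proteins.fasta -o run1`. The sketch below mirrors the body of `db_search` from Python rather than the CLI; the `ModelRunner` import path and all file names are assumptions for illustration, not taken from this diff:

```python
from casanovo.casanovo import _setup_output, setup_model
from casanovo.denovo import ModelRunner  # import path assumed

# Resolve the output directory/root and the model weights, then run the
# database search exactly as the `db_search` command does above.
output_path, root = _setup_output("results", "run1", True, "info")
config, model = setup_model(None, None, output_path, root, False)
with ModelRunner(config, model, output_path, root, False) as runner:
    runner.db_search(
        ("spectra.mzML",),  # placeholder peak file(s)
        "proteins.fasta",   # placeholder FASTA database
        str((output_path / root).with_suffix(".mztab")),
    )
```
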
- If `model` is a URL the weights will be downloaded and cached from - `model`. If `model` is `None` the weights from the latest matching - official release will be used (downloaded and cached). + May be a file system path, a URL pointing to a .ckpt file, or + None. If `model` is a URL the weights will be downloaded and + cached from `model`. If `model` is `None` the weights from the + latest matching official release will be used (downloaded and + cached). config : str | None Config file path. If None the default config will be used. output_dir: : Path | str @@ -375,20 +437,21 @@ def setup_model( output_root_name : str, The base name for the output files. is_train : bool - Are we training? If not, we need to retrieve weights when the model is - None. + Are we training? If not, we need to retrieve weights when the + model is None. Return ------ Tuple[Config, Path] - Initialized Casanovo config, local path to model weights if any (may be - `None` if training using random starting weights). + Initialized Casanovo config, local path to model weights if any + (may be `None` if training using random starting weights). """ # Read parameters from the config file. config = Config(config) seed_everything(seed=config["random_seed"], workers=True) - # Download model weights if these were not specified (except when training). + # Download model weights if these were not specified (except when + # training). cache_dir = Path(appdirs.user_cache_dir("casanovo", False, opinion=False)) if model is None: if not is_train: @@ -396,16 +459,16 @@ def setup_model( model = _get_model_weights(cache_dir) except github.RateLimitExceededException: logger.error( - "GitHub API rate limit exceeded while trying to download the " - "model weights. Please download compatible model weights " - "manually from the official Casanovo code website " - "(https://github.com/Noble-Lab/casanovo) and specify these " - "explicitly using the `--model` parameter when running " - "Casanovo." + "GitHub API rate limit exceeded while trying to download " + "the model weights. Please download compatible model " + "weights manually from the official Casanovo code website " + "(https://github.com/Noble-Lab/casanovo) and specify " + "these explicitly using the `--model` parameter when " + "running Casanovo." ) raise PermissionError( - "GitHub API rate limit exceeded while trying to download the " - "model weights" + "GitHub API rate limit exceeded while trying to download " + "the model weights" ) from None else: if _is_valid_url(model): @@ -430,29 +493,30 @@ def setup_model( return config, model -def _get_model_weights(cache_dir: Path) -> str: +def _get_model_weights(cache_dir: Path) -> Path: """ Use cached model weights or download them from GitHub. - If no weights file (extension: .ckpt) is available in the cache directory, - it will be downloaded from a release asset on GitHub. - Model weights are retrieved by matching release version. If no model weights - for an identical release (major, minor, patch), alternative releases with - matching (i) major and minor, or (ii) major versions will be used. - If no matching release can be found, no model weights will be downloaded. + If no weights file (extension: .ckpt) is available in the cache + directory, it will be downloaded from a release asset on GitHub. + Model weights are retrieved by matching release version. If no model + weights for an identical release (major, minor, patch), alternative + releases with matching (i) major and minor, or (ii) major versions + will be used. 
If no matching release can be found, no model weights
+    will be downloaded.

-    Note that the GitHub API is limited to 60 requests from the same IP per
-    hour.
+    Note that the GitHub API is limited to 60 requests from the same IP
+    per hour.

    Parameters
    ----------
    cache_dir : Path
-        model weights cache directory path
+        Model weights cache directory path.

    Returns
    -------
-    str
-        The name of the model weights file.
+    Path
+        The path of the model weights file.
    """
    os.makedirs(cache_dir, exist_ok=True)
    version = utils.split_version(__version__)
@@ -539,11 +603,11 @@ def _setup_output(
    Parameters:
    -----------
    output_dir : str | None
-        The path to the output directory. If `None`, the output directory will
-        be resolved to the current working directory.
+        The path to the output directory. If `None`, the output
+        directory will be resolved to the current working directory.
    output_root : str | None
-        The base name for the output files. If `None` the output root name will
-        be resolved to casanovo_
+        The base name for the output files. If `None` the output root
+        name will be resolved to casanovo_
    overwrite: bool
        Whether to overwrite log file if it already exists in the output
        directory.
@@ -553,8 +617,8 @@
    Returns:
    --------
    Tuple[Path, str]
-        A tuple containing the resolved output directory and root name for
-        output files.
+        A tuple containing the resolved output directory and root name
+        for output files.
    """
    if output_root is None:
        output_root = (
@@ -568,7 +632,8 @@
    if not output_path.is_dir():
        output_path.mkdir(parents=True)
        logger.warning(
-            "Target output directory %s does not exists, so it will be created.",
+            "Target output directory %s does not exist, so it will be "
+            "created.",
            output_path,
        )
@@ -588,8 +653,8 @@ def _get_weights_from_url(
    Resolve weight file from URL

    Attempt to download weight file from URL if weights are not already
-    cached - otherwise use cached weights. Downloaded weight files will be
-    cached.
+    cached - otherwise use cached weights. Downloaded weight files will
+    be cached.

    Parameters
    ----------
@@ -598,8 +663,8 @@
    cache_dir : Path
        Model weights cache directory path.
    force_download : Optional[bool], default=False
-        If True, forces a new download of the weight file even if it exists in
-        the cache.
+        If True, forces a new download of the weight file even if it
+        exists in the cache.

    Returns
    -------
@@ -629,7 +694,8 @@
            ).timestamp()
        else:
            logger.warning(
-                "Attempted HEAD request to %s yielded non-ok status code - using cached file",
+                "Attempted HEAD request to %s yielded non-ok status code - "
+                "using cached file",
                file_url,
            )
    except (
@@ -638,7 +704,8 @@
        requests.TooManyRedirects,
    ):
        logger.warning(
-            "Failed to reach %s to get remote last modified time - using cached file",
+            "Failed to reach %s to get remote last modified time - using "
+            "cached file",
            file_url,
        )
@@ -656,8 +723,9 @@ def _download_weights(file_url: str, download_path: Path) -> None:
    """
    Download weights file from URL

-    Download the model weights file from the specified URL and save it to the
-    given path. Ensures the download directory exists, and uses a progress
+    Download the model weights file from the specified URL and save it
+    to the given path. Ensures the download directory exists, and uses a
+    progress bar to indicate download status.
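
The caching behavior in `_get_weights_from_url` boils down to one question: is the remote file newer than the cached copy, and can we even reach the host? A simplified, hedged sketch of that freshness check, using a hypothetical helper name (the real logic lives in `casanovo/casanovo.py` and also honors `force_download`):

```python
from email.utils import parsedate_to_datetime
from pathlib import Path

import requests


def cached_file_is_stale(cached_file: Path, file_url: str) -> bool:
    """Return True only if the remote file is verifiably newer."""
    if not cached_file.is_file():
        return True
    try:
        resp = requests.head(file_url, allow_redirects=True, timeout=10)
        if not resp.ok:
            # Non-ok status code - fall back to the cached file.
            return False
        remote_modified = resp.headers.get("Last-Modified")
        if remote_modified is None:
            return False
        remote_ts = parsedate_to_datetime(remote_modified).timestamp()
        return remote_ts > cached_file.stat().st_mtime
    except (
        requests.ConnectionError,
        requests.Timeout,
        requests.TooManyRedirects,
    ):
        # Unreachable host - fall back to the cached file.
        return False
```
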
Parameters diff --git a/casanovo/config.py b/casanovo/config.py index 8bef2233..e276e12d 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -13,11 +13,12 @@ logger = logging.getLogger("casanovo") -# FIXME: This contains deprecated config options to be removed in the next major -# version update. +# FIXME: This contains deprecated config options to be removed in the next +# major version update. _config_deprecated = dict( every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", + max_length="max_peptide_len", save_top_k=None, model_save_folder_path=None, ) @@ -26,8 +27,8 @@ class Config: """The Casanovo configuration options. - If a parameter is missing from a user's configuration file, the default - value is assumed. + If a parameter is missing from a user's configuration file, the + default value is assumed. Parameters ---------- @@ -61,7 +62,7 @@ class Config: n_layers=int, dropout=float, dim_intensity=int, - max_length=int, + max_peptide_len=int, residues=dict, n_log=int, tb_summarywriter=bool, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c532048b..b7179347 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -5,20 +5,22 @@ ### # The following parameters can be modified when running inference or when -# fine-tuning an existing Casanovo model. +# fine-tuning an existing Casanovo model. They also affect database search +# parameters when running Casanovo in DB-search mode. ### # Max absolute difference allowed with respect to observed precursor m/z. -# Predictions outside the tolerance range are assigned a negative peptide score. +# denovo: Predictions outside the tolerance range are assigned a negative peptide score. +# db-search: Select candidate peptides within the specified precursor m/z tolerance. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted peptides. +# The minimum length of considered peptides. min_peptide_len: 6 +# The maximum length of considered peptides. +max_peptide_len: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 -# Number of beams used in beam search. -n_beams: 1 # Number of PSMs for each spectrum. top_match: 1 # The hardware accelerator to use. Must be one of: @@ -29,6 +31,42 @@ accelerator: "auto" # number will be automatically selected for based on the chosen accelerator. devices: + +### +# The following parameters are unique to Casanovo's de novo sequencing mode. +### + +# Number of beams used in beam search. +n_beams: 1 + + +### +# The following parameters are unique to Casanovo's database search mode. +### + +# Enzyme for in silico digestion, used to generate candidate peptides. +# See pyteomics.parser.expasy_rules for valid enzymes. +# Can also take a regex to specify custom digestion rules. +enzyme: "trypsin" +# Digestion type for candidate peptide generation. +# full: standard digestion. +# semi: Include products of semi-specific cleavage. +# non-specific: Include products of non-specific cleavage. +digestion: "full" +# Number of allowed missed cleavages when digesting protein. +missed_cleavages: 0 +# Maximum number of variable amino acid modifications per peptide, +# None generates all possible isoforms as candidates. +max_mods: 1 +# Select which modifications from the vocabulary can be used in candidate creation. 
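
The `_config_deprecated` mapping in `config.py` above renames old options to their replacements, with `None` marking options that were dropped outright (`max_length` now maps to `max_peptide_len`). One plausible way such a remap can be applied to a loaded YAML dictionary is sketched below; the exact logic inside `Config` is not shown in this diff:

```python
import warnings

_config_deprecated = {
    "every_n_train_steps": "val_check_interval",
    "max_iters": "cosine_schedule_period_iters",
    "max_length": "max_peptide_len",
    "save_top_k": None,
    "model_save_folder_path": None,
}


def remap_deprecated(user_config: dict) -> dict:
    """Rename deprecated options; drop the ones with no replacement."""
    remapped = {}
    for key, value in user_config.items():
        if key in _config_deprecated:
            new_key = _config_deprecated[key]
            if new_key is None:
                warnings.warn(f"Option '{key}' is obsolete; ignoring it.")
                continue
            warnings.warn(f"Option '{key}' is deprecated; use '{new_key}'.")
            key = new_key
        remapped[key] = value
    return remapped
```
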
+# Format: Comma-separated list of "aa:mod_residue", +# where aa is a standard amino acid (or "nterm" for an N-terminal mod) +# and mod_residue is a key from the "residues" dictionary. +# Example: "M:M+15.995,nterm:+43.006" +allowed_fixed_mods: "C:C+57.021" +allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + + ### # The following parameters should only be modified if you are training a new # Casanovo model from scratch. @@ -77,8 +115,6 @@ dropout: 0.0 # Number of dimensions to use for encoding peak intensity. # Projected up to `dim_model` by default and summed with the peak m/z encoding. dim_intensity: -# Max decoded peptide length. -max_length: 100 # The number of iterations for the linear warm-up of the learning rate. warmup_iters: 100_000 # The number of iterations for the cosine half period of the learning rate. diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py new file mode 100644 index 00000000..d3670930 --- /dev/null +++ b/casanovo/data/db_utils.py @@ -0,0 +1,399 @@ +"""Unique methods used within db-search mode""" + +import functools +import logging +import os +import re +import string +from typing import Dict, Iterator, Pattern, Set, Tuple + +import depthcharge.masses +import numpy as np +import pandas as pd +import pyteomics.fasta +import pyteomics.parser + + +logger = logging.getLogger("casanovo") + +# CONSTANTS +PROTON = 1.00727646677 +ISOTOPE_SPACING = 1.003355 + + +class ProteinDatabase: + """ + Store digested FASTA data and return candidate peptides for a given + precursor mass. + + Parameters + ---------- + fasta_path : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. + Either 'full', 'partial', or 'non-specific'. + missed_cleavages : int + The number of missed cleavages to allow. + min_peptide_len : int + The minimum length of peptides to consider. + max_peptide_len : int + The maximum length of peptides to consider. + max_mods : int + The maximum number of modifications to allow per peptide. + precursor_tolerance : float + The precursor mass tolerance in ppm. + isotope_error : Tuple[int, int] + Isotope range [min, max] to consider when comparing predicted + and observed precursor m/z's. + allowed_fixed_mods : str + A comma-separated string of fixed modifications to consider. + allowed_var_mods : str + A comma-separated string of variable modifications to consider. + residues : Dict[str, float] + A dictionary of amino acid masses. 
+ """ + + def __init__( + self, + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_len: int, + max_peptide_len: int, + max_mods: int, + precursor_tolerance: float, + isotope_error: Tuple[int, int], + allowed_fixed_mods: str, + allowed_var_mods: str, + residues: Dict[str, float], + ): + self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( + allowed_fixed_mods, allowed_var_mods + ) + self.max_mods = max_mods + self.swap_regex = re.compile( + "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) + ) + peptide_generator = _peptide_generator( + fasta_path, + enzyme, + digestion, + missed_cleavages, + min_peptide_len, + max_peptide_len, + set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), + ) + logger.info( + "Digesting FASTA file (enzyme = %s, digestion = %s, missed " + "cleavages = %d)...", + enzyme, + digestion, + missed_cleavages, + ) + self.db_peptides = self._digest_fasta(peptide_generator) + self.precursor_tolerance = precursor_tolerance + self.isotope_error = isotope_error + + def _digest_fasta( + self, + peptide_generator: Iterator[Tuple[str, str]], + ) -> pd.DataFrame: + """ + Digests a FASTA file and returns the peptides, their masses, + and associated protein(s). + + Parameters + ---------- + peptide_generator : Iterator[Tuple[str, str]] + An iterator that yields peptides and associated proteins. + + Returns + ------- + peptides : pd.DataFrame + A Pandas DataFrame with index "peptide" (the peptide + sequence), and columns "calc_mass" (the peptide neutral + mass) and "protein" (a list of associated protein(s)). + """ + # Generate all possible peptide isoforms. + peptides = pd.DataFrame( + data=[ + (iso, prot) + for pep, prot in peptide_generator + for iso in pyteomics.parser.isoforms( + pep, + variable_mods=self.var_mods, + fixed_mods=self.fixed_mods, + max_mods=self.max_mods, + ) + ], + columns=["peptide", "protein"], + ) + # Convert modX peptide to Casanovo format. + peptides["peptide"] = peptides["peptide"].apply( + functools.partial( + _convert_from_modx, + swap_map=self.swap_map, + swap_regex=self.swap_regex, + ) + ) + # Merge proteins from duplicate peptides. + peptides = ( + peptides.groupby("peptide")["protein"] + .apply(lambda proteins: sorted(set(proteins))) + .reset_index() + ) + # Calculate the mass of each peptide. + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + peptides["calc_mass"] = ( + peptides["peptide"].apply(mass_calculator.mass).round(5) + ) + # Sort by peptide mass and index by peptide sequence. + peptides.sort_values( + by=["calc_mass", "peptide"], ascending=True, inplace=True + ) + peptides.set_index("peptide", inplace=True) + + logger.info( + "Digestion complete. %s peptides generated.", f"{len(peptides):,d}" + ) + return peptides + + def get_candidates( + self, + precursor_mz: float, + charge: int, + ) -> pd.Series: + """ + Returns candidate peptides that fall within the search + parameter's precursor mass tolerance. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + candidates : pd.Series + A series of candidate peptides. + """ + # FIXME: This could potentially be sped up with only a single pass + # through the database. 
+ mask = np.zeros(len(self.db_peptides), dtype=bool) + precursor_tol_ppm = self.precursor_tolerance / 1e6 + for e in range(self.isotope_error[0], self.isotope_error[1] + 1): + iso_shift = ISOTOPE_SPACING * e + shift_raw_mass = float( + _to_neutral_mass(precursor_mz, charge) - iso_shift + ) + upper_bound = shift_raw_mass * (1 + precursor_tol_ppm) + lower_bound = shift_raw_mass * (1 - precursor_tol_ppm) + mask |= (self.db_peptides["calc_mass"] >= lower_bound) & ( + self.db_peptides["calc_mass"] <= upper_bound + ) + return self.db_peptides.index[mask] + + def get_associated_protein(self, peptide: str) -> str: + """ + Returns the associated protein(s) for a given peptide. + + Parameters + ---------- + peptide : str + The peptide sequence. + + Returns + ------- + protein : str + The associated protein(s) identifiers, separated by commas. + """ + return ",".join(self.db_peptides.loc[peptide, "protein"]) + + +def _construct_mods_dict( + allowed_fixed_mods: str, allowed_var_mods: str +) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]: + """ + Constructs dictionaries of fixed and variable modifications. + + Parameters + ---------- + allowed_fixed_mods : str + A comma-separated string of fixed modifications to consider. + allowed_var_mods : str + A comma-separated string of variable modifications to consider. + + Returns + ------- + fixed_mods : Dict[str, str] + A dictionary of fixed modifications. + var_mods : Dict[str, str] + A dictionary of variable modifications. + swap_map : Dict[str, str] + A dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. + """ + swap_map, fixed_mods, var_mods = {}, {}, {} + for mod_map, allowed_mods in zip( + [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] + ): + for i, mod in enumerate(allowed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[i] + if aa == "nterm": + mod_map[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" + else: + mod_map[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" + + return fixed_mods, var_mods, swap_map + + +def _peptide_generator( + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_len: int, + max_peptide_len: int, + valid_aa: Set[str], +) -> Iterator[Tuple[str, str]]: + """ + Creates a generator that yields peptides from a FASTA file depending + on the type of digestion specified. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + Can also be a regex. + digestion : str + The type of digestion to perform. + Either 'full', 'partial', or 'non-specific'. + missed_cleavages : int + The number of missed cleavages to allow. + min_peptide_len : int + The minimum length of peptides to consider. + max_peptide_len : int + The maximum length of peptides to consider. + valid_aa : Set[str] + A set of valid amino acids. + + Yields + ------ + peptide : str + A peptide sequence, unmodified. + protein : str + The associated protein. + """ + # Verify the existence of the file. 
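
To make the modX bookkeeping concrete, here is what `_construct_mods_dict` (defined above) produces for a shortened version of the default modification strings; the expected values in the comments are traced directly from the code:

```python
fixed_mods, var_mods, swap_map = _construct_mods_dict(
    "C:C+57.021",
    "M:M+15.995,N:N+0.984,nterm:+42.011",
)
# fixed_mods == {"a": ["C"]}
# var_mods == {"a": ["M"], "b": ["N"], "c-": True}
# swap_map == {"aC": "C+57.021", "aM": "M+15.995",
#              "bN": "N+0.984", "c-": "+42.011"}
# A modX isoform such as "aMK" is later rewritten to "M+15.995K" by
# `_convert_from_modx` using `swap_map` and `swap_regex`.
```
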
+ if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + if digestion not in ("full", "partial", "non-specific"): + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + if enzyme not in pyteomics.parser.expasy_rules: + logger.info( + "Enzyme %s not recognized. Interpreting as cleavage rule.", + enzyme, + ) + n_skipped = 0 + if digestion == "non-specific": + for header, seq in pyteomics.fasta.read(fasta_filename): + protein = header.split()[0] + # Generate all possible peptides. + for i in range(len(seq)): + for j in range( + i + min_peptide_len, + min(i + max_peptide_len + 1, len(seq) + 1), + ): + peptide = seq[i:j] + if any(aa not in valid_aa for aa in peptide): + n_skipped += 1 + logger.debug( + "Skipping peptide with unknown amino acids: %s", + peptide, + ) + else: + yield peptide, protein + else: + for header, seq in pyteomics.fasta.read(fasta_filename): + peptides = pyteomics.parser.cleave( + seq, + rule=enzyme, + missed_cleavages=missed_cleavages, + semi=digestion == "partial", + ) + protein = header.split()[0] + for peptide in peptides: + if min_peptide_len <= len(peptide) <= max_peptide_len: + if any(aa not in valid_aa for aa in peptide): + n_skipped += 1 + logger.debug( + "Skipping peptide with unknown amino acids: %s", + peptide, + ) + else: + yield peptide, protein + if n_skipped > 0: + logger.warning( + "Skipped %d peptides with unknown amino acids", n_skipped + ) + + +def _convert_from_modx( + seq: str, swap_map: dict[str, str], swap_regex: Pattern +) -> str: + """ + Converts peptide sequence from modX format to + Casanovo-acceptable modifications. + + Parameters: + ----------- + seq : str + Peptide in modX format + swap_map : dict[str, str] + Dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. + swap_regex : Pattern + Regular expression to match modX format. + + Returns: + -------- + str + Peptide in Casanovo-acceptable modifications. + """ + # FIXME: This might be handled by the DepthCharge residues vocabulary + # instead. + return swap_regex.sub(lambda x: swap_map[x.group()], seq) + + +def _to_neutral_mass(mz_mass: float, charge: int) -> float: + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 50932399..bb9a8a3e 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -191,7 +191,7 @@ def save(self) -> None: "PSM", psm.sequence, # sequence i, # PSM_ID - "null", # accession + psm.protein, # accession "null", # unique "null", # database "null", # database_version diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py index 0dc3c48b..eece07a4 100644 --- a/casanovo/data/psm.py +++ b/casanovo/data/psm.py @@ -1,4 +1,4 @@ -"""Peptide spectrum match dataclass""" +"""Peptide spectrum match dataclass.""" import dataclasses from typing import Tuple, Iterable @@ -15,21 +15,24 @@ class PepSpecMatch: The amino acid sequence of the peptide. spectrum_id : Tuple[str, str] A tuple containing the spectrum identifier in the form - (spectrum file name, spectrum file idx) + (spectrum file name, spectrum file idx). 
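
The digestion above delegates to pyteomics. A small standalone example of the tryptic case, with a made-up toy protein sequence (length filtering happens afterwards in `_peptide_generator`):

```python
import pyteomics.parser

# Trypsin cleaves C-terminal to K/R (but not before P).
seq = "MKWVTFISLLLLFSSAYSRGVFRR"
peptides = pyteomics.parser.cleave(seq, "trypsin", missed_cleavages=1)
# `cleave` returns a set of unique peptides, including fully cleaved
# products such as 'MK', 'WVTFISLLLLFSSAYSR', and 'GVFR', plus
# single-missed-cleavage products such as 'GVFRR'.
```
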
peptide_score : float Score of the match between the full peptide sequence and the spectrum. charge : int - The precursor charge state of the peptide ion observed in the spectrum. + The precursor charge state of the peptide ion observed in the + spectrum. calc_mz : float - The calculated mass-to-charge ratio (m/z) of the peptide based on its - sequence and charge state. + The calculated mass-to-charge ratio (m/z) of the peptide based + on its sequence and charge state. exp_mz : float - The observed (experimental) precursor mass-to-charge ratio (m/z) of the - peptide as detected in the spectrum. + The observed (experimental) precursor mass-to-charge ratio (m/z) + of the peptide as detected in the spectrum. aa_scores : Iterable[float] A list of scores for individual amino acids in the peptide - sequence, where len(aa_scores) == len(sequence) + sequence, where len(aa_scores) == len(sequence). + protein : str + Protein associated with the peptide sequence (for db mode). """ sequence: str @@ -39,3 +42,4 @@ class PepSpecMatch: calc_mz: float exp_mz: float aa_scores: Iterable[float] + protein: str = "null" diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index fe5d6237..cdbf71bf 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,6 +1,7 @@ """Data loaders for the de novo sequencing task.""" import functools +import logging import os from typing import List, Optional, Tuple @@ -9,9 +10,13 @@ import torch from depthcharge.data import AnnotatedSpectrumIndex +from ..data import db_utils from ..data.datasets import AnnotatedSpectrumDataset, SpectrumDataset +logger = logging.getLogger("casanovo") + + class DeNovoDataModule(pl.LightningDataModule): """ Data loader to prepare MS/MS spectra for a Spec2Pep predictor. @@ -29,25 +34,25 @@ class DeNovoDataModule(pl.LightningDataModule): eval_batch_size : int The batch size to use for inference. n_peaks : Optional[int] - The number of top-n most intense peaks to keep in each spectrum. `None` - retains all peaks. + The number of top-n most intense peaks to keep in each spectrum. + `None` retains all peaks. min_mz : float - The minimum m/z to include. The default is 140 m/z, in order to exclude - TMT and iTRAQ reporter ions. + The minimum m/z to include. The default is 140 m/z, in order to + exclude TMT and iTRAQ reporter ions. max_mz : float The maximum m/z to include. min_intensity : float - Remove peaks whose intensity is below `min_intensity` percentage of the - base peak intensity. + Remove peaks whose intensity is below `min_intensity` percentage + of the base peak intensity. remove_precursor_tol : float - Remove peaks within the given mass tolerance in Dalton around the - precursor mass. + Remove peaks within the given mass tolerance in Dalton around + the precursor mass. n_workers : int, optional - The number of workers to use for data loading. By default, the number of - available CPU cores on the current machine is used. + The number of workers to use for data loading. By default, the + number of available CPU cores on the current machine is used. random_state : Optional[int] - The NumPy random state. ``None`` leaves mass spectra in the order they - were parsed. + The NumPy random state. ``None`` leaves mass spectra in the + order they were parsed. 
""" def __init__( @@ -66,12 +71,12 @@ def __init__( random_state: Optional[int] = None, ): super().__init__() - self.train_index = train_index - self.valid_index = valid_index - self.test_index = test_index + self.train_index: Optional[AnnotatedSpectrumIndex] = train_index + self.valid_index: Optional[AnnotatedSpectrumIndex] = valid_index + self.test_index: Optional[AnnotatedSpectrumIndex] = test_index self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size - self.n_peaks = n_peaks + self.n_peaks: Optional[int] = n_peaks self.min_mz = min_mz self.max_mz = max_mz self.min_intensity = min_intensity @@ -81,6 +86,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None + self.protein_database = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -89,11 +95,11 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- stage : str {"fit", "validate", "test"} - The stage indicating which Datasets to prepare. All are prepared by - default. + The stage indicating which Datasets to prepare. All are + prepared by default. annotated: bool - True if peptide sequence annotations are available for the test - data. + True if peptide sequence annotations are available for the + test data. """ if stage in (None, "fit", "validate"): make_dataset = functools.partial( @@ -128,6 +134,7 @@ def _make_loader( dataset: torch.utils.data.Dataset, batch_size: int, shuffle: bool = False, + collate_fn: Optional[callable] = None, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -140,6 +147,8 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. + collate_fn : Optional[callable] + A function to collate the data into a batch. Returns ------- @@ -149,7 +158,7 @@ def _make_loader( return torch.utils.data.DataLoader( dataset, batch_size=batch_size, - collate_fn=prepare_batch, + collate_fn=prepare_batch if collate_fn is None else collate_fn, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, @@ -173,6 +182,16 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) + def db_dataloader(self) -> torch.utils.data.DataLoader: + """Get a special dataloader for DB search.""" + return self._make_loader( + self.test_dataset, + self.eval_batch_size, + collate_fn=functools.partial( + prepare_psm_batch, protein_database=self.protein_database + ), + ) + def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -180,21 +199,23 @@ def prepare_batch( """ Collate MS/MS spectra into a batch. - The MS/MS spectra will be padded so that they fit nicely as a tensor. - However, the padded elements are ignored during the subsequent steps. + The MS/MS spectra will be padded so that they fit nicely as a + tensor. However, the padded elements are ignored during the + subsequent steps. Parameters ---------- batch : List[Tuple[torch.Tensor, float, int, str]] - A batch of data from an AnnotatedSpectrumDataset, consisting of for each - spectrum (i) a tensor with the m/z and intensity peak values, (ii), the - precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. + A batch of data from an AnnotatedSpectrumDataset, consisting of + for each spectrum (i) a tensor with the m/z and intensity peak + values, (ii), the precursor m/z, (iii) the precursor charge, + (iv) the spectrum identifier. 
Returns ------- spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak values - for each spectrum. + The padded mass spectra tensor with the m/z and intensity peak + values for each spectrum. precursors : torch.Tensor of shape (batch_size, 3) A tensor with the precursor neutral mass, precursor charge, and precursor m/z. @@ -211,3 +232,75 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) + + +def prepare_psm_batch( + batch: List[Tuple[torch.Tensor, float, int, str]], + protein_database: db_utils.ProteinDatabase, +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray]: + """ + Collate MS/MS spectra into a batch for DB search. + + The MS/MS spectra will be padded so that they fit nicely as a + tensor. However, the padded elements are ignored during the + subsequent steps. + + Parameters + ---------- + batch : List[Tuple[torch.Tensor, float, int, str]] + A batch of data from an AnnotatedSpectrumDataset, consisting of + for each spectrum (i) a tensor with the m/z and intensity peak + values, (ii), the precursor m/z, (iii) the precursor charge, + (iv) the spectrum identifier. + protein_database : db_utils.ProteinDatabase + The protein database to use for candidate peptide retrieval. + + Returns + ------- + batch_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak + values for each spectrum. + batch_precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. + batch_spectrum_ids : np.ndarray + The spectrum identifiers. + batch_peptides : np.ndarray + The candidate peptides for each spectrum. + """ + spectra, precursors, spectrum_ids = prepare_batch(batch) + + batch_spectra = [] + batch_precursors = [] + batch_spectrum_ids = [] + batch_peptides = [] + # FIXME: This can be optimized by using a sliding window instead of + # retrieving candidates for each spectrum independently. + for i in range(len(batch)): + candidate_pep = protein_database.get_candidates( + precursors[i][2], precursors[i][1] + ) + if len(candidate_pep) == 0: + logger.debug( + "No candidate peptides found for spectrum %s with precursor " + "charge %d and precursor m/z %f", + spectrum_ids[i], + precursors[i][1], + precursors[i][2], + ) + else: + batch_spectra.append( + spectra[i].unsqueeze(0).repeat(len(candidate_pep), 1, 1) + ) + batch_precursors.append( + precursors[i].unsqueeze(0).repeat(len(candidate_pep), 1) + ) + batch_spectrum_ids.extend([spectrum_ids[i]] * len(candidate_pep)) + batch_peptides.extend(candidate_pep) + + return ( + torch.cat(batch_spectra, dim=0), + torch.cat(batch_precursors, dim=0), + np.asarray(batch_spectrum_ids), + np.asarray(batch_peptides), + ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 8a2b421f..f350f3b3 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -2,8 +2,10 @@ import collections import heapq +import itertools import logging import warnings +from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import depthcharge.masses @@ -16,7 +18,7 @@ from . import evaluate from .. 
import config
-from ..data import ms_io, psm
+from ..data import ms_io

logger = logging.getLogger("casanovo")
@@ -32,37 +34,39 @@ class Spec2Pep(pl.LightningModule, ModelMixin):
    dim_model : int
        The latent dimensionality used by the transformer model.
    n_head : int
-        The number of attention heads in each layer. ``dim_model`` must be
-        divisible by ``n_head``.
+        The number of attention heads in each layer. ``dim_model`` must
+        be divisible by ``n_head``.
    dim_feedforward : int
-        The dimensionality of the fully connected layers in the transformer
-        model.
+        The dimensionality of the fully connected layers in the
+        transformer model.
    n_layers : int
        The number of transformer layers.
    dropout : float
        The dropout probability for all layers.
    dim_intensity : Optional[int]
-        The number of features to use for encoding peak intensity. The remaining
-        (``dim_model - dim_intensity``) are reserved for encoding the m/z value.
-        If ``None``, the intensity will be projected up to ``dim_model`` using a
-        linear layer, then summed with the m/z encoding for each peak.
-    max_length : int
+        The number of features to use for encoding peak intensity. The
+        remaining (``dim_model - dim_intensity``) are reserved for
+        encoding the m/z value. If ``None``, the intensity will be
+        projected up to ``dim_model`` using a linear layer, then summed
+        with the m/z encoding for each peak.
+    max_peptide_len : int
        The maximum peptide length to decode.
    residues : Union[Dict[str, float], str]
-        The amino acid dictionary and their masses. By default ("canonical) this
-        is only the 20 canonical amino acids, with cysteine carbamidomethylated.
-        If "massivekb", this dictionary will include the modifications found in
-        MassIVE-KB. Additionally, a dictionary can be used to specify a custom
+        The amino acid dictionary and their masses. By default
+        ("canonical") this is only the 20 canonical amino acids, with
+        cysteine carbamidomethylated. If "massivekb", this dictionary
+        will include the modifications found in MassIVE-KB.
+        Additionally, a dictionary can be used to specify a custom
        collection of amino acids and masses.
    max_charge : int
        The maximum precursor charge to consider.
    precursor_mass_tol : float, optional
-        The maximum allowable precursor mass tolerance (in ppm) for correct
-        predictions.
+        The maximum allowable precursor mass tolerance (in ppm) for
+        correct predictions.
    isotope_error_range : Tuple[int, int]
-        Take into account the error introduced by choosing a non-monoisotopic
-        peak for fragmentation by not penalizing predicted precursor m/z's that
-        fit the specified isotope error:
+        Take into account the error introduced by choosing a
+        non-monoisotopic peak for fragmentation by not penalizing
+        predicted precursor m/z's that fit the specified isotope error:
        `abs(calc_mz - (precursor_mz - isotope * 1.00335 / precursor_charge))
        < precursor_mass_tol`
    min_peptide_len : int
@@ -73,16 +77,18 @@ class Spec2Pep(pl.LightningModule, ModelMixin):
        Number of PSMs to return for each spectrum.
    n_log : int
        The number of epochs to wait between logging messages.
-    tb_summarywriter : Optional[str]
-        Folder path to record performance metrics during training. If ``None``,
-        don't use a ``SummaryWriter``.
+    tb_summarywriter : Optional[Path]
+        Folder path to record performance metrics during training. If
+        ``None``, don't use a ``SummaryWriter``.
    train_label_smoothing : float
        Smoothing factor when calculating the training loss.
    warmup_iters : int
-        The number of iterations for the linear warm-up of the learning rate.
+ The number of iterations for the linear warm-up of the learning + rate. cosine_schedule_period_iters : int - The number of iterations for the cosine half period of the learning rate. - out_writer : Optional[str] + The number of iterations for the cosine half period of the + learning rate. + out_writer : Optional[ms_io.MztabWriter] The output writer for the prediction results. calculate_precision : bool Calculate the validation set precision during training. @@ -99,7 +105,7 @@ def __init__( n_layers: int = 9, dropout: float = 0.0, dim_intensity: Optional[int] = None, - max_length: int = 100, + max_peptide_len: int = 100, residues: Union[Dict[str, float], str] = "canonical", max_charge: int = 5, precursor_mass_tol: float = 50, @@ -108,9 +114,7 @@ def __init__( n_beams: int = 1, top_match: int = 1, n_log: int = 10, - tb_summarywriter: Optional[ - torch.utils.tensorboard.SummaryWriter - ] = None, + tb_summarywriter: Optional[Path] = None, train_label_smoothing: float = 0.01, warmup_iters: int = 100_000, cosine_schedule_period_iters: int = 600_000, @@ -147,8 +151,9 @@ def __init__( # Optimizer settings. self.warmup_iters = warmup_iters self.cosine_schedule_period_iters = cosine_schedule_period_iters - # `kwargs` will contain additional arguments as well as unrecognized - # arguments, including deprecated ones. Remove the deprecated ones. + # `kwargs` will contain additional arguments as well as + # unrecognized arguments, including deprecated ones. Remove the + # deprecated ones. for k in config._config_deprecated: kwargs.pop(k, None) warnings.warn( @@ -158,7 +163,7 @@ def __init__( self.opt_kwargs = kwargs # Data properties. - self.max_length = max_length + self.max_peptide_len = max_peptide_len self.residues = residues self.precursor_mass_tol = precursor_mass_tol self.isotope_error_range = isotope_error_range @@ -175,12 +180,12 @@ def __init__( self.n_log = n_log self._history = [] if tb_summarywriter is not None: - self.tb_summarywriter = SummaryWriter(tb_summarywriter) + self.tb_summarywriter = SummaryWriter(str(tb_summarywriter)) else: - self.tb_summarywriter = tb_summarywriter + self.tb_summarywriter = None # Output writer during predicting. - self.out_writer = out_writer + self.out_writer: ms_io.MztabWriter = out_writer def forward( self, spectra: torch.Tensor, precursors: torch.Tensor @@ -192,20 +197,22 @@ def forward( ---------- spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in - the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying - the m/z-intensity pair for each peak. These should be zero-padded, - such that all the spectra in the batch are the same length. + Axis 0 represents an MS/MS spectrum, axis 1 contains the + peaks in the MS/MS spectrum, and axis 2 is essentially a + 2-tuple specifying the m/z-intensity pair for each peak. + These should be zero-padded, such that all the spectra in + the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge (axis 1), and - precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge + (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. Returns ------- pred_peptides : List[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide predictions. 
A
-        peptide predictions consists of a tuple with the peptide score,
-        the amino acid scores, and the predicted peptide sequence.
+        For each spectrum, a list with the top peptide predictions.
+        A peptide prediction consists of a tuple with the peptide
+        score, the amino acid scores, and the predicted peptide
+        sequence.
        """
        return self.beam_search_decode(
            spectra.to(self.encoder.device),
@@ -222,26 +229,28 @@
        ----------
        spectra : torch.Tensor of shape (n_spectra, n_peaks, 2)
            The spectra for which to predict peptide sequences.
-            Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in
-            the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying
-            the m/z-intensity pair for each peak. These should be zero-padded,
-            such that all the spectra in the batch are the same length.
+            Axis 0 represents an MS/MS spectrum, axis 1 contains the
+            peaks in the MS/MS spectrum, and axis 2 is essentially a
+            2-tuple specifying the m/z-intensity pair for each peak.
+            These should be zero-padded, such that all the spectra in
+            the batch are the same length.
        precursors : torch.Tensor of size (n_spectra, 3)
-            The measured precursor mass (axis 0), precursor charge (axis 1), and
-            precursor m/z (axis 2) of each MS/MS spectrum.
+            The measured precursor mass (axis 0), precursor charge
+            (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum.

        Returns
        -------
        pred_peptides : List[List[Tuple[float, np.ndarray, str]]]
-            For each spectrum, a list with the top peptide prediction(s). A
-            peptide predictions consists of a tuple with the peptide score,
-            the amino acid scores, and the predicted peptide sequence.
+            For each spectrum, a list with the top peptide
+            prediction(s). A peptide prediction consists of a tuple
+            with the peptide score, the amino acid scores, and the
+            predicted peptide sequence.
        """
        memories, mem_masks = self.encoder(spectra)

        # Sizes.
        batch = spectra.shape[0]  # B
-        length = self.max_length + 1  # L
+        length = self.max_peptide_len + 1  # L
        vocab = self.decoder.vocab_size + 1  # V
        beam = self.n_beams  # S
@@ -269,16 +278,17 @@
        scores = einops.rearrange(scores, "B L V S -> (B S) L V")

        # The main decoding loop.
-        for step in range(0, self.max_length):
-            # Terminate beams exceeding the precursor m/z tolerance and track
-            # all finished beams (either terminated or stop token predicted).
+        for step in range(0, self.max_peptide_len):
+            # Terminate beams exceeding the precursor m/z tolerance and
+            # track all finished beams (either terminated or stop token
+            # predicted).
            (
                finished_beams,
                beam_fits_precursor,
                discarded_beams,
            ) = self._finish_beams(tokens, precursors, step)
-            # Cache peptide predictions from the finished beams (but not the
-            # discarded beams).
+            # Cache peptide predictions from the finished beams (but not
+            # the discarded beams).
            self._cache_finished_beams(
                tokens,
                scores,
@@ -289,7 +299,8 @@
            )

            # Stop decoding when all current beams have been finished.
-            # Continue with beams that have not been finished and not discarded.
+            # Continue with beams that have not been finished and not
+            # discarded.
            finished_beams |= discarded_beams
            if finished_beams.all():
                break
@@ -300,14 +311,14 @@
                memories[~finished_beams, :, :],
                mem_masks[~finished_beams, :],
            )
-            # Find the top-k beams with the highest scores and continue decoding
-            # those.
+            # Find the top-k beams with the highest scores and continue
+            # decoding those.
tokens, scores = self._get_topk_beams( tokens, scores, finished_beams, batch, step + 1 ) - # Return the peptide with the highest confidence score, within the - # precursor m/z tolerance if possible. + # Return the peptide with the highest confidence score, within + # the precursor m/z tolerance if possible. return list(self._get_top_peptide(pred_cache)) def _finish_beams( @@ -317,33 +328,33 @@ def _finish_beams( step: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Track all beams that have been finished, either by predicting the stop - token or because they were terminated due to exceeding the precursor - m/z tolerance. + Track all beams that have been finished, either by predicting + the stop token or because they were terminated due to exceeding + the precursor m/z tolerance. Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + (n_spectra * n_beams, max_peptide_len, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and + all spectra. step : int Index of the current decoding step. Returns ------- finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams have been - finished. + Boolean tensor indicating whether the current beams have + been finished. beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating if current beams are within precursor m/z - tolerance. + Boolean tensor indicating if current beams are within + precursor m/z tolerance. discarded_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams should be - discarded (e.g. because they were predicted to end but violate the - minimum peptide length). + Boolean tensor indicating whether the current beams should + be discarded (e.g. because they were predicted to end but + violate the minimum peptide length). """ # Check for tokens with a negative mass (i.e. neutral loss). aa_neg_mass = [None] @@ -362,7 +373,8 @@ def _finish_beams( beam_fits_precursor = torch.zeros( tokens.shape[0], dtype=torch.bool ).to(self.encoder.device) - # Beams with a stop token predicted in the current step can be finished. + # Beams with a stop token predicted in the current step can be + # finished. finished_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to( self.encoder.device ) @@ -374,8 +386,9 @@ def _finish_beams( self.encoder.device ) discarded_beams[tokens[:, step] == 0] = True - # Discard beams with invalid modification combinations (i.e. N-terminal - # modifications occur multiple times or in internal positions). + # Discard beams with invalid modification combinations (i.e. + # N-terminal modifications occur multiple times or in internal + # positions). if step > 1: # Only relevant for longer predictions. dim0 = torch.arange(tokens.shape[0]) final_pos = torch.full((ends_stop_token.shape[0],), step) @@ -392,8 +405,8 @@ def _finish_beams( ).any(dim=1) discarded_beams[multiple_mods | internal_mods] = True - # Check which beams should be terminated or discarded based on the - # predicted peptide. + # Check which beams should be terminated or discarded based on + # the predicted peptide. 
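
The per-beam loop that follows scores each candidate peptide against the observed precursor; the core arithmetic is the ppm deviation over the allowed isotope shifts, as described in the `isotope_error_range` docstring earlier in this file. A standalone worked version of that check, written with the conventional ppm normalization (a simplified sketch, not Casanovo's exact code path):

```python
from typing import Tuple


def fits_precursor(
    calc_mass: float,  # neutral mass of the predicted peptide
    precursor_mz: float,
    charge: int,
    tol_ppm: float = 50.0,
    isotope_error: Tuple[int, int] = (0, 1),
) -> bool:
    """Check the peptide mass against the precursor m/z tolerance."""
    proton = 1.00727646677
    calc_mz = calc_mass / charge + proton
    for isotope in range(isotope_error[0], isotope_error[1] + 1):
        # Allow the precursor to have been picked one isotope peak off.
        shifted_mz = precursor_mz - isotope * 1.00335 / charge
        delta_ppm = (calc_mz - shifted_mz) / precursor_mz * 1e6
        if abs(delta_ppm) < tol_ppm:
            return True
    return False
```
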
for i in range(len(finished_beams)): # Skip already discarded beams. if discarded_beams[i]: @@ -408,15 +421,15 @@ def _finish_beams( elif not self.decoder.reverse and peptide[-1] == "$": peptide = peptide[:-1] peptide_len -= 1 - # Discard beams that were predicted to end but don't fit the minimum - # peptide length. + # Discard beams that were predicted to end but don't fit the + # minimum peptide length. if finished_beams[i] and peptide_len < self.min_peptide_len: discarded_beams[i] = True continue - # Terminate the beam if it has not been finished by the model but - # the peptide mass exceeds the precursor m/z to an extent that it - # cannot be corrected anymore by a subsequently predicted AA with - # negative mass. + # Terminate the beam if it has not been finished by the + # model but the peptide mass exceeds the precursor m/z to an + # extent that it cannot be corrected anymore by a + # subsequently predicted AA with negative mass. precursor_charge = precursors[i, 1] precursor_mz = precursors[i, 2] matches_precursor_mz = exceeds_precursor_mz = False @@ -442,16 +455,18 @@ def _finish_beams( self.isotope_error_range[1] + 1, ) ] - # Terminate the beam if the calculated m/z for the predicted - # peptide (without potential additional AAs with negative - # mass) is within the precursor m/z tolerance. + # Terminate the beam if the calculated m/z for the + # predicted peptide (without potential additional + # AAs with negative mass) is within the precursor + # m/z tolerance. matches_precursor_mz = aa is None and any( abs(d) < self.precursor_mass_tol for d in delta_mass_ppm ) - # Terminate the beam if the calculated m/z exceeds the - # precursor m/z + tolerance and hasn't been corrected by a - # subsequently predicted AA with negative mass. + # Terminate the beam if the calculated m/z exceeds + # the precursor m/z + tolerance and hasn't been + # corrected by a subsequently predicted AA with + # negative mass. if matches_precursor_mz: exceeds_precursor_mz = False else: @@ -466,8 +481,8 @@ def _finish_beams( except KeyError: matches_precursor_mz = exceeds_precursor_mz = False # Finish beams that fit or exceed the precursor m/z. - # Don't finish beams that don't include a stop token if they don't - # exceed the precursor m/z tolerance yet. + # Don't finish beams that don't include a stop token if they + # don't exceed the precursor m/z tolerance yet. if finished_beams[i]: beam_fits_precursor[i] = matches_precursor_mz elif exceeds_precursor_mz: @@ -491,17 +506,17 @@ def _cache_finished_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + (n_spectra * n_beams, max_peptide_len, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and + all spectra. step : int Index of the current decoding step. beams_to_cache : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are ready for - caching. + Boolean tensor indicating whether the current beams are + ready for caching. beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams) Boolean tensor indicating whether the beams are within the precursor m/z tolerance. 
@@ -509,9 +524,9 @@ def _cache_finished_beams( int, List[Tuple[float, float, np.ndarray, torch.Tensor]] ] Priority queue with finished beams for each spectrum, ordered by - peptide score. For each finished beam, a tuple with the (negated) - peptide score, a random tie-breaking float, the amino acid-level - scores, and the predicted tokens is stored. + peptide score. For each finished beam, a tuple with the + (negated) peptide score, a random tie-breaking float, the + amino acid-level scores, and the predicted tokens is stored. """ for i in range(len(beams_to_cache)): if not beams_to_cache[i]: @@ -533,8 +548,8 @@ def _cache_finished_beams( continue smx = self.softmax(scores[i : i + 1, : step + 1, :]) aa_scores = smx[0, range(len(pred_tokens)), pred_tokens].tolist() - # Add an explicit score 0 for the missing stop token in case this - # was not predicted (i.e. early stopping). + # Add an explicit score 0 for the missing stop token in case + # this was not predicted (i.e. early stopping). if not has_stop_token: aa_scores.append(0) aa_scores = np.asarray(aa_scores) @@ -544,8 +559,8 @@ def _cache_finished_beams( ) # Omit the stop token from the amino acid-level scores. aa_scores = aa_scores[:-1] - # Add the prediction to the cache (minimum priority queue, maximum - # the number of beams elements). + # Add the prediction to the cache (minimum priority queue, + # maximum the number of beams elements). if len(pred_cache[spec_idx]) < self.n_beams: heapadd = heapq.heappush else: @@ -569,22 +584,22 @@ def _get_topk_beams( step: int, ) -> Tuple[torch.tensor, torch.tensor]: """ - Find the top-k beams with the highest scores and continue decoding - those. + Find the top-k beams with the highest scores and continue + decoding those. Stop decoding for beams that have been finished. Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + (n_spectra * n_beams, max_peptide_len, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and + all spectra. finished_beams : torch.Tensor of shape (n_spectra * n_beams) - Boolean tensor indicating whether the current beams are ready for - caching. + Boolean tensor indicating whether the current beams are + ready for caching. batch: int Number of spectra in the batch. step : int @@ -592,12 +607,12 @@ def _get_topk_beams( Returns ------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) - Scores for the predicted amino acid tokens for all beams and all - spectra. + (n_spectra * n_beams, max_peptide_len, n_amino_acids) + Scores for the predicted amino acid tokens for all beams and + all spectra. """ beam = self.n_beams # S vocab = self.decoder.vocab_size + 1 # V @@ -632,7 +647,7 @@ def _get_topk_beams( ).float() # Mask out the index '0', i.e. padding token, by default. # FIXME: Set this to a very small, yet non-zero value, to only - # get padding after stop token. + # get padding after stop token. active_mask[:, :beam] = 1e-8 # Figure out the top K decodings. 
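The cache that `_cache_finished_beams` fills above is a size-capped priority queue: each spectrum keeps at most `n_beams` entries, and a random float breaks ties between equal scores so the tensor payloads are never compared directly. A stripped-down sketch of that pattern (the scores and peptides are made up, and the real entries also carry amino acid-level scores and token tensors):

```python
import heapq
import random

n_beams = 2
cache = []  # One such list is kept per spectrum.

for peptide, score in [("PEPTIDEA", 0.7), ("PEPTIDEB", 0.9), ("PEPTIDEC", 0.8)]:
    entry = (score, random.random(), peptide)
    if len(cache) < n_beams:
        heapq.heappush(cache, entry)
    else:
        # Push the new entry, then drop the lowest-scoring one, so only
        # the top-scoring n_beams predictions survive.
        heapq.heappushpop(cache, entry)

best = max(cache)  # The highest (score, tie-breaker) pair wins.
```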
@@ -660,24 +675,26 @@ def _get_top_peptide( ], ) -> Iterable[List[Tuple[float, np.ndarray, str]]]: """ - Return the peptide with the highest confidence score for each spectrum. + Return the peptide with the highest confidence score for each + spectrum. Parameters ---------- pred_cache : Dict[ int, List[Tuple[float, float, np.ndarray, torch.Tensor]] ] - Priority queue with finished beams for each spectrum, ordered by - peptide score. For each finished beam, a tuple with the peptide - score, a random tie-breaking float, the amino acid-level scores, - and the predicted tokens is stored. + Priority queue with finished beams for each spectrum, + ordered by peptide score. For each finished beam, a tuple + with the peptide score, a random tie-breaking float, the + amino acid-level scores, and the predicted tokens is stored. Returns ------- pred_peptides : Iterable[List[Tuple[float, np.ndarray, str]]] - For each spectrum, a list with the top peptide prediction(s). A - peptide predictions consists of a tuple with the peptide score, - the amino acid scores, and the predicted peptide sequence. + For each spectrum, a list with the top peptide + prediction(s). A peptide prediction consists of a tuple + with the peptide score, the amino acid scores, and the + predicted peptide sequence. """ for peptides in pred_cache.values(): if len(peptides) > 0: @@ -707,13 +724,14 @@ def _forward_step( ---------- spectra : torch.Tensor of shape (n_spectra, n_peaks, 2) The spectra for which to predict peptide sequences. - Axis 0 represents an MS/MS spectrum, axis 1 contains the peaks in - the MS/MS spectrum, and axis 2 is essentially a 2-tuple specifying - the m/z-intensity pair for each peak. These should be zero-padded, - such that all the spectra in the batch are the same length. + Axis 0 represents an MS/MS spectrum, axis 1 contains the + peaks in the MS/MS spectrum, and axis 2 is essentially a + 2-tuple specifying the m/z-intensity pair for each peak. + These should be zero-padded, such that all the spectra in + the batch are the same length. precursors : torch.Tensor of size (n_spectra, 3) - The measured precursor mass (axis 0), precursor charge (axis 1), and - precursor m/z (axis 2) of each MS/MS spectrum. + The measured precursor mass (axis 0), precursor charge + (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum. sequences : List[str] of length n_spectra The partial peptide sequences to predict. @@ -738,8 +756,8 @@ def training_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - peptide sequences as torch Tensors. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) peptide sequences as torch Tensors. mode : str Logging key to describe the current stage. @@ -772,8 +790,8 @@ def validation_step( Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, List[str]] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - peptide sequences. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) peptide sequences. Returns ------- @@ -785,8 +803,8 @@ def validation_step( if not self.calculate_precision: return loss - # Calculate and log amino acid and peptide match evaluation metrics from - # the predicted peptides. + # Calculate and log amino acid and peptide match evaluation + # metrics from the predicted peptides.
peptides_pred, peptides_true = [], batch[2] for spectrum_preds in self.forward(batch[0], batch[1]): for _, _, pred in spectrum_preds: @@ -794,42 +812,30 @@ def validation_step( aa_precision, _, pep_precision = evaluate.aa_match_metrics( *evaluate.aa_match_batch( - peptides_true, - peptides_pred, - self.decoder._peptide_mass.masses, + peptides_true, peptides_pred, self.decoder._peptide_mass.masses ) ) log_args = dict(on_step=False, on_epoch=True, sync_dist=True) - self.log( - "Peptide precision at coverage=1", - pep_precision, - **log_args, - ) - self.log( - "AA precision at coverage=1", - aa_precision, - **log_args, - ) + self.log("Peptide precision at coverage=1", pep_precision, **log_args) + self.log("AA precision at coverage=1", aa_precision, **log_args) return loss def predict_step( self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args - ) -> List[Tuple[np.ndarray, float, float, str, float, np.ndarray]]: + ) -> List[ms_io.PepSpecMatch]: """ A single prediction step. Parameters ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) spectrum identifiers as torch Tensors. Returns ------- - predictions: List[Tuple[np.ndarray, float, float, str, float, np.ndarray]] - Model predictions for the given batch of spectra containing spectrum - ids, precursor information, peptide sequences as well as peptide - and amino acid-level confidence scores. + predictions: List[ms_io.PepSpecMatch] + Predicted PSMs for the given batch of spectra. """ predictions = [] for ( @@ -845,13 +851,16 @@ def predict_step( ): for peptide_score, aa_scores, peptide in spectrum_preds: predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(precursor_charge), + calc_mz=self.peptide_mass_calculator.mass( + peptide, precursor_charge + ), + exp_mz=precursor_mz, + aa_scores=aa_scores, ) ) @@ -892,38 +901,17 @@ def on_validation_epoch_end(self) -> None: self._log_history() def on_predict_batch_end( - self, - outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], - *args, + self, outputs: List[ms_io.PepSpecMatch], *args ) -> None: """ - Write the predicted peptide sequences and amino acid scores to the - output file. + Write the predicted peptide sequences and amino acid scores to + the output file. """ if self.out_writer is None: return - # Triply nested lists: results -> batch -> step -> spectrum. - for ( - spectrum_i, - charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - ) in outputs: - if len(peptide) == 0: - continue - self.out_writer.psms.append( - psm.PepSpecMatch( - sequence=peptide, - spectrum_id=tuple(spectrum_i), - peptide_score=peptide_score, - charge=int(charge), - calc_mz=precursor_mz, - exp_mz=self.peptide_mass_calculator.mass(peptide, charge), - aa_scores=aa_scores, - ) - ) + for pred in outputs: + if len(pred.sequence) > 0: + self.out_writer.psms.append(pred) def _log_history(self) -> None: """ @@ -970,16 +958,18 @@ def _log_history(self) -> None: def configure_optimizers( self, - ) -> Tuple[torch.optim.Optimizer, Dict[str, Any]]: + ) -> Tuple[List[torch.optim.Optimizer], Dict[str, Any]]: """ Initialize the optimizer. - This is used by pytorch-lightning when preparing the model for training. 
+ This is used by pytorch-lightning when preparing the model for + training. Returns ------- - Tuple[torch.optim.Optimizer, Dict[str, Any]] - The initialized Adam optimizer and its learning rate scheduler. + Tuple[List[torch.optim.Optimizer], Dict[str, Any]] + The initialized Adam optimizer and its learning rate + scheduler. """ optimizer = torch.optim.Adam(self.parameters(), **self.opt_kwargs) # Apply learning rate scheduler per step. @@ -989,18 +979,180 @@ def configure_optimizers( return [optimizer], {"scheduler": lr_scheduler, "interval": "step"} +class DbSpec2Pep(Spec2Pep): + """ + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database + search score function. + + Uses teacher forcing to 'query' Casanovo to score a peptide-spectrum + pair. Higher scores indicate a better match between the peptide and + spectrum. The amino acid-level scores are also returned. + + Also note that although teacher-forcing is used within this method, + there is *no training* involved. This is a prediction-only method. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.psm_batch_size = None + + def predict_step( + self, + batch: Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray], + *args, + ) -> List[ms_io.PepSpecMatch]: + """ + A single prediction step. + + Parameters + ---------- + batch : Tuple[torch.Tensor, torch.Tensor, np.ndarray, np.ndarray] + A batch of (i) MS/MS spectra, (ii) precursor information, + (iii) spectrum identifiers, (iv) candidate peptides. + + Returns + ------- + predictions: List[ms_io.PepSpecMatch] + Predicted PSMs for the given batch of spectra. + """ + predictions_all = collections.defaultdict(list) + for start_i in range(0, len(batch[0]), self.psm_batch_size): + psm_batch = [ + b[start_i : start_i + self.psm_batch_size] for b in batch + ] + pred, truth = self._forward_step( + psm_batch[0], psm_batch[1], psm_batch[3] + ) + pred = self.softmax(pred) + batch_peptide_scores, batch_aa_scores = _calc_match_score( + pred, truth, self.decoder.reverse + ) + for ( + charge, + precursor_mz, + spectrum_i, + peptide_score, + aa_scores, + peptide, + ) in zip( + psm_batch[1][:, 1].cpu().detach().numpy(), + psm_batch[1][:, 2].cpu().detach().numpy(), + psm_batch[2], + batch_peptide_scores, + batch_aa_scores, + psm_batch[3], + ): + spectrum_i = tuple(spectrum_i) + predictions_all[spectrum_i].append( + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=spectrum_i, + peptide_score=peptide_score, + charge=int(charge), + calc_mz=self.peptide_mass_calculator.mass( + peptide, charge + ), + exp_mz=precursor_mz, + aa_scores=aa_scores, + protein=self.protein_database.get_associated_protein( + peptide + ), + ) + ) + # Filter the top-scoring prediction(s) for each spectrum. + predictions = list( + itertools.chain.from_iterable( + [ + *( + sorted( + spectrum_predictions, + key=lambda p: p.peptide_score, + reverse=True, + )[: self.top_match] + for spectrum_predictions in predictions_all.values() + ) + ] + ) + ) + return predictions + + +def _calc_match_score( + batch_all_aa_scores: torch.Tensor, + truth_aa_indices: torch.Tensor, + decoder_reverse: bool = False, +) -> Tuple[List[float], List[np.ndarray]]: + """ + Calculate the score between the input spectra and associated + peptide. + + Take in teacher-forced scoring of amino acids of the peptides (in a + batch) and use the truth labels to calculate a score between the + input spectra and associated peptide. 
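In other words, the decoder is run once with the candidate peptide as its own input, and the score is read off the probabilities the model assigned to the ground-truth tokens. A toy illustration of that lookup for a single PSM; the geometric-mean aggregation mirrors the accompanying unit test, but treat the exact combination rule and the function name as assumptions:

```python
import torch

def score_candidate(aa_probs: torch.Tensor, truth: torch.Tensor) -> float:
    """aa_probs: (seq_len, vocab) softmax output from teacher forcing;
    truth: (seq_len,) token indices of the candidate peptide."""
    per_aa = aa_probs[torch.arange(truth.shape[0]), truth]
    per_aa = per_aa[truth != 0]  # Drop padding positions.
    # Geometric-mean-style aggregate of the matched probabilities.
    return per_aa.clamp_min(1e-10).log().mean().exp().item()
```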
+ + Parameters + ---------- + batch_all_aa_scores : torch.Tensor + Amino acid scores for all amino acids in the vocabulary for + every prediction made to generate the associated peptide (for an + entire batch). + truth_aa_indices : torch.Tensor + Indices of the score for each actual amino acid in the peptide + (for an entire batch). + decoder_reverse : bool + Whether the decoder is reversed. + + Returns + ------- + peptide_scores: List[float] + The peptide score for each PSM in the batch. + aa_scores : List[np.ndarray] + The amino acid scores for each PSM in the batch. + """ + # Remove trailing tokens from predictions based on decoder reversal. + if not decoder_reverse: + batch_all_aa_scores = batch_all_aa_scores[:, 1:] + else: + batch_all_aa_scores = batch_all_aa_scores[:, :-1] + + # Vectorized scoring using efficient indexing. + rows = ( + torch.arange(batch_all_aa_scores.shape[0]) + .unsqueeze(-1) + .expand(-1, batch_all_aa_scores.shape[1]) + ) + cols = torch.arange(0, batch_all_aa_scores.shape[1]).expand_as(rows) + + per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] + per_aa_scores = per_aa_scores.cpu().detach().numpy() + per_aa_scores[per_aa_scores == 0] += 1e-10 + score_mask = (truth_aa_indices != 0).cpu().detach().numpy() + peptide_scores, aa_scores = [], [] + for psm_score, psm_mask in zip(per_aa_scores, score_mask): + psm_aa_scores, psm_peptide_score = _aa_pep_score( + psm_score[psm_mask], True + ) + peptide_scores.append(psm_peptide_score) + aa_scores.append(psm_aa_scores) + + return peptide_scores, aa_scores + + class CosineWarmupScheduler(torch.optim.lr_scheduler._LRScheduler): """ - Learning rate scheduler with linear warm-up followed by cosine shaped decay. + Learning rate scheduler with linear warm-up followed by cosine + shaped decay. Parameters ---------- optimizer : torch.optim.Optimizer Optimizer object. warmup_iters : int - The number of iterations for the linear warm-up of the learning rate. + The number of iterations for the linear warm-up of the learning + rate. cosine_schedule_period_iters : int - The number of iterations for the cosine half period of the learning rate. + The number of iterations for the cosine half period of the + learning rate. """ def __init__( @@ -1030,8 +1182,8 @@ def _calc_mass_error( calc_mz: float, obs_mz: float, charge: int, isotope: int = 0 ) -> float: """ - Calculate the mass error in ppm between the theoretical m/z and the observed - m/z, optionally accounting for an isotopologue mismatch. + Calculate the mass error in ppm between the theoretical m/z and the + observed m/z, optionally accounting for an isotopologue mismatch. Parameters ---------- @@ -1056,18 +1208,20 @@ def _aa_pep_score( aa_scores: np.ndarray, fits_precursor_mz: bool ) -> Tuple[np.ndarray, float]: """ - Calculate amino acid and peptide-level confidence score from the raw amino - acid scores. + Calculate amino acid and peptide-level confidence score from the raw + amino acid scores. - The peptide score is the mean of the raw amino acid scores. The amino acid - scores are the mean of the raw amino acid scores and the peptide score. + The peptide score is the mean of the raw amino acid scores. The + amino acid scores are the mean of the raw amino acid scores and the + peptide score. Parameters ---------- aa_scores : np.ndarray Amino acid level confidence scores. fits_precursor_mz : bool - Flag indicating whether the prediction fits the precursor m/z filter. + Flag indicating whether the prediction fits the precursor m/z + filter. 
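As an aside, the warm-up/decay shape that `CosineWarmupScheduler` above produces can be summarized as a single learning-rate factor. This sketch is a plausible reading of the two constructor parameters, not a verbatim copy of the scheduler:

```python
import math

def lr_factor(step: int, warmup_iters: int, cosine_schedule_period_iters: int) -> float:
    # Half-cosine decay over the configured period...
    factor = 0.5 * (1.0 + math.cos(math.pi * step / cosine_schedule_period_iters))
    # ...scaled by a linear ramp during warm-up.
    if step <= warmup_iters:
        factor *= step / warmup_iters
    return factor
```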
Returns ------- diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 1b4ade13..30f86f24 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -4,7 +4,6 @@ import glob import logging import os -import re import tempfile import uuid import warnings @@ -22,10 +21,10 @@ from .. import utils from ..config import Config -from ..data import ms_io +from ..data import db_utils, ms_io from ..denovo.dataloaders import DeNovoDataModule from ..denovo.evaluate import aa_match_batch, aa_match_metrics -from ..denovo.model import Spec2Pep +from ..denovo.model import DbSpec2Pep, Spec2Pep logger = logging.getLogger("casanovo") @@ -45,11 +44,12 @@ class ModelRunner: The directory where checkpoint files will be saved. If `None` no checkpoint files will be saved and a warning will be triggered. output_rootname : str | None, optional - The root name for checkpoint files (e.g., checkpoints or results). If - `None` no base name will be used for checkpoint files. - overwrite_ckpt_check: bool, optional - Whether to check output_dir (if not `None`) for conflicting checkpoint + The root name for checkpoint files (e.g., checkpoints or + results). If `None` no base name will be used for checkpoint files. + overwrite_ckpt_check: bool, optional + Whether to check output_dir (if not `None`) for conflicting + checkpoint files. """ def __init__( @@ -123,6 +123,55 @@ def __exit__(self, exc_type, exc_value, traceback): if self.writer is not None: self.writer.save() + def db_search( + self, + peak_path: Iterable[str], + fasta_path: str, + results_path: str, + ) -> None: + """Perform database search with Casanovo. + + Parameters + ---------- + peak_path : Iterable[str] + The path with the MS data files for database search. + fasta_path : str + The path with the FASTA file for database search. + results_path : str + Sequencing results file path. + """ + self.writer = ms_io.MztabWriter(results_path) + self.writer.set_metadata( + self.config, + model=str(self.model_filename), + config_filename=self.config.file, + ) + self.initialize_trainer(train=True) + self.initialize_model(train=False, db_search=True) + self.model.out_writer = self.writer + self.model.psm_batch_size = self.config.predict_batch_size + self.model.protein_database = db_utils.ProteinDatabase( + fasta_path, + self.config.enzyme, + self.config.digestion, + self.config.missed_cleavages, + self.config.min_peptide_len, + self.config.max_peptide_len, + self.config.max_mods, + self.config.precursor_mass_tol, + self.config.isotope_error_range, + self.config.allowed_fixed_mods, + self.config.allowed_var_mods, + self.config.residues, + ) + test_index = self._get_index(peak_path, False, "db search") + self.writer.set_ms_run(test_index.ms_files) + + self.initialize_data_module(test_index=test_index) + self.loaders.protein_database = self.model.protein_database + self.loaders.setup(stage="test", annotated=False) + self.trainer.predict(self.model, self.loaders.db_dataloader()) + def train( self, train_peak_path: Iterable[str], @@ -136,10 +185,6 @@ def train( The path to the MS data files for training. valid_peak_path : iterable of str The path to the MS data files for validation. - - Returns - ------- - self """ self.initialize_trainer(train=True) self.initialize_model(train=True) @@ -156,16 +201,16 @@ def train( ) def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: - """Log peptide precision and amino acid precision + """Log peptide precision and amino acid precision. 
Calculate and log peptide precision and amino acid precision - based off of model predictions and spectrum annotations + based off of model predictions and spectrum annotations. Parameters ---------- test_index : AnnotatedSpectrumIndex - Index containing the annotated spectra used to generate model - predictions + Index containing the annotated spectra used to generate + model predictions. """ seq_pred = [] seq_true = [] @@ -192,8 +237,9 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: if self.config["top_match"] > 1: logger.warning( - "The behavior for calculating evaluation metrics is undefined when " - "the 'top_match' configuration option is set to a value greater than 1." + "The behavior for calculating evaluation metrics is undefined " + "when the 'top_match' configuration option is set to a value " + "greater than 1." ) logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) @@ -208,13 +254,14 @@ def predict( ) -> None: """Predict peptide sequences with a trained Casanovo model. - Can also evaluate model during prediction if provided with annotated - peak files. + Can also evaluate model during prediction if provided with + annotated peak files. Parameters ---------- - peak_path : iterable of str - The path with the MS data files for predicting peptide sequences. + peak_path : Iterable[str] + The path with the MS data files for predicting peptide + sequences. results_path : str Sequencing results file path evaluate: bool @@ -222,10 +269,6 @@ def predict( Note: peak_path most point to annotated MS data files when running model evaluation. Files that are not an annotated peak file format will be ignored if evaluate is set to true. - - Returns - ------- - self """ self.writer = ms_io.MztabWriter(results_path) self.writer.set_metadata( @@ -309,7 +352,7 @@ def initialize_trainer(self, train: bool) -> None: self.trainer = pl.Trainer(**trainer_cfg) - def initialize_model(self, train: bool) -> None: + def initialize_model(self, train: bool, db_search: bool = False) -> None: """Initialize the Casanovo model. Parameters @@ -317,6 +360,8 @@ def initialize_model(self, train: bool) -> None: train : bool Determines whether to set the model up for model training or evaluation / inference. + db_search : bool + Determines whether to use the DB search model subclass. """ tb_summarywriter = None if self.config.tb_summarywriter: @@ -335,7 +380,7 @@ def initialize_model(self, train: bool) -> None: n_layers=self.config.n_layers, dropout=self.config.dropout, dim_intensity=self.config.dim_intensity, - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, residues=self.config.residues, max_charge=self.config.max_charge, precursor_mass_tol=self.config.precursor_mass_tol, @@ -354,9 +399,10 @@ def initialize_model(self, train: bool) -> None: calculate_precision=self.config.calculate_precision, ) - # Reconfigurable non-architecture related parameters for a loaded model. + # Reconfigurable non-architecture related parameters for a + # loaded model. 
loaded_model_params = dict( - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, precursor_mass_tol=self.config.precursor_mass_tol, isotope_error_range=self.config.isotope_error_range, n_beams=self.config.n_beams, @@ -374,6 +420,9 @@ def initialize_model(self, train: bool) -> None: ) if self.model_filename is None: + if db_search: + logger.error("A model file must be provided for DB search") + raise ValueError("A model file must be provided for DB search") # Train a model from scratch if no model file is provided. if train: self.model = Spec2Pep(**model_params) @@ -382,7 +431,8 @@ def initialize_model(self, train: bool) -> None: else: logger.error("A model file must be provided") raise ValueError("A model file must be provided") - # Else a model file is provided (to continue training or for inference). + # Else a model file is provided (to continue training or for + # inference). if not Path(self.model_filename).exists(): logger.error( @@ -391,11 +441,12 @@ def initialize_model(self, train: bool) -> None: ) raise FileNotFoundError("Could not find the model weights file") - # First try loading model details from the weights file, otherwise use - # the provided configuration. + # First try loading model details from the weights file, + # otherwise use the provided configuration. device = torch.empty(1).device # Use the default device. + Model = DbSpec2Pep if db_search else Spec2Pep try: - self.model = Spec2Pep.load_from_checkpoint( + self.model = Model.load_from_checkpoint( self.model_filename, map_location=device, **loaded_model_params ) @@ -411,9 +462,10 @@ def initialize_model(self, train: bool) -> None: "using the checkpoint." ) except RuntimeError: - # This only doesn't work if the weights are from an older version + # This only doesn't work if the weights are from an older + # version. try: - self.model = Spec2Pep.load_from_checkpoint( + self.model = Model.load_from_checkpoint( self.model_filename, map_location=device, **model_params, @@ -432,7 +484,7 @@ def initialize_data_module( Union[AnnotatedSpectrumIndex, SpectrumIndex] ] = None, ) -> None: - """Initialize the data module + """Initialize the data module. Parameters ---------- @@ -471,8 +523,8 @@ def _get_index( ) -> Union[SpectrumIndex, AnnotatedSpectrumIndex]: """Get the spectrum index. - If the file is a SpectrumIndex, only one is allowed. Otherwise multiple - may be specified. + If the file is a SpectrumIndex, only one is allowed. Otherwise + multiple may be specified. Parameters ---------- @@ -532,15 +584,14 @@ def _get_index( def _get_strategy(self) -> Union[str, DDPStrategy]: """Get the strategy for the Trainer. - The DDP strategy works best when multiple GPUs are used. It can work - for CPU-only, but definitely fails using MPS (the Apple Silicon chip) - due to Gloo. + The DDP strategy works best when multiple GPUs are used. It can + work for CPU-only, but definitely fails using MPS (the Apple + Silicon chip) due to Gloo. Returns ------- Union[str, DDPStrategy] The strategy parameter for the Trainer. - """ if self.config.accelerator in ("cpu", "mps"): return "auto" @@ -558,8 +609,8 @@ def _get_peak_filenames( """ Get all matching peak file names from the path pattern. - Performs cross-platform path expansion akin to the Unix shell (glob, expand - user, expand vars). + Performs cross-platform path expansion akin to the Unix shell (glob, + expand user, expand vars). 
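That expansion amounts to chaining the standard library's user and variable expansion with glob. A minimal sketch of the behavior described above (the example pattern is illustrative):

```python
import glob
import os

def expand_peak_paths(pattern: str) -> list:
    # Expand "~" and environment variables first, then glob the result,
    # approximating Unix shell behavior on any platform.
    expanded = os.path.expandvars(os.path.expanduser(pattern))
    return sorted(glob.glob(expanded))

# e.g. expand_peak_paths("~/experiments/*.mzML")
```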
Parameters ---------- diff --git a/casanovo/utils.py b/casanovo/utils.py index 43b1cb7d..86e0748f 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -8,7 +8,7 @@ import socket import sys from datetime import datetime -from typing import Tuple, Dict, List, Optional, Iterable +from typing import Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd @@ -18,7 +18,7 @@ from .data.psm import PepSpecMatch -SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] +SCORE_BINS = (0.0, 0.5, 0.9, 0.95, 0.99) logger = logging.getLogger("casanovo") @@ -27,8 +27,8 @@ def n_workers() -> int: """ Get the number of workers to use for data loading. - This is the maximum number of CPUs allowed for the process, scaled for the - number of GPUs being used. + This is the maximum number of CPUs allowed for the process, scaled + for the number of GPUs being used. On Windows and MacOS, we only use the main process. See: https://discuss.pytorch.org/t/errors-when-using-num-workers-0-in-dataloader/97564/4 @@ -79,7 +79,7 @@ def split_version(version: str) -> Tuple[str, str, str]: def get_score_bins( - scores: pd.Series, score_bins: List[float] + scores: pd.Series, score_bins: Iterable[float] ) -> Dict[float, int]: """ Get binned confidence scores @@ -92,14 +92,14 @@ def get_score_bins( ---------- scores: pd.Series Series of assigned peptide scores. - score_bins: List[float] + score_bins: Iterable[float] Confidence scores to map. Returns ------- score_bin_dict: Dict[float, int] - Dictionary mapping each confidence score to the number of spectra - with a confidence greater than or equal to it. + Dictionary mapping each confidence score to the number of + spectra with a confidence greater than or equal to it. """ return {score: (scores >= score).sum() for score in score_bins} @@ -116,8 +116,8 @@ def get_peptide_lengths(sequences: pd.Series) -> np.ndarray: Returns ------- sequence_lengths: np.ndarray - Numpy array containing the length of each sequence, listed in the - same order that the sequences are provided in. + Numpy array containing the length of each sequence, listed in + the same order that the sequences are provided in. """ # Mass modifications do not contribute to sequence length # FIXME: If PTMs are represented in ProForma notation this filtering @@ -126,7 +126,7 @@ def get_peptide_lengths(sequences: pd.Series) -> np.ndarray: def get_report_dict( - results_table: pd.DataFrame, score_bins: List[float] = SCORE_BINS + results_table: pd.DataFrame, score_bins: Iterable[float] = SCORE_BINS ) -> Optional[Dict]: """ Generate sequencing run report @@ -134,15 +134,16 @@ def get_report_dict( Parameters ---------- results_table: pd.DataFrame - Parsed spectrum match table - score_bins: List[float], Optional - Confidence scores for creating confidence CMF, see get_score_bins + Parsed spectrum match table. + score_bins: Iterable[float], Optional + Confidence scores for creating confidence CMF, see + `get_score_bins`. Returns ------- report_gen: Dict Generated report represented as a dictionary, or None if no - sequencing predictions were logged + sequencing predictions were logged. 
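Since `get_score_bins` underpins this report, a quick worked example of the mapping it produces, using invented scores:

```python
import pandas as pd

scores = pd.Series([0.30, 0.55, 0.92, 0.96, 0.999])
score_bins = (0.0, 0.5, 0.9, 0.95, 0.99)
# Each cut-off maps to the number of PSMs scoring at least that high.
counts = {score: int((scores >= score).sum()) for score in score_bins}
assert counts == {0.0: 5, 0.5: 4, 0.9: 3, 0.95: 2, 0.99: 1}
```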
""" if results_table.empty: return None @@ -161,16 +162,16 @@ def get_report_dict( def log_run_report( - start_time: Optional[int] = None, end_time: Optional[int] = None + start_time: Optional[float] = None, end_time: Optional[float] = None ) -> None: """ Log general run report Parameters ---------- - start_time : Optional[int], default=None + start_time : Optional[float], default=None The start time of the sequencing run in seconds since the epoch. - end_time : Optional[int], default=None + end_time : Optional[float], default=None The end time of the sequencing run in seconds since the epoch. """ logger.info("======= End of Run Report =======") @@ -195,28 +196,26 @@ def log_run_report( logger.info("Max GPU Memory Utilization: %d MiB", gpu_util >> 20) -def log_sequencing_report( +def log_annotate_report( predictions: List[PepSpecMatch], - start_time: Optional[int] = None, - end_time: Optional[int] = None, - score_bins: List[float] = SCORE_BINS, + start_time: Optional[float] = None, + end_time: Optional[float] = None, + score_bins: Iterable[float] = SCORE_BINS, ) -> None: """ - Log sequencing run report + Log run annotation report. Parameters ---------- - next_prediction : Tuple[ - str, Tuple[str, str], float, float, float, float, str - ] - PSM predictions - start_time : Optional[int], default=None + predictions: List[PepSpecMatch] + PSM predictions. + start_time : Optional[float], default=None The start time of the sequencing run in seconds since the epoch. - end_time : Optional[int], default=None + end_time : Optional[float], default=None The end time of the sequencing run in seconds since the epoch. - score_bins: List[float], Optional + score_bins: Iterable[float], Optional Confidence scores for creating confidence score distribution, - see get_score_bins + see `get_score_bins`. """ log_run_report(start_time=start_time, end_time=end_time) run_report = get_report_dict( diff --git a/docs/images/help.svg b/docs/images/help.svg index dbdc05e0..d25376e4 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - + - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               
- -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io                               + • Official code repository: https://github.com/Noble-Lab/casanovo              + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +db-search Perform a database search on MS/MS data using Casanovo-DB.         +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           +version   Get the Casanovo version information.                              
+╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index ea6ff078..6354851d 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -19,171 +19,171 @@ font-weight: 700; } - .terminal-3610042700-matrix { + .terminal-3608076648-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-3610042700-title { + .terminal-3608076648-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-3610042700-r1 { fill: #c5c8c6 } -.terminal-3610042700-r2 { fill: #d0b344 } -.terminal-3610042700-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-3610042700-r4 { fill: #68a0b3;font-weight: bold } -.terminal-3610042700-r5 { fill: #868887 } -.terminal-3610042700-r6 { fill: #cc555a } -.terminal-3610042700-r7 { fill: #d0b344;font-weight: bold } -.terminal-3610042700-r8 { fill: #8a4346 } -.terminal-3610042700-r9 { fill: #98a84b;font-weight: bold } -.terminal-3610042700-r10 { fill: #8d7b39;font-weight: bold } + .terminal-3608076648-r1 { fill: #c5c8c6 } +.terminal-3608076648-r2 { fill: #d0b344 } +.terminal-3608076648-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-3608076648-r4 { fill: #68a0b3;font-weight: bold } +.terminal-3608076648-r5 { fill: #868887 } +.terminal-3608076648-r6 { fill: #cc555a } +.terminal-3608076648-r7 { fill: #d0b344;font-weight: bold } +.terminal-3608076648-r8 { fill: #8a4346 } +.terminal-3608076648-r9 { fill: #98a84b;font-weight: bold } +.terminal-3608076648-r10 { fill: #8d7b39;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -195,56 +195,56 @@ - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  - peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    - MGF file.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---evaluate-e  Run in evaluation mode.     -                                                   When this flag is set the   -                                                   peptide and amino acid      -                                                   precision will be           -                                                   calculated and logged at    -                                                   the end of the sequencing   -                                                   run. All input files must   -                                                   be annotated MGF files if   -                                                   running in evaluation       -                                                   mode.                       
---model-mTEXT                       Either the model weights    -                                                   (.ckpt file) or a URL       -                                                   pointing to the model       -                                                   weights file. If not        -                                                   provided, Casanovo will     -                                                   try to download the latest  -                                                   release automatically.      ---output_dir-dPATH                       The destination directory   -                                                   for output files            ---output_root-oFILE                       The root name for all       -                                                   output files                ---config-cFILE                       The YAML configuration      -                                                   file overriding the         -                                                   default options.            ---verbosity-v[debug|info|warning|error  Set the verbosity of        -]  console logging messages.   -                                                   Log files are always set    -                                                   to 'debug'.                 ---force_overwrite-f  Whether to overwrite        -                                                   output files.               ---help-h  Show this message and       -                                                   exit.                       -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence  + peptides. If evaluate is set to True PEAK_PATH must be one or more annotated    + MGF file.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--evaluate-e  Run in evaluation mode.     +                                                   When this flag is set the   +                                                   peptide and amino acid      +                                                   precision will be           +                                                   calculated and logged at    +                                                   the end of the sequencing   +                                                   run. All input files must   +                                                   be annotated MGF files if   +                                                   running in evaluation       +                                                   mode.                       +--model-mTEXT                       Either the model weights    +                                                   (.ckpt file) or a URL       +                                                   pointing to the model       +                                                   weights file. 
If not        +                                                   provided, Casanovo will     +                                                   try to download the latest  +                                                   release automatically.      +--output_dir-dPATH                       The destination directory   +                                                   for output files.           +--output_root-oFILE                       The root name for all       +                                                   output files.               +--config-cFILE                       The YAML configuration      +                                                   file overriding the         +                                                   default options.            +--verbosity-v[debug|info|warning|error  Set the verbosity of        +]  console logging messages.   +                                                   Log files are always set    +                                                   to 'debug'.                 +--force_overwrite-f  Whether to overwrite        +                                                   output files.               +--help-h  Show this message and       +                                                   exit.                       +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index 783a0660..8aab62d4 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -19,162 +19,162 @@ font-weight: 700; } - .terminal-2920970231-matrix { + .terminal-3079567379-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-2920970231-title { + .terminal-3079567379-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-2920970231-r1 { fill: #c5c8c6 } -.terminal-2920970231-r2 { fill: #d0b344 } -.terminal-2920970231-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-2920970231-r4 { fill: #68a0b3;font-weight: bold } -.terminal-2920970231-r5 { fill: #868887 } -.terminal-2920970231-r6 { fill: #cc555a } -.terminal-2920970231-r7 { fill: #d0b344;font-weight: bold } -.terminal-2920970231-r8 { fill: #8a4346 } -.terminal-2920970231-r9 { fill: #98a84b;font-weight: bold } -.terminal-2920970231-r10 { fill: #8d7b39;font-weight: bold } + .terminal-3079567379-r1 { fill: #c5c8c6 } +.terminal-3079567379-r2 { fill: #d0b344 } +.terminal-3079567379-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-3079567379-r4 { fill: #68a0b3;font-weight: bold } +.terminal-3079567379-r5 { fill: #868887 } +.terminal-3079567379-r6 { fill: #cc555a } +.terminal-3079567379-r7 { fill: #d0b344;font-weight: bold } +.terminal-3079567379-r8 { fill: #8a4346 } +.terminal-3079567379-r9 { fill: #98a84b;font-weight: bold } +.terminal-3079567379-r10 { fill: #8d7b39;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -186,53 +186,53 @@ - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         
- -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---validation_peak_path-pFILE                    An annotated MGF file     -                                                     for validation, like      -                                                     from MassIVE-KB. Use      -                                                     this option multiple      -                                                     times to specify          -                                                     multiple files.           ---model-mTEXT                    Either the model weights  -                                                     (.ckpt file) or a URL     -                                                     pointing to the model     -                                                     weights file. If not      -                                                     provided, Casanovo will   -                                                     try to download the       -                                                     latest release            -                                                     automatically.            ---output_dir-dPATH                    The destination           -                                                     directory for output      -                                                     files                     ---output_root-oFILE                    The root name for all     -                                                     output files              ---config-cFILE                    The YAML configuration    -                                                     file overriding the       -                                                     default options.          ---verbosity-v[debug|info|warning|er  Set the verbosity of      -ror]  console logging           -                                                     messages. Log files are   -                                                     always set to 'debug'.    ---force_overwrite-f  Whether to overwrite      -                                                     output files.             ---help-h  Show this message and     -                                                     exit.                     -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--validation_peak_path-pFILE                    An annotated MGF file     +                                                     for validation, like      +                                                     from MassIVE-KB. 
Use      +                                                     this option multiple      +                                                     times to specify          +                                                     multiple files.           +--model-mTEXT                    Either the model weights  +                                                     (.ckpt file) or a URL     +                                                     pointing to the model     +                                                     weights file. If not      +                                                     provided, Casanovo will   +                                                     try to download the       +                                                     latest release            +                                                     automatically.            +--output_dir-dPATH                    The destination           +                                                     directory for output      +                                                     files.                    +--output_root-oFILE                    The root name for all     +                                                     output files.             +--config-cFILE                    The YAML configuration    +                                                     file overriding the       +                                                     default options.          +--verbosity-v[debug|info|warning|er  Set the verbosity of      +ror]  console logging           +                                                     messages. Log files are   +                                                     always set to 'debug'.    +--force_overwrite-f  Whether to overwrite      +                                                     output files.             +--help-h  Show this message and     +                                                     exit.                     
+╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/tests/conftest.py b/tests/conftest.py index 5c927eb1..a35c5834 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,11 @@ """Fixtures used for testing.""" +import depthcharge import numpy as np import psims import pytest import yaml -from pyteomics.mass import calculate_mass +from pyteomics.mass import calculate_mass, fast_mass, std_aa_mass @pytest.fixture @@ -15,6 +16,36 @@ def mgf_small(tmp_path): return _create_mgf(peptides, mgf_file) +@pytest.fixture +def tiny_fasta_file(tmp_path): + fasta_file = tmp_path / "tiny_fasta.fasta" + with fasta_file.open("w+") as fasta_ref: + fasta_ref.write( + ( + ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQ" + "QKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + ) + ) + return fasta_file + + +@pytest.fixture +def mgf_medium(tmp_path): + """An MGF file with 7 spectra and scan numbers, + C+57.021 mass modification considered""" + peptides = [ + "ATSIPAR", + "VTLSCR", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", + ] + mgf_file = tmp_path / "db_search.mgf" + return _create_mgf(peptides, mgf_file, mod_aa_mass={"C": 160.030649}) + + @pytest.fixture def mgf_small_unannotated(tmp_path): """An MGF file with 2 unannotated spectra.""" @@ -23,7 +54,9 @@ def mgf_small_unannotated(tmp_path): return _create_mgf(peptides, mgf_file, annotate=False) -def _create_mgf(peptides, mgf_file, random_state=42, annotate=True): +def _create_mgf( + peptides, mgf_file, random_state=42, mod_aa_mass=None, annotate=True +): """ Create a fake MGF file from one or more peptides. @@ -35,6 +68,9 @@ def _create_mgf(peptides, mgf_file, random_state=42, annotate=True): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. + mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. + e.g. {"C": 160.030649} for carbamidomethylated C. annotate: bool, optional Whether to add peptide annotations to mgf file @@ -44,7 +80,9 @@ def _create_mgf(peptides, mgf_file, random_state=42, annotate=True): """ rng = np.random.default_rng(random_state) entries = [ - _create_mgf_entry(p, rng.choice([2, 3]), annotate=annotate) + _create_mgf_entry( + p, rng.choice([2, 3]), mod_aa_mass=mod_aa_mass, annotate=annotate + ) for p in peptides ] with mgf_file.open("w+") as mgf_ref: @@ -53,7 +91,7 @@ def _create_mgf(peptides, mgf_file, random_state=42, annotate=True): return mgf_file -def _create_mgf_entry(peptide, charge=2, annotate=True): +def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -63,6 +101,8 @@ def _create_mgf_entry(peptide, charge=2, annotate=True): A peptide sequence. charge : int, optional The peptide charge state. + mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. annotate: bool, optional Whether to add peptide annotation to entry @@ -71,7 +111,12 @@ def _create_mgf_entry(peptide, charge=2, annotate=True): str The PSM entry in an MGF file format. 
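The switch to `fast_mass` with an overridable `aa_mass` table (in the hunk that follows) is what lets the fixture emit carbamidomethylated cysteines. A standalone check of that mechanism, with the peptide chosen purely for illustration:

```python
from pyteomics.mass import fast_mass, std_aa_mass

aa_mass = std_aa_mass.copy()
aa_mass["C"] = 160.030649  # C+57.021: carbamidomethylated cysteine.

plain = fast_mass("VTLSCR", charge=2)
modified = fast_mass("VTLSCR", charge=2, aa_mass=aa_mass)
# At charge 2, the +57.021 Da modification shifts the m/z by half as much.
assert abs((modified - plain) - 57.021464 / 2) < 1e-4
```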
""" - precursor_mz = calculate_mass(peptide, charge=int(charge)) + if mod_aa_mass is None: + precursor_mz = fast_mass(peptide, charge=int(charge)) + else: + aa_mass = std_aa_mass.copy() + aa_mass.update(mod_aa_mass) + precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) @@ -218,6 +263,11 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, + "max_peptide_len": 100, + "enzyme": "trypsin", + "digestion": "full", + "missed_cleavages": 0, + "max_mods": None, "predict_batch_size": 1024, "n_beams": 1, "top_match": 1, @@ -236,7 +286,6 @@ def tiny_config(tmp_path): "dim_model": 512, "dropout": 0.0, "dim_intensity": None, - "max_length": 100, "learning_rate": 5e-4, "weight_decay": 1e-5, "train_batch_size": 32, @@ -271,6 +320,11 @@ def tiny_config(tmp_path): "-17.027": -17.026549, "+43.006-17.027": 25.980265, }, + "allowed_fixed_mods": "C:C+57.021", + "allowed_var_mods": ( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), } cfg_file = tmp_path / "config.yml" @@ -278,3 +332,8 @@ def tiny_config(tmp_path): yaml.dump(cfg, out_file) return cfg_file + + +@pytest.fixture +def residues_dict(): + return depthcharge.masses.PeptideMass("massivekb").masses diff --git a/tests/test_integration.py b/tests/test_integration.py index bb6ef66e..7dab1b5b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,6 +11,73 @@ TEST_DIR = Path(__file__).resolve().parent +def test_db_search( + mgf_medium, tiny_fasta_file, tiny_config, tmp_path, monkeypatch +): + # Run a command: + monkeypatch.setattr(casanovo, "__version__", "4.1.0") + run = functools.partial( + CliRunner().invoke, casanovo.main, catch_exceptions=False + ) + + output_rootname = "db" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") + + search_args = [ + "db-search", + "--config", + tiny_config, + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, + str(mgf_medium), + str(tiny_fasta_file), + ] + + result = run(search_args) + + assert result.exit_code == 0 + assert output_filename.exists() + + mztab = pyteomics.mztab.MzTab(str(output_filename)) + + psms = mztab.spectrum_match_table + assert list(psms.sequence) == [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # Validate mztab output + validate_args = [ + "java", + "-jar", + f"{TEST_DIR}/jmzTabValidator.jar", + "--check", + f"inFile={output_filename}", + ] + + validate_result = subprocess.run( + validate_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + assert validate_result.returncode == 0 + assert not any( + [ + line.startswith("[Error-") + for line in validate_result.stdout.splitlines() + ] + ) + + def test_train_and_run( mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch ): @@ -110,8 +177,8 @@ def test_train_and_run( mztab = pyteomics.mztab.MzTab(str(output_filename)) filename = "small.mgf" # Verify that the input annotated peak file is listed in the metadata. 
- assert f"ms_run[1]-location" in mztab.metadata - assert mztab.metadata[f"ms_run[1]-location"].endswith(filename) + assert "ms_run[1]-location" in mztab.metadata + assert mztab.metadata["ms_run[1]-location"].endswith(filename) # Verify that the spectrum predictions are correct # and indexed according to the peak input file type. diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 985cfb4b..00617457 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -18,15 +18,16 @@ import einops import github import numpy as np +import pandas as pd import pytest import torch from casanovo import casanovo from casanovo import utils -from casanovo.data import ms_io +from casanovo.data import db_utils, ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset -from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics, aa_match -from casanovo.denovo.model import Spec2Pep, _aa_pep_score +from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics +from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score from depthcharge.data import SpectrumIndex, AnnotatedSpectrumIndex @@ -452,6 +453,776 @@ def test_aa_pep_score(): assert peptide_score == pytest.approx(0.5) +def test_peptide_generator_errors(residues_dict, tiny_fasta_file): + with pytest.raises(FileNotFoundError): + [ + (a, b) + for a, b in db_utils._peptide_generator( + "fail.fasta", "trypsin", "full", 0, 5, 10, residues_dict + ) + ] + with pytest.raises(ValueError): + [ + (a, b) + for a, b in db_utils._peptide_generator( + tiny_fasta_file, "trypsin", "fail", 0, 5, 10, residues_dict + ) + ] + + +def test_to_neutral_mass(): + mz = 500 + charge = 2 + neutral_mass = db_utils._to_neutral_mass(mz, charge) + assert neutral_mass == 997.98544706646 + + mz = 500 + charge = 1 + neutral_mass = db_utils._to_neutral_mass(mz, charge) + assert neutral_mass == 498.99272353323 + + +def test_calc_match_score(): + """ + Test the calculation of geometric scores using teacher-forced + decoder output probabilities and ground truth amino acid sequences. 
+ """ + first_slot_prob = torch.zeros(29) + first_slot_prob[1] = 1.0 # A + second_slot_prob = torch.zeros(29) + second_slot_prob[2] = 1.0 # B + third_slot_prob = torch.zeros(29) + third_slot_prob[3] = 1.0 # C + stop_slot_prob = torch.zeros(29) + stop_slot_prob[28] = 1.0 # $ + blank_slot_prob = torch.zeros(29) + blank_slot_prob[0] = 0.42 # Should never come into play + fourth_slot_prob = torch.zeros(29) + fourth_slot_prob[4] = 0.5 # D + fifth_slot_prob = torch.zeros(29) + fifth_slot_prob[5] = 0.5 # E + + pep_1_aa = torch.stack( + [ + first_slot_prob, + second_slot_prob, + third_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + pep_2_aa = torch.stack( + [ + third_slot_prob, + second_slot_prob, + stop_slot_prob, + blank_slot_prob, + blank_slot_prob, + ] + ) + pep_3_aa = torch.stack( + [ + fourth_slot_prob, + fifth_slot_prob, + first_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + pep_4_aa = torch.stack( + [ + first_slot_prob, + second_slot_prob, + third_slot_prob, + stop_slot_prob, + blank_slot_prob, + ] + ) + batch_all_aa_scores = torch.stack([pep_1_aa, pep_2_aa, pep_3_aa, pep_4_aa]) + truth_aa_indices = torch.tensor( + [[1, 2, 3, 28], [3, 2, 28, 0], [4, 5, 1, 28], [2, 2, 3, 28]] + ) + + all_scores, masked_per_aa_scores = _calc_match_score( + batch_all_aa_scores, truth_aa_indices, True + ) + + assert all_scores[0] == np.exp(0) + assert all_scores[1] == np.exp(0) + assert all_scores[2] == pytest.approx( + np.exp(np.log(0.5 * 0.5 * 1 * 1) / 4) + ) + assert all_scores[3] == pytest.approx( + np.exp(np.log(1e-10 * 1 * 1 * 1) / 4) + ) + + aa_scores = np.array([1, 1, 1, 1]) + assert np.allclose(masked_per_aa_scores[0], (aa_scores + 1) / 2) + aa_scores = np.array([1, 1, 1]) + assert np.allclose(masked_per_aa_scores[1], (aa_scores + 1) / 2) + aa_scores = np.array([0.5, 0.5, 1, 1]) + assert np.allclose( + masked_per_aa_scores[2], + (aa_scores + np.exp(np.log(0.5 * 0.5 * 1 * 1) / 4)) / 2, + ) + aa_scores = np.array([1e-10, 1, 1, 1]) + assert np.allclose( + masked_per_aa_scores[3], + (aa_scores + np.exp(np.log(1e-10 * 1 * 1 * 1) / 4)) / 2, + ) + + +def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): + + # No missed cleavages + expected_normal = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 1 missed cleavage + expected_1missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 3 missed cleavages + expected_3missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSC+57.021R", + 
"EIVMTQSPPTLSLSPGERVTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "LLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + for missed_cleavages, expected in zip( + (0, 1, 3), + (expected_normal, expected_1missedcleavage, expected_3missedcleavage), + ): + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=missed_cleavages, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected + + +def test_digest_fasta_mods(tiny_fasta_file, residues_dict): + # 1 modification allowed + # fixed: C+57.02146 + # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 + # nterm: 1X+42.010565,1X+43.005814,1X-17.026549,1X+25.980265 + expected_1mod = [ + "-17.027ATSIPAR", + "ATSIPAR", + "-17.027VTLSC+57.021R", + "VTLSC+57.021R", + "+43.006-17.027ATSIPAR", + "+42.011ATSIPAR", + "+43.006ATSIPAR", + "+43.006-17.027VTLSC+57.021R", + "+42.011VTLSC+57.021R", + "+43.006VTLSC+57.021R", + "-17.027LLIYGASTR", + "LLIYGASTR", + "+43.006-17.027LLIYGASTR", + "+42.011LLIYGASTR", + "+43.006LLIYGASTR", + "-17.027EIVMTQSPPTLSLSPGER", + "EIVMTQSPPTLSLSPGER", + "EIVMTQ+0.984SPPTLSLSPGER", + "EIVM+15.995TQSPPTLSLSPGER", + "+43.006-17.027EIVMTQSPPTLSLSPGER", + "+42.011EIVMTQSPPTLSLSPGER", + "+43.006EIVMTQSPPTLSLSPGER", + "-17.027MEAPAQLLFLLLLWLPDTTR", + "-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # + "MEAPAQLLFLLLLWLPDTTR", + "MEAPAQ+0.984LLFLLLLWLPDTTR", + "M+15.995EAPAQLLFLLLLWLPDTTR", + "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", + "+43.006-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # + "+42.011MEAPAQLLFLLLLWLPDTTR", + "+43.006MEAPAQLLFLLLLWLPDTTR", + "+42.011M+15.995EAPAQLLFLLLLWLPDTTR", # + "+43.006M+15.995EAPAQLLFLLLLWLPDTTR", # + "-17.027ASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPR", + "ASQ+0.984SVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQ+0.984QKPGQAPR", + "ASQSVSSSYLTWYQQ+0.984KPGQAPR", + "ASQSVSSSYLTWYQQKPGQ+0.984APR", + "+43.006-17.027ASQSVSSSYLTWYQQKPGQAPR", + "+42.011ASQSVSSSYLTWYQQKPGQAPR", + "+43.006ASQSVSSSYLTWYQQKPGQAPR", + "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", + "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=1, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_1mod + + +def test_length_restrictions(tiny_fasta_file, residues_dict): + # length between 20 and 50 + expected_long = [ + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # length between 6 and 8 + expected_short = ["ATSIPAR", "VTLSC+57.021R"] + + 
pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=20, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_long + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=8, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_short + + +def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): + # arg-c enzyme + expected_argc = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # asp-n enzyme + expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] + + expected_semispecific = [ + "FSGSGS", + "ATSIPA", + "ASQSVS", + "PGQAPR", + "TSIPAR", + "MEAPAQ", + "LLIYGA", + "YGASTR", + "LSPGER", + "LPDTTR", + "EIVMTQ", + "VTLSC+57.021R", + "QDYNLP", + ] + + expected_nonspecific = [ + "SGSGSG", + "GSGSGT", + "SGSGTD", + "FSGSGS", + "ATSIPA", + "GASTRA", + "LSLSPG", + "ASQSVS", + "GSGTDF", + "SLSPGE", + "QSVSSS", + "SQSVSS", + "KPGQAP", + "SPPTLS", + "ASTRAT", + "RFSGSG", + "IYGAST", + "APAQLL", + "PTLSLS", + "TLSLSP", + "TLTISS", + "STRATS", + "LIYGAS", + "ARFSGS", + "PGQAPR", + "SGTDFT", + "PPTLSL", + "EAPAQL", + "QKPGQA", + "SVSSSY", + "TQSPPT", + "LTISSL", + "PARFSG", + "GQAPRL", + "QSPPTL", + "SPGERV", + "ISSLQP", + "RATSIP", + "TSIPAR", + "MEAPAQ", + "RASQSV", + "TISSLQ", + "TRATSI", + "LLIYGA", + "GTDFTL", + "YGASTR", + "VSSSYL", + "SSSYLT", + "LSPGER", + "PGERVT", + "MTQSPP", + "SSLQPE", + "VMTQSP", + "GERVTL", + "PEDFAV", + "IVMTQS", + "FTLTIS", + "APRLLI", + "QQKPGQ", + "SLQPED", + "PAQLLF", + "IPARFS", + "SIPARF", + "LSC+57.021RAS", + "TDFTLT", + "QAPRLL", + "LPDTTR", + "ERVTLS", + "AQLLFL", + "QPEDFA", + "TLSC+57.021RA", + "C+57.021RASQS", + "SC+57.021RASQ", + "DFTLTI", + "PDTTRE", + "TTREIV", + "EIVMTQ", + "YQQKPG", + "LFLLLL", + "LLFLLL", + "WLPDTT", + "DTTREI", + "RLLIYG", + "RVTLSC+57.021", + "VTLSC+57.021R", + "EDFAVY", + "LWLPDT", + "QLLFLL", + "LQPEDF", + "REIVMT", + "TREIVM", + "QDYNLP", + "LLLWLP", + "SSYLTW", + "LLWLPD", + "LLLLWL", + "PRLLIY", + "DFAVYY", + "QQDYNL", + "AVYYC+57.021Q", + "FLLLLW", + "FAVYYC+57.021", + "C+57.021QQDYN", + "SYLTWY", + "LTWYQQ", + "WYQQKP", + "TWYQQK", + "VYYC+57.021QQ", + "YLTWYQ", + "YC+57.021QQDY", + "YYC+57.021QQD", + ] + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="arg-c", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_argc + + pdb = db_utils.ProteinDatabase( + 
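+        # Asp-N cleaves on the N-terminal side of aspartate, so the
+        # expected peptides begin with D (plus the protein N-terminus).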
fasta_path=str(tiny_fasta_file), + enzyme="asp-n", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_aspn + + # Test regex rule instead of named enzyme + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="R", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_argc + + # Test semispecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="partial", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_semispecific + + # Test nonspecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="non-specific", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + assert pdb.db_peptides.index.to_list() == expected_nonspecific + + +def test_get_candidates(tiny_fasta_file, residues_dict): + # precursor_window is 10000 + expected_smallwindow = ["LLIYGASTR"] + + # precursor window is 150000 + expected_midwindow = ["LLIYGASTR"] + + # precursor window is 600000 + expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_smallwindow == list(candidates) + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=150000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_midwindow == list(candidates) + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + 
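+        # The widest precursor tolerance should admit all three candidate
+        # peptides near the precursor at m/z 496.2, charge 2.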
digestion="full", + missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=600000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_widewindow == list(candidates) + + +def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): + + # Tide isotope error windows for 496.2, 2+: + # 0: [980.481617, 1000.289326] + # 1: [979.491114, 999.278813] + # 2: [978.500611, 998.268300] + # 3: [977.510108, 997.257787] + + peptide_list = [ + ("A", 1001, "foo"), + ("B", 1000, "foo"), + ("C", 999, "foo"), + ("D", 998, "foo"), + ("E", 997, "foo"), + ("F", 996, "foo"), + ("G", 995, "foo"), + ("H", 994, "foo"), + ("I", 993, "foo"), + ("J", 992, "foo"), + ("K", 991, "foo"), + ("L", 990, "foo"), + ("M", 989, "foo"), + ("N", 988, "foo"), + ("O", 987, "foo"), + ("P", 986, "foo"), + ("Q", 985, "foo"), + ("R", 984, "foo"), + ("S", 983, "foo"), + ("T", 982, "foo"), + ("U", 981, "foo"), + ("V", 980, "foo"), + ("W", 979, "foo"), + ("X", 978, "foo"), + ("Y", 977, "foo"), + ("Z", 976, "foo"), + ] + + peptide_list = pd.DataFrame( + peptide_list, columns=["peptide", "calc_mass", "protein"] + ).set_index("peptide") + peptide_list.sort_values("calc_mass", inplace=True) + + expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") + expected_isotope01 = list("VUTSRQPONMLKJIHGFEDCB") + expected_isotope012 = list("WVUTSRQPONMLKJIHGFEDCB") + expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + pdb.db_peptides = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_isotope0 == list(candidates) + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 1], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + pdb.db_peptides = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_isotope01 == list(candidates) + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 2], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + pdb.db_peptides = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_isotope012 == list(candidates) + + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + 
min_peptide_len=0, + max_peptide_len=0, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 3], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + pdb.db_peptides = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) + assert expected_isotope0123 == list(candidates) + + def test_beam_search_decode(): """ Test beam search decoding and its sub-functions. @@ -462,7 +1233,7 @@ def test_beam_search_decode(): # Sizes. batch = 1 # B - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V beam = model.n_beams # S step = 3 @@ -579,12 +1350,12 @@ def test_beam_search_decode(): assert torch.equal(new_scores[:, step, :], expected_scores) # Test output if decoding loop isn't stopped with termination of all beams. - model.max_length = 0 + model.max_peptide_len = 0 # 1 spectrum with 5 peaks (2 values: m/z and intensity). spectra = torch.zeros(1, 5, 2) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) assert len(list(model.beam_search_decode(spectra, precursors))[0]) == 0 - model.max_length = 100 + model.max_peptide_len = 100 # Re-initialize scores and tokens to further test caching functionality. scores = torch.full( @@ -744,7 +1515,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4 @@ -785,7 +1556,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4