From 6431cbf68f6bb8f1719a56cc8c2d88e95ef51b97 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 27 Jun 2024 16:44:34 -0700 Subject: [PATCH 01/14] downlaod weights from URL --- casanovo/casanovo.py | 98 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8bdfa58f..54d6d649 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -37,6 +37,8 @@ import torch import tqdm from lightning.pytorch import seed_everything +from hashlib import shake_256 +from urllib.parse import urlparse from . import __version__ from . import utils @@ -59,10 +61,9 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-m", "--model"), help=""" - The model weights (.ckpt file). If not provided, Casanovo - will try to download the latest release. + Either the model weights (.ckpt file) or a URL pointing to the model weights + file. If not provided, Casanovo will try to download the latest release. """, - type=click.Path(exists=True, dir_okay=False), ), click.Option( ("-o", "--output"), @@ -354,9 +355,10 @@ def setup_model( seed_everything(seed=config["random_seed"], workers=True) # Download model weights if these were not specified (except when training). + cache_dir = appdirs.user_cache_dir("casanovo", False, opinion=False) if model is None and not is_train: try: - model = _get_model_weights() + model = _get_model_weights(cache_dir) except github.RateLimitExceededException: logger.error( "GitHub API rate limit exceeded while trying to download the " @@ -371,6 +373,17 @@ def setup_model( "model weights" ) from None + # Download model from URL if model is a valid url + is_url = _is_valid_url(model) + if (model is not None) and is_url: + model = _get_weights_from_url(model, Path(cache_dir)) + + if (model is not None) and (not is_url) and (not Path(model).is_file()): + raise ValueError( + f"{model} is not a valid URL or checkpoint file path, " + "--model argument must be a URL or checkpoint file path" + ) + # Log the active configuration. logger.info("Casanovo version %s", str(__version__)) logger.debug("model = %s", model) @@ -382,7 +395,76 @@ def setup_model( return config, model -def _get_model_weights() -> str: +def _get_weights_from_url( + file_url: Optional[str], + cache_dir: Path, +) -> str: + """ + Attempt to download weight file from URL if weights are not already + cached. Otherwise use cased weights. Downloaded weight files will be + cached. + + Parameters + ---------- + file_url : str + url pointing to model weights file + cache_dir : Path + model weights cache directory path + + Returns + ------- + str + path to cached weights file + """ + os.makedirs(cache_dir, exist_ok=True) + url_hash = shake_256(file_url.encode("utf-8")).hexdigest(20) + cache_file_name = url_hash + ".ckpt" + cache_file_path = cache_dir / cache_file_name + + if cache_file_path.is_file(): + logger.info(f"Model weights {file_url} retrieved from local cache") + return str(cache_file_path) + + logger.info(f"Model weights {file_url} not in local cache, downloading") + file_response = requests.get(file_url) + + if not file_response.ok: + logger.error(f"Failed to download weights from {file_url}") + logger.error( + f"Server Response: {file_response.status_code}: {file_response.reason}" + ) + raise ConnectionError(f"Failed to download weights file: {file_url}") + + logger.info("Model weights downloaded, writing to cache") + with open(cache_file_path, "wb") as cache_file: + cache_file.write(file_response.content) + + logger.info("Model weights cached") + return str(cache_file_path) + + +def _is_valid_url(file_url: str) -> bool: + """ + Determine whether file URL is a valid URL + + Parameters + ---------- + file_url : str + url to verify + + Return + ------ + is_url : bool + whether file_url is a valid url + """ + try: + result = urlparse(file_url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + +def _get_model_weights(cache_dir: str) -> str: """ Use cached model weights or download them from GitHub. @@ -396,12 +478,16 @@ def _get_model_weights() -> str: Note that the GitHub API is limited to 60 requests from the same IP per hour. + Parameters + ---------- + cache_dir : str + model weights cache directory path + Returns ------- str The name of the model weights file. """ - cache_dir = appdirs.user_cache_dir("casanovo", False, opinion=False) os.makedirs(cache_dir, exist_ok=True) version = utils.split_version(__version__) version_match: Tuple[Optional[str], Optional[str], int] = None, None, 0 From 9e359bfcd60582b6a89a2d89edb2a065462bee68 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 28 Jun 2024 01:02:30 +0000 Subject: [PATCH 02/14] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 64 ++++++----- docs/images/evaluate-help.svg | 133 +++++++++++++---------- docs/images/help.svg | 157 ++++++++++++++------------- docs/images/sequence-help.svg | 133 +++++++++++++---------- docs/images/train-help.svg | 193 ++++++++++++++++++--------------- 5 files changed, 370 insertions(+), 310 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 0822927a..b1fcce10 100644 --- a/docs/images/configure-help.svg +++ b/docs/images/configure-help.svg @@ -19,57 +19,63 @@ font-weight: 700; } - .terminal-3936755216-matrix { + .terminal-2766440694-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-3936755216-title { + .terminal-2766440694-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-3936755216-r1 { fill: #c5c8c6 } + .terminal-2766440694-r1 { fill: #c5c8c6 } +.terminal-2766440694-r2 { fill: #d0b344 } +.terminal-2766440694-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-2766440694-r4 { fill: #68a0b3;font-weight: bold } +.terminal-2766440694-r5 { fill: #868887 } +.terminal-2766440694-r6 { fill: #98a84b;font-weight: bold } +.terminal-2766440694-r7 { fill: #d0b344;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + @@ -81,21 +87,21 @@ - + - - $ casanovo configure --help - - Usage: casanovo configure [OPTIONS]                                             - - Generate a Casanovo configuration file to customize.                            - The casanovo configuration file is in the YAML format.                          - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --output  -o  FILE  The output configuration file.                           │ -│ --help    -h        Show this message and exit.                              │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo configure --help + +Usage:casanovo configure [OPTIONS]                                             + + Generate a Casanovo configuration file to customize.                            + The casanovo configuration file is in the YAML format.                          + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--output-oFILE  The output configuration file.                            +--help-h  Show this message and exit.                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index b16c4ffd..86f4422e 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - + - + - - $ casanovo evaluate --help - - Usage: casanovo evaluate [OPTIONS] ANNOTATED_PEAK_PATH...                       - - Evaluate de novo peptide sequencing performance.                                - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       - provided by MassIVE-KB.                                                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  ANNOTATED_PEAK_PATH    FILE  [required]                                   │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --model      -m  FILE                        The model weights (.ckpt file). │ -│                                              If not provided, Casanovo will  │ -│                                              try to download the latest      │ -│                                              release.                        │ -│ --output     -o  FILE                        The mzTab file to which results │ -│                                              will be written.                │ -│ --config     -c  FILE                        The YAML configuration file     │ -│                                              overriding the default options. │ -│ --verbosity  -v  [debug|info|warning|error]  Set the verbosity of console    │ -│                                              logging messages. Log files are │ -│                                              always set to 'debug'.          │ -│ --help       -h                              Show this message and exit.     │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo evaluate --help + +Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       + + Evaluate de novo peptide sequencing performance.                                + ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       + provided by MassIVE-KB.                                                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  ANNOTATED_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mTEXT                        Either the model weights (.ckpt  +                                              file) or a URL pointing to the   +                                              model weights file. If not       +                                              provided, Casanovo will try to   +                                              download the latest release.     +--output-oFILE                        The mzTab file to which results  +                                              will be written.                 +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/help.svg b/docs/images/help.svg index 2e22e2d3..eae8e93d 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -19,126 +19,133 @@ font-weight: 700; } - .terminal-100512290-matrix { + .terminal-771077830-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-100512290-title { + .terminal-771077830-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-100512290-r1 { fill: #c5c8c6 } + .terminal-771077830-r1 { fill: #c5c8c6 } +.terminal-771077830-r2 { fill: #d0b344 } +.terminal-771077830-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-771077830-r4 { fill: #68a0b3;font-weight: bold } +.terminal-771077830-r5 { fill: #d0b344;font-weight: bold } +.terminal-771077830-r6 { fill: #608ab1;text-decoration: underline; } +.terminal-771077830-r7 { fill: #868887 } +.terminal-771077830-r8 { fill: #98a84b;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -150,44 +157,44 @@ - + - - $ casanovo --help - - Usage: casanovo [OPTIONS] COMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - -  • Documentation: https://casanovo.readthedocs.io                               -  • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - -  • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -    mass spectrometry peptide sequencing with a transformer model. Proceedings   -    of the 39th International Conference on Machine Learning - ICML '22 (2022)   -    doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help  -h    Show this message and exit.                                    │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -│ configure  Generate a Casanovo configuration file to customize.              │ -│ evaluate   Evaluate de novo peptide sequencing performance.                  │ -│ sequence   De novo sequence peptides from tandem mass spectra.               │ -│ train      Train a Casanovo model on your own data.                          │ -│ version    Get the Casanovo version information                              │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io + • Official code repository: https://github.com/Noble-Lab/casanovo + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +evaluate  Evaluate de novo peptide sequencing performance.                   +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           +version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index f5799766..976246e3 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + - + - + - - $ casanovo sequence --help - - Usage: casanovo sequence [OPTIONS] PEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  - peptides.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  PEAK_PATH    FILE  [required]                                             │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --model      -m  FILE                        The model weights (.ckpt file). │ -│                                              If not provided, Casanovo will  │ -│                                              try to download the latest      │ -│                                              release.                        │ -│ --output     -o  FILE                        The mzTab file to which results │ -│                                              will be written.                │ -│ --config     -c  FILE                        The YAML configuration file     │ -│                                              overriding the default options. │ -│ --verbosity  -v  [debug|info|warning|error]  Set the verbosity of console    │ -│                                              logging messages. Log files are │ -│                                              always set to 'debug'.          │ -│ --help       -h                              Show this message and exit.     │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  + peptides.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mTEXT                        Either the model weights (.ckpt  +                                              file) or a URL pointing to the   +                                              model weights file. If not       +                                              provided, Casanovo will try to   +                                              download the latest release.     +--output-oFILE                        The mzTab file to which results  +                                              will be written.                 +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index fccd4140..f6ce0933 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + - + - + - - $ casanovo train --help - - Usage: casanovo train [OPTIONS] TRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  TRAIN_PEAK_PATH    FILE  [required]                                       │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ *  --validation_peak_pa…  -p  FILE                    An annotated MGF file  │ -│                                                       for validation, like   │ -│                                                       from MassIVE-KB. Use   │ -│                                                       this option multiple   │ -│                                                       times to specify       │ -│                                                       multiple files.        │ -│                                                       [required]             │ -│    --model                -m  FILE                    The model weights      │ -│                                                       (.ckpt file). If not   │ -│                                                       provided, Casanovo     │ -│                                                       will try to download   │ -│                                                       the latest release.    │ -│    --output               -o  FILE                    The mzTab file to      │ -│                                                       which results will be  │ -│                                                       written.               │ -│    --config               -c  FILE                    The YAML configuration │ -│                                                       file overriding the    │ -│                                                       default options.       │ -│    --verbosity            -v  [debug|info|warning|er  Set the verbosity of   │ -│                               ror]                    console logging        │ -│                                                       messages. Log files    │ -│                                                       are always set to      │ -│                                                       'debug'.               │ -│    --help                 -h                          Show this message and  │ -│                                                       exit.                  │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mTEXT                    Either the model        +                                                       weights (.ckpt file)    +                                                       or a URL pointing to    +                                                       the model weights       +                                                       file. If not provided,  +                                                       Casanovo will try to    +                                                       download the latest     +                                                       release.                +--output-oFILE                    The mzTab file to       +                                                       which results will be   +                                                       written.                +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + From 1341ddc3976971531f366bf36d0ef9d827fdbbc7 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 2 Jul 2024 14:04:42 -0700 Subject: [PATCH 03/14] reduced size of cached URL weight file names --- casanovo/casanovo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 54d6d649..3ec7784c 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -417,7 +417,7 @@ def _get_weights_from_url( path to cached weights file """ os.makedirs(cache_dir, exist_ok=True) - url_hash = shake_256(file_url.encode("utf-8")).hexdigest(20) + url_hash = shake_256(file_url.encode("utf-8")).hexdigest(10) cache_file_name = url_hash + ".ckpt" cache_file_path = cache_dir / cache_file_name From 610841cf8b8405a67adef4381cfe5e11b02ea681 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 9 Jul 2024 15:34:41 -0700 Subject: [PATCH 04/14] implemented hash cache resolution --- casanovo/casanovo.py | 270 +++++++++++++++++++++++-------------------- 1 file changed, 143 insertions(+), 127 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 3ec7784c..694d8877 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -1,12 +1,17 @@ """The command line entry point for Casanovo.""" import datetime +import email +import email.utils import functools +import hashlib import logging +import urllib import os import re import shutil import sys +import urllib.parse import warnings from pathlib import Path from typing import Optional, Tuple @@ -37,8 +42,6 @@ import torch import tqdm from lightning.pytorch import seed_everything -from hashlib import shake_256 -from urllib.parse import urlparse from . import __version__ from . import utils @@ -62,7 +65,7 @@ def __init__(self, *args, **kwargs) -> None: ("-m", "--model"), help=""" Either the model weights (.ckpt file) or a URL pointing to the model weights - file. If not provided, Casanovo will try to download the latest release. + file. If not provided, Casanovo will try to download the latest release automatically. """, ), click.Option( @@ -355,34 +358,32 @@ def setup_model( seed_everything(seed=config["random_seed"], workers=True) # Download model weights if these were not specified (except when training). - cache_dir = appdirs.user_cache_dir("casanovo", False, opinion=False) - if model is None and not is_train: - try: - model = _get_model_weights(cache_dir) - except github.RateLimitExceededException: - logger.error( - "GitHub API rate limit exceeded while trying to download the " - "model weights. Please download compatible model weights " - "manually from the official Casanovo code website " - "(https://github.com/Noble-Lab/casanovo) and specify these " - "explicitly using the `--model` parameter when running " - "Casanovo." + cache_dir = Path(appdirs.user_cache_dir("casanovo", False, opinion=False)) + if model is None: + if not is_train: + try: + model = _get_model_weights(cache_dir) + except github.RateLimitExceededException: + logger.error( + "GitHub API rate limit exceeded while trying to download the " + "model weights. Please download compatible model weights " + "manually from the official Casanovo code website " + "(https://github.com/Noble-Lab/casanovo) and specify these " + "explicitly using the `--model` parameter when running " + "Casanovo." + ) + raise PermissionError( + "GitHub API rate limit exceeded while trying to download the " + "model weights" + ) from None + else: + if _is_valid_url(model): + model = _get_weights_from_url(model, cache_dir) + elif not Path(model).is_file(): + raise ValueError( + f"{model} is not a valid URL or checkpoint file path, " + "--model argument must be a URL or checkpoint file path" ) - raise PermissionError( - "GitHub API rate limit exceeded while trying to download the " - "model weights" - ) from None - - # Download model from URL if model is a valid url - is_url = _is_valid_url(model) - if (model is not None) and is_url: - model = _get_weights_from_url(model, Path(cache_dir)) - - if (model is not None) and (not is_url) and (not Path(model).is_file()): - raise ValueError( - f"{model} is not a valid URL or checkpoint file path, " - "--model argument must be a URL or checkpoint file path" - ) # Log the active configuration. logger.info("Casanovo version %s", str(__version__)) @@ -395,76 +396,7 @@ def setup_model( return config, model -def _get_weights_from_url( - file_url: Optional[str], - cache_dir: Path, -) -> str: - """ - Attempt to download weight file from URL if weights are not already - cached. Otherwise use cased weights. Downloaded weight files will be - cached. - - Parameters - ---------- - file_url : str - url pointing to model weights file - cache_dir : Path - model weights cache directory path - - Returns - ------- - str - path to cached weights file - """ - os.makedirs(cache_dir, exist_ok=True) - url_hash = shake_256(file_url.encode("utf-8")).hexdigest(10) - cache_file_name = url_hash + ".ckpt" - cache_file_path = cache_dir / cache_file_name - - if cache_file_path.is_file(): - logger.info(f"Model weights {file_url} retrieved from local cache") - return str(cache_file_path) - - logger.info(f"Model weights {file_url} not in local cache, downloading") - file_response = requests.get(file_url) - - if not file_response.ok: - logger.error(f"Failed to download weights from {file_url}") - logger.error( - f"Server Response: {file_response.status_code}: {file_response.reason}" - ) - raise ConnectionError(f"Failed to download weights file: {file_url}") - - logger.info("Model weights downloaded, writing to cache") - with open(cache_file_path, "wb") as cache_file: - cache_file.write(file_response.content) - - logger.info("Model weights cached") - return str(cache_file_path) - - -def _is_valid_url(file_url: str) -> bool: - """ - Determine whether file URL is a valid URL - - Parameters - ---------- - file_url : str - url to verify - - Return - ------ - is_url : bool - whether file_url is a valid url - """ - try: - result = urlparse(file_url) - return all([result.scheme, result.netloc]) - except ValueError: - return False - - -def _get_model_weights(cache_dir: str) -> str: +def _get_model_weights(cache_dir: Path) -> str: """ Use cached model weights or download them from GitHub. @@ -480,7 +412,7 @@ def _get_model_weights(cache_dir: str) -> str: Parameters ---------- - cache_dir : str + cache_dir : Path model weights cache directory path Returns @@ -492,19 +424,30 @@ def _get_model_weights(cache_dir: str) -> str: version = utils.split_version(__version__) version_match: Tuple[Optional[str], Optional[str], int] = None, None, 0 # Try to find suitable model weights in the local cache. - for filename in os.listdir(cache_dir): - root, ext = os.path.splitext(filename) - if ext == ".ckpt": - file_version = tuple( - g for g in re.match(r".*_v(\d+)_(\d+)_(\d+)", root).groups() - ) - match = ( - sum(m) - if (m := [i == j for i, j in zip(version, file_version)])[0] - else 0 - ) - if match > version_match[2]: - version_match = os.path.join(cache_dir, filename), None, match + for curr_subdir in cache_dir.iterdir(): + if not curr_subdir.is_dir(): + continue + + for filename in curr_subdir.iterdir(): + root, ext = os.path.splitext(filename) + if ext == ".ckpt": + file_version = tuple( + g + for g in re.match(r".*_v(\d+)_(\d+)_(\d+)", root).groups() + ) + match = ( + sum(m) + if (m := [i == j for i, j in zip(version, file_version)])[ + 0 + ] + else 0 + ) + if match > version_match[2]: + version_match = ( + os.path.join(cache_dir, filename), + None, + match, + ) # Provide the cached model weights if found. if version_match[2] > 0: logger.info( @@ -544,19 +487,7 @@ def _get_model_weights(cache_dir: str) -> str: # Download the model weights if a matching release was found. if version_match[2] > 0: filename, url, _ = version_match - logger.info( - "Downloading model weights file %s from %s", filename, url - ) - r = requests.get(url, stream=True, allow_redirects=True) - r.raise_for_status() - file_size = int(r.headers.get("Content-Length", 0)) - desc = "(Unknown total file size)" if file_size == 0 else "" - r.raw.read = functools.partial(r.raw.read, decode_content=True) - with tqdm.tqdm.wrapattr( - r.raw, "read", total=file_size, desc=desc - ) as r_raw, open(filename, "wb") as f: - shutil.copyfileobj(r_raw, f) - return filename + return _get_weights_from_url(url, cache_dir) else: logger.error( "No matching model weights for release v%s found, please " @@ -571,5 +502,90 @@ def _get_model_weights(cache_dir: str) -> str: ) +def _get_weights_from_url( + file_url: str, + cache_dir: Path, + force_download: Optional[bool] = False, +) -> str: + """ + Resolve weight file from URL + + Attempt to download weight file from URL if weights are not already + cached - otherwise use cached weights. Downloaded weight files will be + cached. + + Parameters + ---------- + file_url : str + url pointing to model weights file + cache_dir : Path + model weights cache directory path + + Returns + ------- + str + path to cached weights file + """ + print("RESOLVING URL") + os.makedirs(cache_dir, exist_ok=True) + url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) + cache_file_name = Path(urllib.parse.urlparse(file_url).path).name + cache_file_dir = cache_dir / url_hash + cache_file_path = cache_file_dir / cache_file_name + + if cache_file_path.is_file() and not force_download: + cache_time = cache_file_path.stat() + file_response = requests.head(file_url) + url_last_modified = 0 + + if "Last-Modified" in file_response.headers: + url_last_modified = email.utils.parsedate_to_datetime( + file_response.headers["Last-Modified"] + ).timestamp() + + if cache_time.st_mtime > url_last_modified: + logger.info( + "Model weights %s retrieved from local cache", file_url + ) + return str(cache_file_path) + + os.makedirs(cache_file_dir, exist_ok=True) + response = requests.get(file_url, stream=True, allow_redirects=True) + response.raise_for_status() + file_size = int(response.headers.get("Content-Length", 0)) + desc = "(Unknown total file size)" if file_size == 0 else "" + response.raw.read = functools.partial( + response.raw.read, decode_content=True + ) + + with tqdm.tqdm.wrapattr( + response.raw, "read", total=file_size, desc=desc + ) as r_raw, open(cache_file_path, "wb") as file: + shutil.copyfileobj(r_raw, file) + + return cache_file_path + + +def _is_valid_url(file_url: str) -> bool: + """ + Determine whether file URL is a valid URL + + Parameters + ---------- + file_url : str + url to verify + + Return + ------ + is_url : bool + whether file_url is a valid url + """ + try: + result = urllib.parse.urlparse(file_url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + if __name__ == "__main__": main() From 2e3b75631c0571a0ef371e385e1c9d65c1587531 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 9 Jul 2024 16:57:10 -0700 Subject: [PATCH 05/14] preliminary get file from url test --- casanovo/casanovo.py | 31 +++++++++++++++++-------- tests/unit_tests/test_unit.py | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 694d8877..a11cf32c 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -431,10 +431,11 @@ def _get_model_weights(cache_dir: Path) -> str: for filename in curr_subdir.iterdir(): root, ext = os.path.splitext(filename) if ext == ".ckpt": - file_version = tuple( - g - for g in re.match(r".*_v(\d+)_(\d+)_(\d+)", root).groups() - ) + file_version_match = re.match(r".*_v(\d+)_(\d+)_(\d+)", root) + if file_version_match is None: + continue + + file_version = file_version_match.groups() match = ( sum(m) if (m := [i == j for i, j in zip(version, file_version)])[ @@ -487,7 +488,9 @@ def _get_model_weights(cache_dir: Path) -> str: # Download the model weights if a matching release was found. if version_match[2] > 0: filename, url, _ = version_match - return _get_weights_from_url(url, cache_dir) + return _get_weights_from_url( + url, cache_dir, cache_file_name=Path(filename).name + ) else: logger.error( "No matching model weights for release v%s found, please " @@ -506,6 +509,7 @@ def _get_weights_from_url( file_url: str, cache_dir: Path, force_download: Optional[bool] = False, + cache_file_name: Optional[str] = None, ) -> str: """ Resolve weight file from URL @@ -517,19 +521,26 @@ def _get_weights_from_url( Parameters ---------- file_url : str - url pointing to model weights file + URL pointing to model weights file. cache_dir : Path - model weights cache directory path + Model weights cache directory path. + force_download : Optional[bool], default=False + If True, forces a new download of the weight file even if it exists in + the cache. + cache_file_name : Optional[str], default=None + Custom name for the cached weight file. If None, the name is derived + from the URL. Returns ------- str - path to cached weights file + Path to the cached weights file. """ - print("RESOLVING URL") os.makedirs(cache_dir, exist_ok=True) url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) - cache_file_name = Path(urllib.parse.urlparse(file_url).path).name + if cache_file_name is None: + cache_file_name = Path(urllib.parse.urlparse(file_url).path).name + cache_file_dir = cache_dir / url_hash cache_file_path = cache_file_dir / cache_file_name diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index f615a099..df6c8366 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -4,6 +4,8 @@ import platform import shutil import tempfile +import unittest +import unittest.mock import einops import github @@ -111,6 +113,47 @@ def request(self, *args, **kwargs): casanovo._get_model_weights() +def test_get_weights_from_url(): + file_url = "http://example.com/model_weights.ckpt" + file_content = b"fake model weights content" + + def mock_requests_get(url, stream=True, allow_redirects=True): + response = unittest.mock.MagicMock() + response.raise_for_status = unittest.mock.MagicMock() + response.headers = {"Content-Length": str(len(file_content))} + response.raw = unittest.mock.MagicMock() + response.raw.read = unittest.mock.MagicMock(return_value=file_content) + return response + + def mock_requests_head(url): + response = unittest.mock.MagicMock() + response.headers = {} + return response + + with tempfile.TemporaryDirectory() as tmp_dir: + cache_dir = Path(tmp_dir) + url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) + cache_file_name = "model_weights.ckpt" + cache_file_dir = cache_dir / url_hash + cache_file_path = cache_file_dir / cache_file_name + + # Test downloading and caching the file + assert not cache_file_path.is_file() + result_path = _get_weights_from_url(file_url, cache_dir) + assert cache_file_path.is_file() + assert result_path == str(cache_file_path) + + # Test using the cached file + result_path_cached = _get_weights_from_url(file_url, cache_dir) + assert result_path_cached == str(cache_file_path) + + # Test force downloading the file + result_path_forced = _get_weights_from_url( + file_url, cache_dir, force_download=True + ) + assert result_path_forced == str(cache_file_path) + + def test_tensorboard(): """ Test that the tensorboard.SummaryWriter object is only created when a folder From 7e7f64de2d91b507a66f9784c3db5d7905668c82 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 10 Jul 2024 15:29:23 -0700 Subject: [PATCH 06/14] hash resolition via dirname --- casanovo/casanovo.py | 113 +++++++++++++++------------ tests/unit_tests/test_unit.py | 143 +++++++++++++++++++++++++++------- 2 files changed, 179 insertions(+), 77 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index a11cf32c..3a2886a8 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -1,7 +1,6 @@ """The command line entry point for Casanovo.""" import datetime -import email import email.utils import functools import hashlib @@ -424,38 +423,26 @@ def _get_model_weights(cache_dir: Path) -> str: version = utils.split_version(__version__) version_match: Tuple[Optional[str], Optional[str], int] = None, None, 0 # Try to find suitable model weights in the local cache. - for curr_subdir in cache_dir.iterdir(): - if not curr_subdir.is_dir(): - continue - - for filename in curr_subdir.iterdir(): - root, ext = os.path.splitext(filename) - if ext == ".ckpt": - file_version_match = re.match(r".*_v(\d+)_(\d+)_(\d+)", root) - if file_version_match is None: - continue - - file_version = file_version_match.groups() - match = ( - sum(m) - if (m := [i == j for i, j in zip(version, file_version)])[ - 0 - ] - else 0 - ) - if match > version_match[2]: - version_match = ( - os.path.join(cache_dir, filename), - None, - match, - ) + for filename in os.listdir(cache_dir): + root, ext = os.path.splitext(filename) + if ext == ".ckpt": + file_version = tuple( + g for g in re.match(r".*_v(\d+)_(\d+)_(\d+)", root).groups() + ) + match = ( + sum(m) + if (m := [i == j for i, j in zip(version, file_version)])[0] + else 0 + ) + if match > version_match[2]: + version_match = os.path.join(cache_dir, filename), None, match # Provide the cached model weights if found. if version_match[2] > 0: logger.info( "Model weights file %s retrieved from local cache", version_match[0], ) - return version_match[0] + return Path(version_match[0]) # Otherwise try to find compatible model weights on GitHub. else: repo = github.Github().get_repo("Noble-Lab/casanovo") @@ -488,9 +475,9 @@ def _get_model_weights(cache_dir: Path) -> str: # Download the model weights if a matching release was found. if version_match[2] > 0: filename, url, _ = version_match - return _get_weights_from_url( - url, cache_dir, cache_file_name=Path(filename).name - ) + cache_file_path = cache_dir / filename + _download_weights(url, cache_file_path) + return cache_file_path else: logger.error( "No matching model weights for release v%s found, please " @@ -509,8 +496,7 @@ def _get_weights_from_url( file_url: str, cache_dir: Path, force_download: Optional[bool] = False, - cache_file_name: Optional[str] = None, -) -> str: +) -> Path: """ Resolve weight file from URL @@ -527,40 +513,71 @@ def _get_weights_from_url( force_download : Optional[bool], default=False If True, forces a new download of the weight file even if it exists in the cache. - cache_file_name : Optional[str], default=None - Custom name for the cached weight file. If None, the name is derived - from the URL. Returns ------- - str + Path Path to the cached weights file. """ os.makedirs(cache_dir, exist_ok=True) + cache_file_name = Path(urllib.parse.urlparse(file_url).path).name url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) - if cache_file_name is None: - cache_file_name = Path(urllib.parse.urlparse(file_url).path).name - cache_file_dir = cache_dir / url_hash cache_file_path = cache_file_dir / cache_file_name if cache_file_path.is_file() and not force_download: cache_time = cache_file_path.stat() - file_response = requests.head(file_url) url_last_modified = 0 - if "Last-Modified" in file_response.headers: - url_last_modified = email.utils.parsedate_to_datetime( - file_response.headers["Last-Modified"] - ).timestamp() + try: + file_response = requests.head(file_url) + if file_response.ok: + if "Last-Modified" in file_response.headers: + url_last_modified = email.utils.parsedate_to_datetime( + file_response.headers["Last-Modified"] + ).timestamp() + else: + logger.warning( + "Attempted HEAD request to %s yielded non-ok status code - using cached file", + file_url, + ) + except ( + requests.ConnectionError, + requests.Timeout, + requests.TooManyRedirects, + ): + logger.warning( + "Failed to reach %s to get remote last modified time - using cached file", + file_url, + ) if cache_time.st_mtime > url_last_modified: logger.info( "Model weights %s retrieved from local cache", file_url ) - return str(cache_file_path) + return cache_file_path - os.makedirs(cache_file_dir, exist_ok=True) + _download_weights(file_url, cache_file_path) + return cache_file_path + + +def _download_weights(file_url: str, download_path: Path) -> None: + """ + Download weights file from URL + + Download the model weights file from the specified URL and save it to the + given path. Ensures the download directory exists, and uses a progress + bar to indicate download status. + + Parameters + ---------- + file_url : str + URL pointing to the model weights file. + download_path : Path + Path where the downloaded weights file will be saved. + """ + download_file_dir = download_path.parent + os.makedirs(download_file_dir, exist_ok=True) response = requests.get(file_url, stream=True, allow_redirects=True) response.raise_for_status() file_size = int(response.headers.get("Content-Length", 0)) @@ -571,11 +588,9 @@ def _get_weights_from_url( with tqdm.tqdm.wrapattr( response.raw, "read", total=file_size, desc=desc - ) as r_raw, open(cache_file_path, "wb") as file: + ) as r_raw, open(download_path, "wb") as file: shutil.copyfileobj(r_raw, file) - return cache_file_path - def _is_valid_url(file_url: str) -> bool: """ diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index df6c8366..79256cce 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1,7 +1,13 @@ import collections +import datetime +import email.utils +import hashlib import heapq +import io import os +import pathlib import platform +import requests import shutil import tempfile import unittest @@ -88,17 +94,19 @@ def test_get_model_weights(monkeypatch): filename = os.path.join(tmp_dir, "casanovo_massivekb_v3_0_0.ckpt") assert not os.path.isfile(filename) - assert casanovo._get_model_weights() == filename + result_path = casanovo._get_model_weights(pathlib.Path(tmp_dir)) + assert str(result_path.resolve()) == filename assert os.path.isfile(filename) - assert casanovo._get_model_weights() == filename + result_path = casanovo._get_model_weights(pathlib.Path(tmp_dir)) + assert str(result_path.resolve()) == filename # Impossible to find model weights for (i) full version mismatch and (ii) # major version mismatch. for version in ["999.999.999", "999.0.0"]: - with monkeypatch.context() as mnk: + with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: mnk.setattr(casanovo, "__version__", version) with pytest.raises(ValueError): - casanovo._get_model_weights() + casanovo._get_model_weights(pathlib.Path(tmp_dir)) # Test GitHub API rate limit. def request(self, *args, **kwargs): @@ -110,28 +118,59 @@ def request(self, *args, **kwargs): mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) mnk.setattr("github.Requester.Requester.requestJsonAndCheck", request) with pytest.raises(github.RateLimitExceededException): - casanovo._get_model_weights() + casanovo._get_model_weights(pathlib.Path(tmp_dir)) -def test_get_weights_from_url(): +def test_get_weights_from_url(monkeypatch): file_url = "http://example.com/model_weights.ckpt" file_content = b"fake model weights content" - def mock_requests_get(url, stream=True, allow_redirects=True): - response = unittest.mock.MagicMock() - response.raise_for_status = unittest.mock.MagicMock() - response.headers = {"Content-Length": str(len(file_content))} - response.raw = unittest.mock.MagicMock() - response.raw.read = unittest.mock.MagicMock(return_value=file_content) - return response - - def mock_requests_head(url): - response = unittest.mock.MagicMock() - response.headers = {} - return response - - with tempfile.TemporaryDirectory() as tmp_dir: - cache_dir = Path(tmp_dir) + class MockResponseGet: + class MockRaw(io.BytesIO): + def read(self, *args, **kwargs): + return super().read(*args) + + def __init__(self): + self.request_counter = 0 + self.is_ok = True + + def raise_for_status(self): + if not self.is_ok: + raise requests.HTTPError + + def __call__(self, url, stream=True, allow_redirects=True): + self.request_counter += 1 + response = unittest.mock.MagicMock() + response.raise_for_status = self.raise_for_status + response.headers = {"Content-Length": str(len(file_content))} + response.raw = MockResponseGet.MockRaw(file_content) + return response + + class MockResponseHead: + def __init__(self): + self.last_modified = None + self.is_ok = True + self.fail = False + + def __call__(self, url): + if self.fail: + raise requests.ConnectionError + + response = unittest.mock.MagicMock() + response.headers = dict() + response.ok = self.is_ok + if self.last_modified is not None: + response.headers["Last-Modified"] = self.last_modified + + return response + + with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: + mock_get = MockResponseGet() + mock_head = MockResponseHead() + mnk.setattr(requests, "get", mock_get) + mnk.setattr(requests, "head", mock_head) + + cache_dir = pathlib.Path(tmp_dir) url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) cache_file_name = "model_weights.ckpt" cache_file_dir = cache_dir / url_hash @@ -139,19 +178,67 @@ def mock_requests_head(url): # Test downloading and caching the file assert not cache_file_path.is_file() - result_path = _get_weights_from_url(file_url, cache_dir) + result_path = casanovo._get_weights_from_url(file_url, cache_dir) assert cache_file_path.is_file() - assert result_path == str(cache_file_path) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 1 - # Test using the cached file - result_path_cached = _get_weights_from_url(file_url, cache_dir) - assert result_path_cached == str(cache_file_path) + # Test that cached file is used + result_path = casanovo._get_weights_from_url(file_url, cache_dir) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 1 # Test force downloading the file - result_path_forced = _get_weights_from_url( + result_path = casanovo._get_weights_from_url( file_url, cache_dir, force_download=True ) - assert result_path_forced == str(cache_file_path) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 2 + + # Test that file is re-downloaded if last modified is newer than + # file last modified + # NOTE: Assuming test takes < 1 year to run + mock_head.last_modified = email.utils.format_datetime( + datetime.datetime.now() + datetime.timedelta(days=365.0) + ) + result_path = casanovo._get_weights_from_url(file_url, cache_dir) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 3 + + # Test file is not redownloaded if its newer than upstream file + mock_head.last_modified = email.utils.format_datetime( + datetime.datetime.now() - datetime.timedelta(days=365.0) + ) + result_path = casanovo._get_weights_from_url(file_url, cache_dir) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 3 + + # Test that error is raised if file get response is not OK + mock_get.is_ok = False + with pytest.raises(requests.HTTPError): + casanovo._get_weights_from_url( + file_url, cache_dir, force_download=True + ) + mock_get.is_ok = True + assert mock_get.request_counter == 4 + + # Test that cached file is used if head requests yields non-ok status + # code, even if upstream file is newer + mock_head.is_ok = False + mock_head.last_modified = email.utils.format_datetime( + datetime.datetime.now() + datetime.timedelta(days=365.0) + ) + result_path = casanovo._get_weights_from_url(file_url, cache_dir) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 4 + mock_head.is_ok = True + + # Test that cached file is used if head request fails + mock_head.fail = True + result_path = casanovo._get_weights_from_url(file_url, cache_dir) + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 4 + mock_head.fail = False def test_tensorboard(): From 0acf8d46c33cb04dee8dc4272e84a78629fa8555 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 10 Jul 2024 22:34:43 +0000 Subject: [PATCH 07/14] Generate new screengrabs with rich-codex --- docs/images/evaluate-help.svg | 146 ++++++++++++------------ docs/images/sequence-help.svg | 146 ++++++++++++------------ docs/images/train-help.svg | 202 ++++++++++++++++++---------------- 3 files changed, 261 insertions(+), 233 deletions(-) diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index 2f770e2e..bd8b258f 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + - + - + - - $ casanovo evaluate --help - -Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       - - Evaluate de novo peptide sequencing performance.                                - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       - provided by MassIVE-KB.                                                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  ANNOTATED_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mFILE                        The model weights (.ckpt file).  -                                              If not provided, Casanovo will   -                                              try to download the latest       -                                              release.                         ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo evaluate --help + +Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       + + Evaluate de novo peptide sequencing performance.                                + ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       + provided by MassIVE-KB.                                                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  ANNOTATED_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mTEXT                        Either the model weights (.ckpt  +                                              file) or a URL pointing to the   +                                              model weights file. If not       +                                              provided, Casanovo will try to   +                                              download the latest release      +                                              automatically.                   +--output-oFILE                        The mzTab file to which results  +                                              will be written.                 +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index 6635cfaa..5e75dfe4 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + - + - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  - peptides.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mFILE                        The model weights (.ckpt file).  -                                              If not provided, Casanovo will   -                                              try to download the latest       -                                              release.                         ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  + peptides.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mTEXT                        Either the model weights (.ckpt  +                                              file) or a URL pointing to the   +                                              model weights file. If not       +                                              provided, Casanovo will try to   +                                              download the latest release      +                                              automatically.                   +--output-oFILE                        The mzTab file to which results  +                                              will be written.                 +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index 58251215..e27717e1 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + - + - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -*--validation_peak_pa…-pFILE                    An annotated MGF file   -                                                       for validation, like    -                                                       from MassIVE-KB. Use    -                                                       this option multiple    -                                                       times to specify        -                                                       multiple files.         -[required]             ---model-mFILE                    The model weights       -                                                       (.ckpt file). If not    -                                                       provided, Casanovo      -                                                       will try to download    -                                                       the latest release.     ---output-oFILE                    The mzTab file to       -                                                       which results will be   -                                                       written.                ---config-cFILE                    The YAML configuration  -                                                       file overriding the     -                                                       default options.        ---verbosity-v[debug|info|warning|er  Set the verbosity of    -ror]  console logging         -                                                       messages. Log files     -                                                       are always set to       -                                                       'debug'.                ---help-h  Show this message and   -                                                       exit.                   -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mTEXT                    Either the model        +                                                       weights (.ckpt file)    +                                                       or a URL pointing to    +                                                       the model weights       +                                                       file. If not provided,  +                                                       Casanovo will try to    +                                                       download the latest     +                                                       release automatically.  +--output-oFILE                    The mzTab file to       +                                                       which results will be   +                                                       written.                +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + From 72cc151215d419c977086e4ca70f3318b88fb3f0 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 11 Jul 2024 09:49:55 -0700 Subject: [PATCH 08/14] Download weights documentation --- docs/file_formats.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/file_formats.md b/docs/file_formats.md index 7cde8c7a..448f290d 100644 --- a/docs/file_formats.md +++ b/docs/file_formats.md @@ -2,6 +2,8 @@ ## Input file formats for Casanovo +### MS/MS Spectra + When you're ready to use Casanovo for *de novo* peptide sequencing, you can input your MS/MS spectra in one of the following formats: - **[mzML](https://doi.org/10.1074/mcp.R110.000133)**: XML-based mass spectrometry community standard file format developed by the Proteomics Standards Initiative (PSI). @@ -11,6 +13,14 @@ When you're ready to use Casanovo for *de novo* peptide sequencing, you can inpu All three of the above file formats can be used as input to Casanovo for *de novo* peptide sequencing. As the official PSI standard format containing the complete information from a mass spectrometry run, mzML should typically be preferred. +### Model Weights + +In addition to MS/MS spectra, Casanova also optionally accepts a model weights (.ckpt) input file when running in training, sequencing or evaluating mode. +If no input weight file is provided, Casanovo will automatically use the most recent compatible weights from the official Casanovo github repo, which will be downloaded and cached if they are not already. +Alternatively, you can input custom model weights in the form of a local file system path or a URL pointing to a compatible Casanovo model weights file. +If a URL is provided the upstream weights file will be downloaded and cached for later use. +See the [command line interface documentation](cli.rst) for more details. + ## Output: Understanding the mzTab format After Casanovo processes your input file(s), it provides the sequencing results in an **[mzTab]((https://doi.org/10.1074/mcp.O113.036681))** file. From 49f78dc3f21b72ddc8c3d25d07639016bcfb6203 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 26 Jul 2024 13:29:32 -0700 Subject: [PATCH 09/14] bad URL testcase - use Datetime library for processing request headers --- casanovo/casanovo.py | 10 ++++++---- tests/unit_tests/test_unit.py | 25 +++++++++++++++---------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index df72216c..e7b5c29f 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -1,11 +1,9 @@ """The command line entry point for Casanovo.""" import datetime -import email.utils import functools import hashlib import logging -import urllib import os import re import shutil @@ -528,6 +526,9 @@ def _get_weights_from_url( Path Path to the cached weights file. """ + if not _is_valid_url(file_url): + raise ValueError("file_url must point to a valid URL") + os.makedirs(cache_dir, exist_ok=True) cache_file_name = Path(urllib.parse.urlparse(file_url).path).name url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) @@ -542,8 +543,9 @@ def _get_weights_from_url( file_response = requests.head(file_url) if file_response.ok: if "Last-Modified" in file_response.headers: - url_last_modified = email.utils.parsedate_to_datetime( - file_response.headers["Last-Modified"] + url_last_modified = datetime.datetime.strptime( + file_response.headers["Last-Modified"], + "%a, %d %b %Y %H:%M:%S %Z", ).timestamp() else: logger.warning( diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 79256cce..08947b9c 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1,6 +1,5 @@ import collections import datetime -import email.utils import hashlib import heapq import io @@ -198,17 +197,18 @@ def __call__(self, url): # Test that file is re-downloaded if last modified is newer than # file last modified # NOTE: Assuming test takes < 1 year to run - mock_head.last_modified = email.utils.format_datetime( - datetime.datetime.now() + datetime.timedelta(days=365.0) - ) + curr_utc = datetime.datetime.now().astimezone(datetime.timezone.utc) + mock_head.last_modified = ( + curr_utc + datetime.timedelta(days=365.0) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") result_path = casanovo._get_weights_from_url(file_url, cache_dir) assert result_path.resolve() == cache_file_path.resolve() assert mock_get.request_counter == 3 # Test file is not redownloaded if its newer than upstream file - mock_head.last_modified = email.utils.format_datetime( - datetime.datetime.now() - datetime.timedelta(days=365.0) - ) + mock_head.last_modified = ( + curr_utc - datetime.timedelta(days=365.0) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") result_path = casanovo._get_weights_from_url(file_url, cache_dir) assert result_path.resolve() == cache_file_path.resolve() assert mock_get.request_counter == 3 @@ -225,9 +225,9 @@ def __call__(self, url): # Test that cached file is used if head requests yields non-ok status # code, even if upstream file is newer mock_head.is_ok = False - mock_head.last_modified = email.utils.format_datetime( - datetime.datetime.now() + datetime.timedelta(days=365.0) - ) + mock_head.last_modified = ( + curr_utc + datetime.timedelta(days=365.0) + ).strftime("%a, %d %b %Y %H:%M:%S GMT") result_path = casanovo._get_weights_from_url(file_url, cache_dir) assert result_path.resolve() == cache_file_path.resolve() assert mock_get.request_counter == 4 @@ -240,6 +240,11 @@ def __call__(self, url): assert mock_get.request_counter == 4 mock_head.fail = False + # Test invalid URL + with pytest.raises(ValueError): + bad_url = "foobar" + casanovo._get_weights_from_url(bad_url, cache_dir) + def test_tensorboard(): """ From 794bdeb631542b6de4e2ff5b2eb12da1aee31aab Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 29 Jul 2024 13:42:06 -0700 Subject: [PATCH 10/14] unit tests for setup_model, Github api mocking --- casanovo/casanovo.py | 4 +- tests/unit_tests/test_unit.py | 246 ++++++++++++++++++++++++++++------ 2 files changed, 207 insertions(+), 43 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index e7b5c29f..6ebd8c42 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -386,10 +386,12 @@ def setup_model( if _is_valid_url(model): model = _get_weights_from_url(model, cache_dir) elif not Path(model).is_file(): - raise ValueError( + error_msg = ( f"{model} is not a valid URL or checkpoint file path, " "--model argument must be a URL or checkpoint file path" ) + logger.error(error_msg) + raise ValueError(error_msg) # Log the active configuration. logger.info("Casanovo version %s", str(__version__)) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 08947b9c..fff5feab 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -1,5 +1,6 @@ import collections import datetime +import functools import hashlib import heapq import io @@ -76,7 +77,172 @@ def test_split_version(): assert version == ("3", "0", "1") -@pytest.mark.skip(reason="Hit rate limit during CI/CD") +class MockResponseGet: + file_content = b"fake model weights content" + + class MockRaw(io.BytesIO): + def read(self, *args, **kwargs): + return super().read(*args) + + def __init__(self): + self.request_counter = 0 + self.is_ok = True + + def raise_for_status(self): + if not self.is_ok: + raise requests.HTTPError + + def __call__(self, url, stream=True, allow_redirects=True): + self.request_counter += 1 + response = unittest.mock.MagicMock() + response.raise_for_status = self.raise_for_status + response.headers = {"Content-Length": str(len(self.file_content))} + response.raw = MockResponseGet.MockRaw(self.file_content) + return response + + +class MockAsset: + def __init__(self, file_name): + self.name = file_name + self.browser_download_url = f"http://example.com/{file_name}" + + +class MockRelease: + def __init__(self, tag_name, assets): + self.tag_name = tag_name + self.assets = [MockAsset(asset) for asset in assets] + + def get_assets(self): + return self.assets + + +class MockRepo: + def __init__( + self, + release_dict={ + "v3.0.0": [ + "casanovo_massivekb.ckpt", + "casanovo_non-enzy.checkpt", + "v3.0.0.zip", + "v3.0.0.tar.gz", + ], + "v3.1.0": ["v3.1.0.zip", "v3.1.0.tar.gz"], + "v3.2.0": ["v3.2.0.zip", "v3.2.0.tar.gz"], + "v3.3.0": ["v3.3.0.zip", "v3.3.0.tar.gz"], + "v4.0.0": [ + "casanovo_massivekb.ckpt", + "casanovo_nontryptic.ckpt", + "v4.0.0.zip", + "v4.0.0.tar.gz", + ], + }, + ): + self.releases = [ + MockRelease(tag_name, assets) + for tag_name, assets in release_dict.items() + ] + + def get_releases(self): + return self.releases + + +class MockGithub: + def __init__(self, releases): + self.releases = releases + + def get_repo(self, repo_name): + return MockRepo() + + +def test_setup_model(monkeypatch): + test_releases = ["3.0.0", "3.0.999", "3.999.999"] + mock_get = MockResponseGet() + mock_github = functools.partial(MockGithub, test_releases) + version = "3.0.0" + + # Test model is none when not training + with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: + mnk.setattr(casanovo, "__version__", version) + mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) + filename = pathlib.Path(tmp_dir) / "casanovo_massivekb_v3_0_0.ckpt" + + assert not filename.is_file() + _, result_path = casanovo.setup_model(None, None, None, False) + assert result_path.resolve() == filename.resolve() + assert filename.is_file() + assert mock_get.request_counter == 1 + os.remove(result_path) + + assert not filename.is_file() + _, result = casanovo.setup_model(None, None, None, True) + assert result is None + assert not filename.is_file() + assert mock_get.request_counter == 1 + + with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: + mnk.setattr(casanovo, "__version__", version) + mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) + + cache_file_name = "model_weights.ckpt" + file_url = f"http://www.example.com/{cache_file_name}" + url_hash = hashlib.shake_256(file_url.encode("utf-8")).hexdigest(5) + cache_dir = pathlib.Path(tmp_dir) + cache_file_dir = cache_dir / url_hash + cache_file_path = cache_file_dir / cache_file_name + + assert not cache_file_path.is_file() + _, result_path = casanovo.setup_model(file_url, None, None, False) + assert cache_file_path.is_file() + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 2 + os.remove(result_path) + + assert not cache_file_path.is_file() + _, result_path = casanovo.setup_model(file_url, None, None, False) + assert cache_file_path.is_file() + assert result_path.resolve() == cache_file_path.resolve() + assert mock_get.request_counter == 3 + + # Test model is file + with monkeypatch.context() as mnk, tempfile.NamedTemporaryFile( + suffix=".ckpt" + ) as temp_file, tempfile.TemporaryDirectory() as tmp_dir: + mnk.setattr(casanovo, "__version__", version) + mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) + + temp_file_path = temp_file.name + _, result = casanovo.setup_model(temp_file_path, None, None, False) + assert mock_get.request_counter == 3 + assert result == temp_file_path + + _, result = casanovo.setup_model(temp_file_path, None, None, True) + assert mock_get.request_counter == 3 + assert result == temp_file_path + + # Test model is neither a URL or File + with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: + mnk.setattr(casanovo, "__version__", version) + mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) + + with pytest.raises(ValueError): + casanovo.setup_model("FooBar", None, None, False) + + assert mock_get.request_counter == 3 + + with pytest.raises(ValueError): + casanovo.setup_model("FooBar", None, None, False) + + assert mock_get.request_counter == 3 + + def test_get_model_weights(monkeypatch): """ Test that model weights can be downloaded from GitHub or used from the @@ -84,12 +250,18 @@ def test_get_model_weights(monkeypatch): """ # Model weights for fully matching version, minor matching version, major # matching version. - for version in ["3.0.0", "3.0.999", "3.999.999"]: + test_releases = ["3.0.0", "3.0.999", "3.999.999"] + mock_get = MockResponseGet() + mock_github = functools.partial(MockGithub, test_releases) + + for version in test_releases: with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: mnk.setattr(casanovo, "__version__", version) mnk.setattr( "appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir ) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) filename = os.path.join(tmp_dir, "casanovo_massivekb_v3_0_0.ckpt") assert not os.path.isfile(filename) @@ -104,6 +276,8 @@ def test_get_model_weights(monkeypatch): for version in ["999.999.999", "999.0.0"]: with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: mnk.setattr(casanovo, "__version__", version) + mnk.setattr(github, "Github", mock_github) + mnk.setattr(requests, "get", mock_get) with pytest.raises(ValueError): casanovo._get_model_weights(pathlib.Path(tmp_dir)) @@ -116,52 +290,35 @@ def request(self, *args, **kwargs): with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: mnk.setattr("appdirs.user_cache_dir", lambda n, a, opinion: tmp_dir) mnk.setattr("github.Requester.Requester.requestJsonAndCheck", request) + mnk.setattr(requests, "get", mock_get) + mock_get.request_counter = 0 with pytest.raises(github.RateLimitExceededException): casanovo._get_model_weights(pathlib.Path(tmp_dir)) + assert mock_get.request_counter == 0 + + +class MockResponseHead: + def __init__(self): + self.last_modified = None + self.is_ok = True + self.fail = False + + def __call__(self, url): + if self.fail: + raise requests.ConnectionError + + response = unittest.mock.MagicMock() + response.headers = dict() + response.ok = self.is_ok + if self.last_modified is not None: + response.headers["Last-Modified"] = self.last_modified + + return response + def test_get_weights_from_url(monkeypatch): file_url = "http://example.com/model_weights.ckpt" - file_content = b"fake model weights content" - - class MockResponseGet: - class MockRaw(io.BytesIO): - def read(self, *args, **kwargs): - return super().read(*args) - - def __init__(self): - self.request_counter = 0 - self.is_ok = True - - def raise_for_status(self): - if not self.is_ok: - raise requests.HTTPError - - def __call__(self, url, stream=True, allow_redirects=True): - self.request_counter += 1 - response = unittest.mock.MagicMock() - response.raise_for_status = self.raise_for_status - response.headers = {"Content-Length": str(len(file_content))} - response.raw = MockResponseGet.MockRaw(file_content) - return response - - class MockResponseHead: - def __init__(self): - self.last_modified = None - self.is_ok = True - self.fail = False - - def __call__(self, url): - if self.fail: - raise requests.ConnectionError - - response = unittest.mock.MagicMock() - response.headers = dict() - response.ok = self.is_ok - if self.last_modified is not None: - response.headers["Last-Modified"] = self.last_modified - - return response with monkeypatch.context() as mnk, tempfile.TemporaryDirectory() as tmp_dir: mock_get = MockResponseGet() @@ -246,6 +403,11 @@ def __call__(self, url): casanovo._get_weights_from_url(bad_url, cache_dir) +def test_is_valid_url(): + assert casanovo._is_valid_url("https://www.washington.edu/") + assert not casanovo._is_valid_url("foobar") + + def test_tensorboard(): """ Test that the tensorboard.SummaryWriter object is only created when a folder From dc6a31b57f083442fcea9fd2313a87a7cc56b673 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 29 Jul 2024 13:45:46 -0700 Subject: [PATCH 11/14] github weights download resolution documentation --- docs/file_formats.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/file_formats.md b/docs/file_formats.md index 448f290d..9c8185f0 100644 --- a/docs/file_formats.md +++ b/docs/file_formats.md @@ -15,8 +15,11 @@ As the official PSI standard format containing the complete information from a m ### Model Weights -In addition to MS/MS spectra, Casanova also optionally accepts a model weights (.ckpt) input file when running in training, sequencing or evaluating mode. +In addition to MS/MS spectra, Casanovo also optionally accepts a model weights (.ckpt) input file when running in training, sequencing or evaluating mode. If no input weight file is provided, Casanovo will automatically use the most recent compatible weights from the official Casanovo github repo, which will be downloaded and cached if they are not already. +Model weights are retrieved by matching release version. +If no model weights for an identical release (major, minor, patch), alternative releases with matching (i) major and minor, or (ii) major versions will be used. + Alternatively, you can input custom model weights in the form of a local file system path or a URL pointing to a compatible Casanovo model weights file. If a URL is provided the upstream weights file will be downloaded and cached for later use. See the [command line interface documentation](cli.rst) for more details. From c2bc971444bd00dbd065cb5831cb92716596174e Mon Sep 17 00:00:00 2001 From: Gwenneth Straub Date: Mon, 29 Jul 2024 14:43:17 -0700 Subject: [PATCH 12/14] test_get_model_weights mac os fix --- tests/unit_tests/test_unit.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index fff5feab..120e341a 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -263,13 +263,14 @@ def test_get_model_weights(monkeypatch): mnk.setattr(github, "Github", mock_github) mnk.setattr(requests, "get", mock_get) - filename = os.path.join(tmp_dir, "casanovo_massivekb_v3_0_0.ckpt") - assert not os.path.isfile(filename) - result_path = casanovo._get_model_weights(pathlib.Path(tmp_dir)) - assert str(result_path.resolve()) == filename - assert os.path.isfile(filename) - result_path = casanovo._get_model_weights(pathlib.Path(tmp_dir)) - assert str(result_path.resolve()) == filename + tmp_path = pathlib.Path(tmp_dir) + filename = tmp_path / "casanovo_massivekb_v3_0_0.ckpt" + assert not filename.is_file() + result_path = casanovo._get_model_weights(tmp_path) + assert result_path == filename + assert filename.is_file() + result_path = casanovo._get_model_weights(tmp_path) + assert result_path == filename # Impossible to find model weights for (i) full version mismatch and (ii) # major version mismatch. From 98ca77987a64fb9ec0e7adbaf23b8483d6e4a700 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Fri, 2 Aug 2024 09:17:21 +0200 Subject: [PATCH 13/14] Update CHANGELOG --- CHANGELOG.md | 5 +++-- docs/file_formats.md | 16 +++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1824cc1f..9bd936a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Added -- During training, model checkpoints will now be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run. +- During training, model checkpoints will be saved at the end of each training epoch in addition to the checkpoints saved at the end of every validation run. +- Besides as a local file, model weights can be specified from a URL. Upon initial download, the weights file is cached for future re-use. ### Fixed -- Precursor charges are now exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification. +- Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification. ## [4.2.1] - 2024-06-25 diff --git a/docs/file_formats.md b/docs/file_formats.md index 9c8185f0..b01e4c02 100644 --- a/docs/file_formats.md +++ b/docs/file_formats.md @@ -2,7 +2,7 @@ ## Input file formats for Casanovo -### MS/MS Spectra +### MS/MS spectra When you're ready to use Casanovo for *de novo* peptide sequencing, you can input your MS/MS spectra in one of the following formats: @@ -13,15 +13,17 @@ When you're ready to use Casanovo for *de novo* peptide sequencing, you can inpu All three of the above file formats can be used as input to Casanovo for *de novo* peptide sequencing. As the official PSI standard format containing the complete information from a mass spectrometry run, mzML should typically be preferred. -### Model Weights +### Model weights -In addition to MS/MS spectra, Casanovo also optionally accepts a model weights (.ckpt) input file when running in training, sequencing or evaluating mode. -If no input weight file is provided, Casanovo will automatically use the most recent compatible weights from the official Casanovo github repo, which will be downloaded and cached if they are not already. -Model weights are retrieved by matching release version. -If no model weights for an identical release (major, minor, patch), alternative releases with matching (i) major and minor, or (ii) major versions will be used. +In addition to MS/MS spectra, Casanovo also optionally accepts a model weights (.ckpt extension) input file when running in training, sequencing, or evaluating mode. +These weights define the functionality of the Casanovo neural network. + +If no input weights file is provided, Casanovo will automatically use the most recent compatible weights from the [official Casanovo GitHub repository](https://github.com/Noble-Lab/casanovo), which will be downloaded and cached locally if they are not already. +Model weights are retrieved by matching Casanovo release version, which is of the form (major, minor, patch). +If no model weights for an identical release are available, alternative releases with matching (i) major and minor, or (ii) major versions will be used. Alternatively, you can input custom model weights in the form of a local file system path or a URL pointing to a compatible Casanovo model weights file. -If a URL is provided the upstream weights file will be downloaded and cached for later use. +If a URL is provided, the upstream weights file will be downloaded and cached locally for later use. See the [command line interface documentation](cli.rst) for more details. ## Output: Understanding the mzTab format From c3a0f77fbb1072a34d9d59f4ad23c3e4871971ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 2 Aug 2024 07:20:12 +0000 Subject: [PATCH 14/14] Generate new screengrabs with rich-codex --- docs/images/help.svg | 163 +++---------------------------------------- 1 file changed, 11 insertions(+), 152 deletions(-) diff --git a/docs/images/help.svg b/docs/images/help.svg index 80d63c7e..baf2e237 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -1,4 +1,4 @@ - + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - + - + - - $ casanovo --help - -Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - - • Documentation: https://casanovo.readthedocs.io                               - • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - - • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -mass spectrometry peptide sequencing with a transformer model. Proceedings   -of the 39th International Conference on Machine Learning - ICML '22 (2022)   -doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---help-h    Show this message and exit.                                     -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -configure Generate a Casanovo configuration file to customize.               -evaluate  Evaluate de novo peptide sequencing performance.                   -sequence  De novo sequence peptides from tandem mass spectra.                -train     Train a Casanovo model on your own data.                           -version   Get the Casanovo version information                               -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help