diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 397fcc33..3a1416c1 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -7,6 +7,7 @@ import re import shutil import sys +import time import warnings from pathlib import Path from typing import Optional, Tuple @@ -140,14 +141,17 @@ def sequence( """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) + start_time = time.time() with ModelRunner(config, model) as runner: logger.info("Sequencing peptides from:") for peak_file in peak_path: logger.info(" %s", peak_file) runner.predict(peak_path, output) - - logger.info("DONE!") + psms = runner.writer.psms + utils.log_sequencing_report( + psms, start_time=start_time, end_time=time.time() + ) @main.command(cls=_SharedParams) @@ -171,14 +175,14 @@ def evaluate( """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) + start_time = time.time() with ModelRunner(config, model) as runner: logger.info("Sequencing and evaluating peptides from:") for peak_file in annotated_peak_path: logger.info(" %s", peak_file) runner.evaluate(annotated_peak_path) - - logger.info("DONE!") + utils.log_run_report(start_time=start_time, end_time=time.time()) @main.command(cls=_SharedParams) @@ -214,6 +218,7 @@ def train( """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, True) + start_time = time.time() with ModelRunner(config, model) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: @@ -224,8 +229,7 @@ def train( logger.info(" %s", peak_file) runner.train(train_peak_path, validation_peak_path) - - logger.info("DONE!") + utils.log_run_report(start_time=start_time, end_time=time.time()) @main.command() diff --git a/casanovo/utils.py b/casanovo/utils.py index 71e5962b..fde6cd05 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,15 +1,24 @@ """Small utility functions""" +import heapq import logging import os import platform import re -from typing import Tuple +import socket +import sys +import time +from datetime import datetime +from typing import Tuple, Dict, List, Optional +import numpy as np +import pandas as pd import psutil import torch +SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] + logger = logging.getLogger("casanovo") @@ -66,3 +75,179 @@ def split_version(version: str) -> Tuple[str, str, str]: """ version_regex = re.compile(r"(\d+)\.(\d+)\.*(\d*)(?:.dev\d+.+)?") return tuple(g for g in version_regex.match(version).groups()) + + +def get_score_bins( + scores: pd.Series, score_bins: List[float] +) -> Dict[float, int]: + """ + Get binned confidence scores + + From a list of confidence scores, return a dictionary mapping each + confidence score to the number of spectra with a confidence greater + than or equal to it. + + Parameters + ---------- + scores: pd.Series + Series of assigned peptide scores. + score_bins: List[float] + Confidence scores to map. + + Returns + ------- + score_bin_dict: Dict[float, int] + Dictionary mapping each confidence score to the number of spectra + with a confidence greater than or equal to it. + """ + return {score: (scores >= score).sum() for score in score_bins} + + +def get_peptide_lengths(sequences: pd.Series) -> np.ndarray: + """ + Get a numpy array containing the length of each peptide sequence + + Parameters + ---------- + sequences: pd.Series + Series of peptide sequences. + + Returns + ------- + sequence_lengths: np.ndarray + Numpy array containing the length of each sequence, listed in the + same order that the sequences are provided in. + """ + # Mass modifications do not contribute to sequence length + # FIXME: If PTMs are represented in ProForma notation this filtering + # operation needs to be reimplemented + return sequences.str.replace(r"[^a-zA-Z]", "", regex=True).apply(len) + + +def get_report_dict( + results_table: pd.DataFrame, score_bins: List[float] = SCORE_BINS +) -> Optional[Dict]: + """ + Generate sequencing run report + + Parameters + ---------- + results_table: pd.DataFrame + Parsed spectrum match table + score_bins: List[float], Optional + Confidence scores for creating confidence CMF, see get_score_bins + + Returns + ------- + report_gen: Dict + Generated report represented as a dictionary, or None if no + sequencing predictions were logged + """ + if results_table.empty: + return None + + peptide_lengths = get_peptide_lengths(results_table["sequence"]) + min_length, med_length, max_length = np.quantile( + peptide_lengths, [0, 0.5, 1] + ) + return { + "num_spectra": len(results_table), + "score_bins": get_score_bins(results_table["score"], score_bins), + "max_sequence_length": max_length, + "min_sequence_length": min_length, + "median_sequence_length": med_length, + } + + +def log_run_report( + start_time: Optional[int] = None, end_time: Optional[int] = None +) -> None: + """ + Log general run report + + Parameters + ---------- + start_time : Optional[int], default=None + The start time of the sequencing run in seconds since the epoch. + end_time : Optional[int], default=None + The end time of the sequencing run in seconds since the epoch. + """ + logger.info("======= End of Run Report =======") + if start_time is not None and end_time is not None: + start_datetime = datetime.fromtimestamp(start_time) + end_datetime = datetime.fromtimestamp(end_time) + delta_datetime = end_datetime - start_datetime + logger.info( + "Run Start Time: %s", + start_datetime.strftime("%y/%m/%d %H:%M:%S"), + ) + logger.info( + "Run End Time: %s", end_datetime.strftime("%y/%m/%d %H:%M:%S") + ) + logger.info("Time Elapsed: %s", delta_datetime) + + logger.info("Executed Command: %s", " ".join(sys.argv)) + logger.info("Executed on Host Machine: %s", socket.gethostname()) + + if torch.cuda.is_available(): + gpu_util = torch.cuda.max_memory_allocated() + logger.info("Max GPU Memory Utilization: %d MiB", gpu_util >> 20) + + +def log_sequencing_report( + predictions: Tuple[str, Tuple[str, str], float, float, float, float, str], + start_time: Optional[int] = None, + end_time: Optional[int] = None, + score_bins: List[float] = SCORE_BINS, +) -> None: + """ + Log sequencing run report + + next_prediction : Tuple[ + str, Tuple[str, str], float, float, float, float, str + ] + PSM predictions + start_time : Optional[int], default=None + The start time of the sequencing run in seconds since the epoch. + end_time : Optional[int], default=None + The end time of the sequencing run in seconds since the epoch. + score_bins: List[float], Optional + Confidence scores for creating confidence score distribution, + see get_score_bins + """ + log_run_report(start_time=start_time, end_time=end_time) + run_report = get_report_dict( + pd.DataFrame( + { + "sequence": [psm[0] for psm in predictions], + "score": [psm[2] for psm in predictions], + } + ), + score_bins=score_bins, + ) + + if run_report is None: + logger.warning( + "No predictions were logged, this may be due to an error" + ) + else: + num_spectra = run_report["num_spectra"] + logger.info("Sequenced %s spectra", num_spectra) + logger.info("Score Distribution:") + for score, pop in sorted(run_report["score_bins"].items()): + logger.info( + "%s spectra (%.2f%%) scored ≥ %.2f", + pop, + pop / num_spectra * 100, + score, + ) + + logger.info( + "Min Peptide Length: %d", run_report["min_sequence_length"] + ) + logger.info( + "Max Peptide Length: %d", run_report["max_sequence_length"] + ) + logger.info( + "Median Peptide Length: %d", run_report["median_sequence_length"] + ) diff --git a/tests/unit_tests/test_run_stats.py b/tests/unit_tests/test_run_stats.py new file mode 100644 index 00000000..9a438673 --- /dev/null +++ b/tests/unit_tests/test_run_stats.py @@ -0,0 +1,88 @@ +import random +import string + +import numpy as np +import pandas as pd + +from casanovo.utils import get_score_bins, get_peptide_lengths + + +np.random.seed(4000) +random.seed(4000) + + +def test_get_score_bins(): + NUM_TEST = 5 + NUM_BINS = 5 + BIN_MIN = -1.0 + BIN_MAX = 1.0 + BIN_RNG = BIN_MAX - BIN_MIN + MAX_NUM = 10 + MIN_NUM = 1 + + for _ in range(NUM_TEST): + curr_bins = (np.random.rand(NUM_BINS) * BIN_RNG) + BIN_MIN + curr_bins = np.sort(curr_bins) + nums_per_bin = np.random.randint(MIN_NUM, MAX_NUM, NUM_BINS) + expected = dict() + curr_scores = np.array([]) + cumulative_sum = 0 + + for i in range(len(nums_per_bin) - 1, -2, -1): + curr_min = BIN_MIN if i < 0 else curr_bins[i] + curr_max = ( + BIN_MAX if i + 1 >= len(nums_per_bin) else curr_bins[i + 1] + ) + curr_num = nums_per_bin[i] + next_scores = ( + np.random.rand(curr_num) * (curr_max - curr_min) + ) + curr_min + curr_scores = np.append(curr_scores, next_scores) + cumulative_sum += curr_num + + if i >= 0: + expected[curr_min] = cumulative_sum + + np.random.shuffle(curr_scores) + scores = pd.Series(curr_scores, name="score") + actual = get_score_bins(scores, curr_bins) + assert expected == actual + + +def test_get_peptide_lengths(): + NUM_TEST = 5 + MAX_LENGTH = 20 + MIN_LENGTH = 5 + MAX_NUM_PEPTIDES = 200 + MIN_NUM_PEPTIDES = 20 + PROB_MASS_MOD = 0.1 + + num_peptides = np.random.randint( + MIN_NUM_PEPTIDES, MAX_NUM_PEPTIDES, NUM_TEST + ) + for curr_num_peptides in num_peptides: + expected = np.random.randint(MIN_LENGTH, MAX_LENGTH, curr_num_peptides) + peptide_list = [] + + for curr_expected_len in expected: + curr_peptide_seq = "" + + i = 0 + while i < curr_expected_len: + if random.random() < PROB_MASS_MOD: + random_mass_mod = 50 * random.random() + random_mass_mod = ( + f"{random.choice('+-')}{random_mass_mod:.5f}" + ) + curr_peptide_seq += random_mass_mod + continue + + random_peptide = random.choice(string.ascii_uppercase) + curr_peptide_seq += random_peptide + i += 1 + + peptide_list.append(curr_peptide_seq) + + sequences = pd.Series(peptide_list, name="sequence") + actual = get_peptide_lengths(sequences) + assert np.array_equal(expected, actual)