Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced Sequencing Run Logging #343

Merged
merged 36 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
da45608
implemented report_gen submodule
Lilferrit Jun 18, 2024
2d6b5c3
report_gen documentation
Lilferrit Jun 18, 2024
28fa6c8
report_gen submodule test
Lilferrit Jun 19, 2024
97e5bf1
naming conventions
Lilferrit Jun 19, 2024
4f635f9
naming conventions
Lilferrit Jun 19, 2024
aa43a8c
PredictionWriter virtual class
Lilferrit Jun 21, 2024
46bb62c
multi prediction writer
Lilferrit Jun 21, 2024
40eecb1
LogPredicitonWriter wip
Lilferrit Jun 21, 2024
2d7effa
implemented logger io
Lilferrit Jun 21, 2024
a7beddf
removed report gen submodule
Lilferrit Jun 21, 2024
65b5a83
logger io test
Lilferrit Jun 21, 2024
1f656b6
logging info
Lilferrit Jun 21, 2024
4d2fab1
implemented end of run logging
Lilferrit Jun 21, 2024
9e903e7
Merge branch 'main' into run-report-logging
Lilferrit Jun 21, 2024
22f26c7
Generate new screengrabs with rich-codex
github-actions[bot] Jun 21, 2024
2f83bb7
logger io test fix
Lilferrit Jun 21, 2024
858704e
formatting fixes
Lilferrit Jun 21, 2024
6da1219
updated screenshots
Lilferrit Jun 21, 2024
bf6c20c
test file formatting
Lilferrit Jun 21, 2024
ed1b841
Restrict NumPy to pre-2.0
bittremieux Jun 24, 2024
968f60a
Update changelog
bittremieux Jun 24, 2024
0b12fb8
PredictionMultiWriter serialization
Lilferrit Jun 24, 2024
ff37b54
log writer error handling
Lilferrit Jun 24, 2024
dee9bf0
reformatting
Lilferrit Jun 24, 2024
411f717
Merge branch 'hotfix_numpy' of github.com:Noble-Lab/casanovo into run…
Lilferrit Jun 24, 2024
19d8aa8
verified skipped spectra counter
Lilferrit Jun 24, 2024
d467e87
Generate new screengrabs with rich-codex
github-actions[bot] Jun 24, 2024
56ef340
changelog merge confict
Lilferrit Jun 27, 2024
79c706e
migrated end of run report logging functionality to ms_io
Lilferrit Jun 28, 2024
4942a48
moved logging utility functions to util.py
Lilferrit Jul 3, 2024
66860e2
requested changes
Lilferrit Jul 8, 2024
a4d6649
more requested changes
Lilferrit Jul 9, 2024
13ce8a0
Merge branch 'dev' into run-report-logging
Lilferrit Jul 9, 2024
51df665
resolved dev merge conflicts
Lilferrit Jul 10, 2024
57b0284
Minor simplifications
bittremieux Jul 10, 2024
e0f5230
Fix tests
bittremieux Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import shutil
import sys
import time
import warnings
from pathlib import Path
from typing import Optional, Tuple
Expand Down Expand Up @@ -140,14 +141,17 @@ def sequence(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Sequencing peptides from:")
for peak_file in peak_path:
logger.info(" %s", peak_file)

runner.predict(peak_path, output)

logger.info("DONE!")
psms = runner.writer.psms
utils.log_sequencing_report(
psms, start_time=start_time, end_time=time.time()
)


@main.command(cls=_SharedParams)
Expand All @@ -171,14 +175,14 @@ def evaluate(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Sequencing and evaluating peptides from:")
for peak_file in annotated_peak_path:
logger.info(" %s", peak_file)

runner.evaluate(annotated_peak_path)

logger.info("DONE!")
utils.log_run_report(start_time=start_time, end_time=time.time())


@main.command(cls=_SharedParams)
Expand Down Expand Up @@ -214,6 +218,7 @@ def train(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, True)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Training a model from:")
for peak_file in train_peak_path:
Expand All @@ -224,8 +229,7 @@ def train(
logger.info(" %s", peak_file)

runner.train(train_peak_path, validation_peak_path)

logger.info("DONE!")
utils.log_run_report(start_time=start_time, end_time=time.time())


@main.command()
Expand Down
187 changes: 186 additions & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
"""Small utility functions"""

import heapq
import logging
import os
import platform
import re
from typing import Tuple
import socket
import sys
import time
from datetime import datetime
from typing import Tuple, Dict, List, Optional

import numpy as np
import pandas as pd
import psutil
import torch


SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]

logger = logging.getLogger("casanovo")


Expand Down Expand Up @@ -66,3 +75,179 @@
"""
version_regex = re.compile(r"(\d+)\.(\d+)\.*(\d*)(?:.dev\d+.+)?")
return tuple(g for g in version_regex.match(version).groups())


def get_score_bins(
    scores: pd.Series, score_bins: List[float]
) -> Dict[float, int]:
    """
    Get binned confidence scores

    From a series of confidence scores, return a dictionary mapping each
    confidence threshold to the number of spectra with a confidence
    greater than or equal to it.

    Parameters
    ----------
    scores: pd.Series
        Series of assigned peptide scores.
    score_bins: List[float]
        Confidence scores to map.

    Returns
    -------
    score_bin_dict: Dict[float, int]
        Dictionary mapping each confidence score to the number of spectra
        with a confidence greater than or equal to it.
    """
    # Cast to a plain int so the values match the declared
    # Dict[float, int] return type (Series.sum yields a NumPy scalar).
    return {score: int((scores >= score).sum()) for score in score_bins}


def get_peptide_lengths(sequences: pd.Series) -> np.ndarray:
    """
    Get a numpy array containing the length of each peptide sequence

    Parameters
    ----------
    sequences: pd.Series
        Series of peptide sequences.

    Returns
    -------
    sequence_lengths: np.ndarray
        Numpy array containing the length of each sequence, listed in the
        same order that the sequences are provided in.
    """
    # Mass modifications do not contribute to sequence length, so strip
    # every non-letter character before counting residues.
    # FIXME: If PTMs are represented in ProForma notation this filtering
    # operation needs to be reimplemented
    # Convert to NumPy so the return value matches the documented
    # np.ndarray type (the chained Series ops yield a pd.Series).
    return (
        sequences.str.replace(r"[^a-zA-Z]", "", regex=True)
        .str.len()
        .to_numpy()
    )


def get_report_dict(
    results_table: pd.DataFrame, score_bins: List[float] = SCORE_BINS
) -> Optional[Dict]:
    """
    Generate sequencing run report

    Parameters
    ----------
    results_table: pd.DataFrame
        Parsed spectrum match table
    score_bins: List[float], Optional
        Confidence scores for creating confidence CMF, see get_score_bins

    Returns
    -------
    report_gen: Dict
        Generated report represented as a dictionary, or None if no
        sequencing predictions were logged
    """
    # Nothing to report when no spectrum matches were recorded.
    if results_table.empty:
        return None

    lengths = get_peptide_lengths(results_table["sequence"])
    shortest, median, longest = np.quantile(lengths, [0, 0.5, 1])
    report = {
        "num_spectra": len(results_table),
        "score_bins": get_score_bins(results_table["score"], score_bins),
        "max_sequence_length": longest,
        "min_sequence_length": shortest,
        "median_sequence_length": median,
    }
    return report


def log_run_report(
    start_time: Optional[float] = None, end_time: Optional[float] = None
) -> None:
    """
    Log general run report

    Parameters
    ----------
    start_time : Optional[float], default=None
        The start time of the run in seconds since the epoch, as
        returned by time.time().
    end_time : Optional[float], default=None
        The end time of the run in seconds since the epoch, as
        returned by time.time().
    """
    logger.info("======= End of Run Report =======")
    # Timing details are only reported when both endpoints are known.
    if start_time is not None and end_time is not None:
        start_datetime = datetime.fromtimestamp(start_time)
        end_datetime = datetime.fromtimestamp(end_time)
        delta_datetime = end_datetime - start_datetime
        logger.info(
            "Run Start Time: %s",
            start_datetime.strftime("%y/%m/%d %H:%M:%S"),
        )
        logger.info(
            "Run End Time: %s", end_datetime.strftime("%y/%m/%d %H:%M:%S")
        )
        logger.info("Time Elapsed: %s", delta_datetime)

    logger.info("Executed Command: %s", " ".join(sys.argv))
    logger.info("Executed on Host Machine: %s", socket.gethostname())

    # GPU statistics are only available when CUDA is present.
    if torch.cuda.is_available():
        gpu_util = torch.cuda.max_memory_allocated()
        logger.info("Max GPU Memory Utilization: %d MiB", gpu_util >> 20)

Check warning on line 194 in casanovo/utils.py

View check run for this annotation

Codecov / codecov/patch

casanovo/utils.py#L193-L194

Added lines #L193 - L194 were not covered by tests


def log_sequencing_report(
    predictions: List[
        Tuple[str, Tuple[str, str], float, float, float, float, str]
    ],
    start_time: Optional[float] = None,
    end_time: Optional[float] = None,
    score_bins: List[float] = SCORE_BINS,
) -> None:
    """
    Log sequencing run report

    Parameters
    ----------
    predictions : List[Tuple[str, Tuple[str, str], float, float, float, float, str]]
        PSM predictions; element 0 of each tuple is the peptide sequence
        and element 2 is the peptide score.
    start_time : Optional[float], default=None
        The start time of the sequencing run in seconds since the epoch.
    end_time : Optional[float], default=None
        The end time of the sequencing run in seconds since the epoch.
    score_bins: List[float], Optional
        Confidence scores for creating confidence score distribution,
        see get_score_bins
    """
    # General (non-sequencing-specific) run details first.
    log_run_report(start_time=start_time, end_time=end_time)
    run_report = get_report_dict(
        pd.DataFrame(
            {
                "sequence": [psm[0] for psm in predictions],
                "score": [psm[2] for psm in predictions],
            }
        ),
        score_bins=score_bins,
    )

    if run_report is None:
        logger.warning(
            "No predictions were logged, this may be due to an error"
        )
    else:
        num_spectra = run_report["num_spectra"]
        logger.info("Sequenced %s spectra", num_spectra)
        logger.info("Score Distribution:")
        for score, pop in sorted(run_report["score_bins"].items()):
            logger.info(
                "%s spectra (%.2f%%) scored ≥ %.2f",
                pop,
                pop / num_spectra * 100,
                score,
            )

        logger.info(
            "Min Peptide Length: %d", run_report["min_sequence_length"]
        )
        logger.info(
            "Max Peptide Length: %d", run_report["max_sequence_length"]
        )
        logger.info(
            "Median Peptide Length: %d", run_report["median_sequence_length"]
        )
88 changes: 88 additions & 0 deletions tests/unit_tests/test_run_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import random
import string

import numpy as np
import pandas as pd

from casanovo.utils import get_score_bins, get_peptide_lengths


np.random.seed(4000)
random.seed(4000)


def test_get_score_bins():
    """
    Verify get_score_bins against randomly generated score populations.

    For each trial, a known number of scores is drawn uniformly within
    each inter-threshold interval, so the exact cumulative count at
    every threshold is known by construction.
    """
    # Number of randomized trials, thresholds per trial, threshold
    # range, and scores generated per interval.
    NUM_TEST = 5
    NUM_BINS = 5
    BIN_MIN = -1.0
    BIN_MAX = 1.0
    BIN_RNG = BIN_MAX - BIN_MIN
    MAX_NUM = 10
    MIN_NUM = 1

    for _ in range(NUM_TEST):
        # Draw NUM_BINS random thresholds in [BIN_MIN, BIN_MAX) and sort
        # them so they partition the range into intervals.
        curr_bins = (np.random.rand(NUM_BINS) * BIN_RNG) + BIN_MIN
        curr_bins = np.sort(curr_bins)
        nums_per_bin = np.random.randint(MIN_NUM, MAX_NUM, NUM_BINS)
        expected = dict()
        curr_scores = np.array([])
        cumulative_sum = 0

        # Walk intervals from highest to lowest; i == -1 handles the
        # region below the lowest threshold, which contributes scores
        # (using nums_per_bin[-1] as its count) but no expected entry.
        for i in range(len(nums_per_bin) - 1, -2, -1):
            curr_min = BIN_MIN if i < 0 else curr_bins[i]
            curr_max = (
                BIN_MAX if i + 1 >= len(nums_per_bin) else curr_bins[i + 1]
            )
            curr_num = nums_per_bin[i]
            # Uniform scores confined to [curr_min, curr_max).
            next_scores = (
                np.random.rand(curr_num) * (curr_max - curr_min)
            ) + curr_min
            curr_scores = np.append(curr_scores, next_scores)
            cumulative_sum += curr_num

            if i >= 0:
                expected[curr_min] = cumulative_sum

        # Shuffle so the result does not depend on generation order.
        np.random.shuffle(curr_scores)
        scores = pd.Series(curr_scores, name="score")
        actual = get_score_bins(scores, curr_bins)
        assert expected == actual


def test_get_peptide_lengths():
    """
    Peptide lengths should count residues only, ignoring interleaved
    mass modifications.
    """
    NUM_TEST = 5
    MAX_LENGTH = 20
    MIN_LENGTH = 5
    MAX_NUM_PEPTIDES = 200
    MIN_NUM_PEPTIDES = 20
    PROB_MASS_MOD = 0.1

    num_peptides = np.random.randint(
        MIN_NUM_PEPTIDES, MAX_NUM_PEPTIDES, NUM_TEST
    )
    for n_peptides in num_peptides:
        expected = np.random.randint(MIN_LENGTH, MAX_LENGTH, n_peptides)
        peptide_list = []

        for target_len in expected:
            seq = ""
            residues_added = 0

            # Build a random peptide, occasionally injecting a signed
            # mass modification that must not count towards its length.
            while residues_added < target_len:
                if random.random() < PROB_MASS_MOD:
                    mod_mass = 50 * random.random()
                    seq += f"{random.choice('+-')}{mod_mass:.5f}"
                    continue

                seq += random.choice(string.ascii_uppercase)
                residues_added += 1

            peptide_list.append(seq)

        actual = get_peptide_lengths(
            pd.Series(peptide_list, name="sequence")
        )
        assert np.array_equal(expected, actual)
Loading