Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced Sequencing Run Logging #343

Merged
merged 36 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
da45608
implemented report_gen submodule
Lilferrit Jun 18, 2024
2d6b5c3
report_gen documentation
Lilferrit Jun 18, 2024
28fa6c8
report_gen submodule test
Lilferrit Jun 19, 2024
97e5bf1
naming conventions
Lilferrit Jun 19, 2024
4f635f9
naming conventions
Lilferrit Jun 19, 2024
aa43a8c
PredictionWriter virtual class
Lilferrit Jun 21, 2024
46bb62c
multi prediction writer
Lilferrit Jun 21, 2024
40eecb1
LogPredicitonWriter wip
Lilferrit Jun 21, 2024
2d7effa
implemented logger io
Lilferrit Jun 21, 2024
a7beddf
removed report gen submodule
Lilferrit Jun 21, 2024
65b5a83
logger io test
Lilferrit Jun 21, 2024
1f656b6
logging info
Lilferrit Jun 21, 2024
4d2fab1
implemented end of run logging
Lilferrit Jun 21, 2024
9e903e7
Merge branch 'main' into run-report-logging
Lilferrit Jun 21, 2024
22f26c7
Generate new screengrabs with rich-codex
github-actions[bot] Jun 21, 2024
2f83bb7
logger io test fix
Lilferrit Jun 21, 2024
858704e
formatting fixes
Lilferrit Jun 21, 2024
6da1219
updated screenshots
Lilferrit Jun 21, 2024
bf6c20c
test file formatting
Lilferrit Jun 21, 2024
ed1b841
Restrict NumPy to pre-2.0
bittremieux Jun 24, 2024
968f60a
Update changelog
bittremieux Jun 24, 2024
0b12fb8
PredictionMultiWriter serialization
Lilferrit Jun 24, 2024
ff37b54
log writer error handling
Lilferrit Jun 24, 2024
dee9bf0
reformatting
Lilferrit Jun 24, 2024
411f717
Merge branch 'hotfix_numpy' of github.com:Noble-Lab/casanovo into run…
Lilferrit Jun 24, 2024
19d8aa8
verified skipped spectra counter
Lilferrit Jun 24, 2024
d467e87
Generate new screengrabs with rich-codex
github-actions[bot] Jun 24, 2024
56ef340
changelog merge confict
Lilferrit Jun 27, 2024
79c706e
migrated end of run report logging functionality to ms_io
Lilferrit Jun 28, 2024
4942a48
moved logging utility functions to util.py
Lilferrit Jul 3, 2024
66860e2
requested changes
Lilferrit Jul 8, 2024
a4d6649
more requested changes
Lilferrit Jul 9, 2024
13ce8a0
Merge branch 'dev' into run-report-logging
Lilferrit Jul 9, 2024
51df665
resolved dev merge conflicts
Lilferrit Jul 10, 2024
57b0284
Minor simplifications
bittremieux Jul 10, 2024
e0f5230
Fix tests
bittremieux Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import shutil
import sys
import time
import warnings
from pathlib import Path
from typing import Optional, Tuple
Expand Down Expand Up @@ -140,14 +141,17 @@ def sequence(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Sequencing peptides from:")
for peak_file in peak_path:
logger.info(" %s", peak_file)

runner.predict(peak_path, output)

logger.info("DONE!")
psms = runner.writer.psms
utils.log_sequencing_report(
psms, start_time=start_time, end_time=time.time()
)


@main.command(cls=_SharedParams)
Expand All @@ -171,14 +175,14 @@ def evaluate(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Sequencing and evaluating peptides from:")
for peak_file in annotated_peak_path:
logger.info(" %s", peak_file)

runner.evaluate(annotated_peak_path)

logger.info("DONE!")
utils.log_run_report(start_time=start_time, end_time=time.time())


@main.command(cls=_SharedParams)
Expand Down Expand Up @@ -214,6 +218,7 @@ def train(
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, True)
start_time = time.time()
with ModelRunner(config, model) as runner:
logger.info("Training a model from:")
for peak_file in train_peak_path:
Expand All @@ -224,8 +229,7 @@ def train(
logger.info(" %s", peak_file)

runner.train(train_peak_path, validation_peak_path)

logger.info("DONE!")
utils.log_run_report(start_time=start_time, end_time=time.time())


@main.command()
Expand Down
187 changes: 186 additions & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
"""Small utility functions"""

import heapq
import logging
import os
import platform
import re
from typing import Tuple
import socket
import sys
import time
from datetime import datetime
from typing import Tuple, Dict, List, Optional

import numpy as np
import pandas as pd
import psutil
import torch


SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]

logger = logging.getLogger("casanovo")


Expand Down Expand Up @@ -66,3 +75,179 @@
"""
version_regex = re.compile(r"(\d+)\.(\d+)\.*(\d*)(?:.dev\d+.+)?")
return tuple(g for g in version_regex.match(version).groups())


def get_score_bins(
    scores: pd.Series, score_bins: List[float]
) -> Dict[float, int]:
    """
    Get binned confidence scores

    From a series of confidence scores, return a dictionary mapping each
    confidence threshold to the number of spectra with a confidence
    greater than or equal to it.

    Parameters
    ----------
    scores: pd.Series
        Series of assigned peptide scores.
    score_bins: List[float]
        Confidence scores to map.

    Returns
    -------
    score_bin_dict: Dict[float, int]
        Dictionary mapping each confidence score to the number of spectra
        with a confidence greater than or equal to it.
    """
    # Cast to a plain int so the values match the declared
    # Dict[float, int] return type (Series.sum yields a NumPy scalar).
    return {score: int((scores >= score).sum()) for score in score_bins}


def get_peptide_lengths(sequences: pd.Series) -> np.ndarray:
    """
    Get a numpy array containing the length of each peptide sequence

    Parameters
    ----------
    sequences: pd.Series
        Series of peptide sequences.

    Returns
    -------
    sequence_lengths: np.ndarray
        Numpy array containing the length of each sequence, listed in the
        same order that the sequences are provided in.
    """
    # Mass modifications do not contribute to sequence length, so strip
    # every non-letter character before counting residues.
    # FIXME: If PTMs are represented in ProForma notation this filtering
    # operation needs to be reimplemented
    # Convert to NumPy so the return value matches the documented
    # np.ndarray type (the chained Series ops yield a pd.Series).
    return (
        sequences.str.replace(r"[^a-zA-Z]", "", regex=True)
        .str.len()
        .to_numpy()
    )


def get_report_dict(
    results_table: pd.DataFrame, score_bins: List[float] = SCORE_BINS
) -> Optional[Dict]:
    """
    Generate sequencing run report

    Parameters
    ----------
    results_table: pd.DataFrame
        Parsed spectrum match table
    score_bins: List[float], Optional
        Confidence scores for creating confidence CMF, see get_score_bins

    Returns
    -------
    report_gen: Dict
        Generated report represented as a dictionary, or None if no
        sequencing predictions were logged
    """
    # Nothing to report when no spectrum matches were recorded.
    if results_table.empty:
        return None

    lengths = get_peptide_lengths(results_table["sequence"])
    shortest, median, longest = np.quantile(lengths, [0, 0.5, 1])
    report = {
        "num_spectra": len(results_table),
        "score_bins": get_score_bins(results_table["score"], score_bins),
        "max_sequence_length": longest,
        "min_sequence_length": shortest,
        "median_sequence_length": median,
    }
    return report


def log_run_report(
    start_time: Optional[float] = None, end_time: Optional[float] = None
) -> None:
    """
    Log general run report

    Parameters
    ----------
    start_time : Optional[float], default=None
        The start time of the run in seconds since the epoch, as
        returned by time.time().
    end_time : Optional[float], default=None
        The end time of the run in seconds since the epoch, as
        returned by time.time().
    """
    logger.info("======= End of Run Report =======")
    # Timing details are only reported when both endpoints are known.
    if start_time is not None and end_time is not None:
        start_datetime = datetime.fromtimestamp(start_time)
        end_datetime = datetime.fromtimestamp(end_time)
        delta_datetime = end_datetime - start_datetime
        logger.info(
            "Run Start Time: %s",
            start_datetime.strftime("%y/%m/%d %H:%M:%S"),
        )
        logger.info(
            "Run End Time: %s", end_datetime.strftime("%y/%m/%d %H:%M:%S")
        )
        logger.info("Time Elapsed: %s", delta_datetime)

    logger.info("Executed Command: %s", " ".join(sys.argv))
    logger.info("Executed on Host Machine: %s", socket.gethostname())

    # GPU statistics are only available when CUDA is present.
    if torch.cuda.is_available():
        gpu_util = torch.cuda.max_memory_allocated()
        logger.info("Max GPU Memory Utilization: %d MiB", gpu_util >> 20)

Check warning on line 194 in casanovo/utils.py

View check run for this annotation

Codecov / codecov/patch

casanovo/utils.py#L193-L194

Added lines #L193 - L194 were not covered by tests


def log_sequencing_report(
    predictions: List[
        Tuple[str, Tuple[str, str], float, float, float, float, str]
    ],
    start_time: Optional[float] = None,
    end_time: Optional[float] = None,
    score_bins: List[float] = SCORE_BINS,
) -> None:
    """
    Log sequencing run report

    Parameters
    ----------
    predictions : List[Tuple[str, Tuple[str, str], float, float, float, float, str]]
        PSM predictions; element 0 of each tuple is the peptide sequence
        and element 2 is the peptide score.
    start_time : Optional[float], default=None
        The start time of the sequencing run in seconds since the epoch.
    end_time : Optional[float], default=None
        The end time of the sequencing run in seconds since the epoch.
    score_bins: List[float], Optional
        Confidence scores for creating confidence score distribution,
        see get_score_bins
    """
    # General (non-sequencing-specific) run details first.
    log_run_report(start_time=start_time, end_time=end_time)
    run_report = get_report_dict(
        pd.DataFrame(
            {
                "sequence": [psm[0] for psm in predictions],
                "score": [psm[2] for psm in predictions],
            }
        ),
        score_bins=score_bins,
    )

    if run_report is None:
        logger.warning(
            "No predictions were logged, this may be due to an error"
        )
    else:
        num_spectra = run_report["num_spectra"]
        logger.info("Sequenced %s spectra", num_spectra)
        logger.info("Score Distribution:")
        for score, pop in sorted(run_report["score_bins"].items()):
            logger.info(
                "%s spectra (%.2f%%) scored ≥ %.2f",
                pop,
                pop / num_spectra * 100,
                score,
            )

        logger.info(
            "Min Peptide Length: %d", run_report["min_sequence_length"]
        )
        logger.info(
            "Max Peptide Length: %d", run_report["max_sequence_length"]
        )
        logger.info(
            "Median Peptide Length: %d", run_report["median_sequence_length"]
        )
88 changes: 88 additions & 0 deletions tests/unit_tests/test_run_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import random
import string

import numpy as np
import pandas as pd

from casanovo.utils import get_score_bins, get_peptide_lengths


np.random.seed(4000)
random.seed(4000)


def test_get_score_bins():
    """
    Verify get_score_bins against randomly generated score populations.

    For each trial, a known number of scores is drawn uniformly within
    each inter-threshold interval, so the exact cumulative count at
    every threshold is known by construction.
    """
    # Number of randomized trials, thresholds per trial, threshold
    # range, and scores generated per interval.
    NUM_TEST = 5
    NUM_BINS = 5
    BIN_MIN = -1.0
    BIN_MAX = 1.0
    BIN_RNG = BIN_MAX - BIN_MIN
    MAX_NUM = 10
    MIN_NUM = 1

    for _ in range(NUM_TEST):
        # Draw NUM_BINS random thresholds in [BIN_MIN, BIN_MAX) and sort
        # them so they partition the range into intervals.
        curr_bins = (np.random.rand(NUM_BINS) * BIN_RNG) + BIN_MIN
        curr_bins = np.sort(curr_bins)
        nums_per_bin = np.random.randint(MIN_NUM, MAX_NUM, NUM_BINS)
        expected = dict()
        curr_scores = np.array([])
        cumulative_sum = 0

        # Walk intervals from highest to lowest; i == -1 handles the
        # region below the lowest threshold, which contributes scores
        # (using nums_per_bin[-1] as its count) but no expected entry.
        for i in range(len(nums_per_bin) - 1, -2, -1):
            curr_min = BIN_MIN if i < 0 else curr_bins[i]
            curr_max = (
                BIN_MAX if i + 1 >= len(nums_per_bin) else curr_bins[i + 1]
            )
            curr_num = nums_per_bin[i]
            # Uniform scores confined to [curr_min, curr_max).
            next_scores = (
                np.random.rand(curr_num) * (curr_max - curr_min)
            ) + curr_min
            curr_scores = np.append(curr_scores, next_scores)
            cumulative_sum += curr_num

            if i >= 0:
                expected[curr_min] = cumulative_sum

        # Shuffle so the result does not depend on generation order.
        np.random.shuffle(curr_scores)
        scores = pd.Series(curr_scores, name="score")
        actual = get_score_bins(scores, curr_bins)
        assert expected == actual


def test_get_peptide_lengths():
    """
    Peptide lengths should count residues only, ignoring interleaved
    mass modifications.
    """
    NUM_TEST = 5
    MAX_LENGTH = 20
    MIN_LENGTH = 5
    MAX_NUM_PEPTIDES = 200
    MIN_NUM_PEPTIDES = 20
    PROB_MASS_MOD = 0.1

    num_peptides = np.random.randint(
        MIN_NUM_PEPTIDES, MAX_NUM_PEPTIDES, NUM_TEST
    )
    for n_peptides in num_peptides:
        expected = np.random.randint(MIN_LENGTH, MAX_LENGTH, n_peptides)
        peptide_list = []

        for target_len in expected:
            seq = ""
            residues_added = 0

            # Build a random peptide, occasionally injecting a signed
            # mass modification that must not count towards its length.
            while residues_added < target_len:
                if random.random() < PROB_MASS_MOD:
                    mod_mass = 50 * random.random()
                    seq += f"{random.choice('+-')}{mod_mass:.5f}"
                    continue

                seq += random.choice(string.ascii_uppercase)
                residues_added += 1

            peptide_list.append(seq)

        actual = get_peptide_lengths(
            pd.Series(peptide_list, name="sequence")
        )
        assert np.array_equal(expected, actual)
Loading