Skip to content

Commit

Permalink
Skip spectra with few peaks
Browse files Browse the repository at this point in the history
  • Loading branch information
melihyilmaz committed Nov 10, 2023
1 parent 235420f commit 63ec7d3
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 11 deletions.
1 change: 1 addition & 0 deletions casanovo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Config:
_default_config = Path(__file__).parent / "config.yaml"
_config_types = dict(
random_seed=int,
min_n_peaks=int,
n_peaks=int,
min_mz=float,
max_mz=float,
Expand Down
2 changes: 2 additions & 0 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ model_save_folder_path: ""
val_check_interval: 50_000

# SPECTRUM PROCESSING OPTIONS
# Min number of peaks allowed in a spectrum
min_n_peaks: 20
# Number of the most intense peaks to retain; any remaining peaks are discarded
n_peaks: 150
# Min peak m/z allowed, peaks with smaller m/z are discarded
Expand Down
34 changes: 23 additions & 11 deletions casanovo/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class SpectrumDataset(Dataset):
----------
spectrum_index : depthcharge.data.SpectrumIndex
The MS/MS spectra to use as a dataset.
min_n_peaks : int
Minimum number of peaks allowed in each spectrum. Spectra with fewer
peaks are discarded.
n_peaks : Optional[int]
The number of top-n most intense peaks to keep in each spectrum. `None`
retains all peaks.
Expand All @@ -38,6 +41,7 @@ class SpectrumDataset(Dataset):
def __init__(
self,
spectrum_index: depthcharge.data.SpectrumIndex,
min_n_peaks: int = 20,
n_peaks: int = 150,
min_mz: float = 140.0,
max_mz: float = 2500.0,
Expand All @@ -47,6 +51,7 @@ def __init__(
):
"""Initialize a SpectrumDataset"""
super().__init__()
self.min_n_peaks = min_n_peaks
self.n_peaks = n_peaks
self.min_mz = min_mz
self.max_mz = max_mz
Expand Down Expand Up @@ -86,12 +91,13 @@ def __getitem__(
spectrum = self._process_peaks(
mz_array, int_array, precursor_mz, precursor_charge
)
return (
spectrum,
precursor_mz,
precursor_charge,
self.get_spectrum_id(idx),
)
if spectrum is not None:
return (
spectrum,
precursor_mz,
precursor_charge,
self.get_spectrum_id(idx),
)

def get_spectrum_id(self, idx: int) -> Tuple[str, str]:
"""
Expand Down Expand Up @@ -148,13 +154,13 @@ def _process_peaks(
)
try:
spectrum.set_mz_range(self.min_mz, self.max_mz)
if len(spectrum.mz) == 0:
if len(spectrum.mz) < self.min_n_peaks:
raise ValueError
spectrum.remove_precursor_peak(self.remove_precursor_tol, "Da")
if len(spectrum.mz) == 0:
if len(spectrum.mz) < self.min_n_peaks:
raise ValueError
spectrum.filter_intensity(self.min_intensity, self.n_peaks)
if len(spectrum.mz) == 0:
if len(spectrum.mz) < self.min_n_peaks:
raise ValueError
spectrum.scale_intensity("root", 1)
intensities = spectrum.intensity / np.linalg.norm(
Expand All @@ -163,7 +169,7 @@ def _process_peaks(
return torch.tensor(np.array([spectrum.mz, intensities])).T.float()
except ValueError:
# Invalid spectra (fewer than min_n_peaks peaks left) yield None so callers can skip them.
return torch.tensor([[0, 1]]).float()
return None # torch.tensor([[3, 3]]).float()

@property
def n_spectra(self) -> int:
Expand Down Expand Up @@ -194,6 +200,9 @@ class AnnotatedSpectrumDataset(SpectrumDataset):
----------
annotated_spectrum_index : depthcharge.data.SpectrumIndex
The MS/MS spectra to use as a dataset.
min_n_peaks : int
Minimum number of peaks allowed in each spectrum. Spectra with fewer
peaks are discarded.
n_peaks : Optional[int]
The number of top-n most intense peaks to keep in each spectrum. `None`
retains all peaks.
Expand All @@ -216,6 +225,7 @@ class AnnotatedSpectrumDataset(SpectrumDataset):
def __init__(
self,
annotated_spectrum_index: depthcharge.data.SpectrumIndex,
min_n_peaks: int = 20,
n_peaks: int = 150,
min_mz: float = 140.0,
max_mz: float = 2500.0,
Expand All @@ -225,6 +235,7 @@ def __init__(
):
super().__init__(
annotated_spectrum_index,
min_n_peaks=min_n_peaks,
n_peaks=n_peaks,
min_mz=min_mz,
max_mz=max_mz,
Expand Down Expand Up @@ -263,4 +274,5 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
spectrum = self._process_peaks(
mz_array, int_array, precursor_mz, precursor_charge
)
return spectrum, precursor_mz, precursor_charge, peptide
if spectrum is not None:
return spectrum, precursor_mz, precursor_charge, peptide
8 changes: 8 additions & 0 deletions casanovo/denovo/dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class DeNovoDataModule(pl.LightningDataModule):
The batch size to use for training.
eval_batch_size : int
The batch size to use for inference.
min_n_peaks : int
Minimum number of peaks allowed in each spectrum. Spectra with fewer
peaks are discarded.
n_peaks : Optional[int]
The number of top-n most intense peaks to keep in each spectrum. `None`
retains all peaks.
Expand Down Expand Up @@ -56,6 +59,7 @@ def __init__(
test_index: Optional[AnnotatedSpectrumIndex] = None,
train_batch_size: int = 128,
eval_batch_size: int = 1028,
min_n_peaks: int = 20,
n_peaks: Optional[int] = 150,
min_mz: float = 50.0,
max_mz: float = 2500.0,
Expand All @@ -70,6 +74,7 @@ def __init__(
self.test_index = test_index
self.train_batch_size = train_batch_size
self.eval_batch_size = eval_batch_size
self.min_n_peaks = min_n_peaks
self.n_peaks = n_peaks
self.min_mz = min_mz
self.max_mz = max_mz
Expand Down Expand Up @@ -97,6 +102,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None:
if stage in (None, "fit", "validate"):
make_dataset = functools.partial(
AnnotatedSpectrumDataset,
min_n_peaks=self.min_n_peaks,
n_peaks=self.n_peaks,
min_mz=self.min_mz,
max_mz=self.max_mz,
Expand All @@ -113,6 +119,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None:
if stage in (None, "test"):
make_dataset = functools.partial(
AnnotatedSpectrumDataset if annotated else SpectrumDataset,
min_n_peaks=self.min_n_peaks,
n_peaks=self.n_peaks,
min_mz=self.min_mz,
max_mz=self.max_mz,
Expand Down Expand Up @@ -201,6 +208,7 @@ def prepare_batch(
The spectrum identifiers (during de novo sequencing) or peptide
sequences (during training).
"""
batch = [spectrum for spectrum in batch if spectrum is not None]
spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch))
spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True)
precursor_mzs = torch.tensor(precursor_mzs)
Expand Down
1 change: 1 addition & 0 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ def initialize_data_module(
train_index=train_index,
valid_index=valid_index,
test_index=test_index,
min_n_peaks=self.config.min_n_peaks,
min_mz=self.config.min_mz,
max_mz=self.config.max_mz,
min_intensity=self.config.min_intensity,
Expand Down

0 comments on commit 63ec7d3

Please sign in to comment.