Skip to content

Commit

Permalink
added ms2deepscore
Browse files Browse the repository at this point in the history
  • Loading branch information
zargham-ahmad committed Aug 15, 2024
1 parent 8db07ed commit bc5b20a
Show file tree
Hide file tree
Showing 7 changed files with 6,975 additions and 0 deletions.
146 changes: 146 additions & 0 deletions tools/ms2deepscore/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
<macros>
<!-- Version string shared by the tool wrappers: used in each tool's version attribute and in the ms2deepscore package requirement. -->
<token name="@TOOL_VERSION@">2.0.0</token>

<!-- Creator metadata (one person, one organization) pulled into the tools through expand macro="creator". -->
<xml name="creator">
    <creator>
        <person familyName="Ahmad"
                givenName="Zargham"
                identifier="0000-0002-6096-224X"
                url="https://github.com/zargham-ahmad" />
        <!-- NOTE(review): the email value below looks like an obfuscation placeholder rather than a real address; confirm against the repository. -->
        <organization name="RECETOX MUNI"
                      email="[email protected]"
                      url="https://www.recetox.muni.cz/" />
    </creator>
</xml>

<!-- Cross-reference to the bio.tools entry for ms2deepscore. Despite the macro name, it contains xrefs only. -->
<xml name="edam">
    <xrefs>
        <xref type="bio.tools">ms2deepscore</xref>
    </xrefs>
</xml>

<!-- Inputs of the similarity tool: either a previously computed scores JSON or a pair of
     query/reference spectra files, plus the ONNX model and its settings JSON. -->
<xml name="input_param">
    <conditional name="scores">
        <param name="use_scores" type="select" label="Use Scores Object">
            <option value="False" selected="true">FALSE</option>
            <option value="True">TRUE</option>
        </param>
        <when value="True">
            <param name="scores_in" type="data" format="json" label="Scores object"
                   help="Scores objects calculated previously using one of the matchms similarity tools." />
        </when>
        <when value="False">
            <param name="queries" type="data" format="msp" label="Queries spectra"
                   help="Query mass spectra to match against references." />
            <param name="references" type="data" format="msp" label="Reference spectra"
                   help="Reference mass spectra to match against as library." />
        </when>
    </conditional>
    <param name="model" type="data" format="onnx" label="Pre-trained Model"
           help="Select the pre-trained MS2DeepScore model file (onnx format)" />
    <param name="model_param" type="data" format="json" label="Model Parameter"
           help="Select the pre-trained MS2DeepScore model Parameter" />
</xml>

<!-- Inputs of the training tool: the spectra file, the model settings JSON and the split factor
     that is passed unchanged as the third argument of train_ms2deepscore_wrapper. -->
<xml name="training_param">
    <param name="spectra" type="data" format="msp,mgf" label="Spectra File"
           help="Spectra file that should be used for training. (it will be split in train, val and test)" />
    <param name="model_param" type="data" format="json" label="Model Settings"
           help="json file with the MS2Deepscore model settings." />
    <!-- NOTE(review): ms2deepscore appears to use this value as a division factor k (set size = number of
         InChIKeys // k), so 0 would divide by zero; the minimum is therefore 1. Confirm against the
         ms2deepscore version in use, and consider a higher minimum so the training set cannot end up empty. -->
    <param name="validation_split_fraction" type="integer" min="1" max="100" value="20" label="Validation split fraction"
           help="The fraction of the inchikeys that will be used for validation and test, given as a division factor k: 1/k of the inchikeys is used for validation and 1/k for test (e.g. 20 means 1/20th each)." />
</xml>

<!-- Form of the configuration generator. Section and parameter names are referenced by the
     generator script as section.parameter, so they must stay in sync with it. -->
<xml name="config_generator">
    <section name="model_structure" title="Model Structure" expanded="true">
        <param name="base_dims" type="text" value="2000,2000,2000" label="Base Dimensions"
               help="The in between layers to be used." />
        <param name="embedding_dim" type="integer" value="400" label="Embedding Dimension"
               help="The dimension of the final embedding." />
        <param name="ionisation_mode" type="select" label="Ionisation Mode">
            <option value="positive" selected="true">Positive</option>
            <option value="negative">Negative</option>
            <option value="both">Both</option>
        </param>
    </section>

    <section name="training_settings" title="Training Settings" expanded="true">
        <param name="dropout_rate" type="float" value="0.0" label="Dropout Rate" />
        <param name="learning_rate" type="float" value="0.00025" label="Learning Rate" />
        <param name="epochs" type="integer" value="250" label="Epochs" />
        <param name="patience" type="integer" value="20" label="Patience"
               help="How long the model should keep training if validation does not improve" />
        <param name="loss_function" type="select" label="Loss Function">
            <option value="mse" selected="true">Mean Squared Error (mse)</option>
            <option value="mae">Mean Absolute Error (mae)</option>
            <option value="rmse">Root Mean Squared Error (rmse)</option>
            <option value="risk_mae">Risk Aware MAE (risk_aware_mae)</option>
            <option value="risk_mse">Risk Aware MSE (risk_aware_mse)</option>
        </param>
        <param name="weighting_factor" type="integer" value="0" label="Weighting Factor" />
    </section>

    <section name="tensorization_settings" title="Tensorization Settings" expanded="true">
        <param name="min_mz" type="integer" value="10" label="Min m/z" />
        <param name="max_mz" type="integer" value="1000" label="Max m/z" />
        <param name="mz_bin_width" type="float" value="0.1" label="m/z Bin Width" />
        <param name="intensity_scaling" type="float" value="0.5" label="Intensity Scaling" />
    </section>

    <section name="data_generator_settings" title="Data generator settings" expanded="true">
        <param name="batch_size" type="integer" value="32" label="Batch Size"
               help="Number of pairs per batch" />
    </section>

    <section name="compound_pairs_selection_settings" title="Compound pairs selection settings" expanded="true">
        <param name="average_pairs_per_bin" type="integer" value="20" label="Average pairs per bin"
               help="The aimed average number of pairs of spectra per spectrum in each bin." />
        <!-- Free text so that the literal None can be entered to leave the seed unset. -->
        <param name="random_seed" type="text" value="None" label="Random seed"
               help="Specify random seed for reproducible random number generation." />
    </section>

    <section name="tanimoto_score_settings" title="Tanimoto Score Settings" expanded="true">
        <param name="fingerprint_type" type="text" value="daylight" label="Fingerprint Type"
               help="The fingerprint type that should be used for tanimoto score calculations." />
        <param name="fingerprint_nbits" type="integer" value="2048" label="Fingerprint Number of Bits"
               help="The number of bits to use for the fingerprint." />
    </section>
</xml>

<!-- Publications to cite for ms2deepscore. A citation of type doi holds the bare DOI,
     not a resolver URL, because the DOI is resolved by the consumer. -->
<xml name="citations">
    <citations>
        <citation type="doi">10.1186/s13321-021-00558-4</citation>
        <citation type="doi">10.1101/2024.03.25.586580</citation>
    </citations>
</xml>

<!-- Script fragment that defines the variable scores: loaded from the scores JSON when use_scores
     is True, otherwise built from the reference and query MSP files. The inputs live inside the
     conditional named scores (see the input_param macro), so they are addressed through it. -->
<token name="@init_scores@">
from matchms.importing import load_from_msp, scores_from_json
from matchms import Scores
#if $scores.use_scores == "True"
scores = scores_from_json("$scores.scores_in")
#else
scores = Scores(references=list(load_from_msp("$scores.references")), queries=list(load_from_msp("$scores.queries")), is_symmetric=False)
#end if
</token>

<!-- Script fragment that sets the matchms logger level to WARNING. -->
<token name="@init_logger@">
from matchms import set_matchms_logger_level
set_matchms_logger_level("WARNING")
</token>

<!-- Script fragment that reads the model settings JSON into the dict model_params and converts
     the values of three keys, when present, to a tuple, a numpy array and a list of 2-tuples. -->
<token name="@json_load@">
import json
import numpy as np

with open("$model_param", 'r') as json_file:
    model_params = json.load(json_file)

# JSON stores every sequence as a list; convert the keys below, when present.
for _key, _convert in (
    ('base_dims', tuple),
    ('same_prob_bins', np.array),
    ('additional_metadata', lambda entries: [(entry[0], entry[1]) for entry in entries]),
):
    if _key in model_params:
        model_params[_key] = _convert(model_params[_key])
</token>
</macros>
76 changes: 76 additions & 0 deletions tools/ms2deepscore/ms2deepscore_config_generator.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
<tool id="ms2deepscore_config_generator" name="MS2DeepScore Configuration Generator" version="@TOOL_VERSION@+galaxy0">
    <description>Generates model parameters for MS2DeepScore in JSON format</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="edam" />

    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement>
    </requirements>

    <!-- The script path is single-quoted so the shell always receives it as one argument. -->
    <command detect_errors="exit_code"><![CDATA[
        python3 '${python_wrapper}'
    ]]></command>
    <!-- Script rendered from the form values: collects them in a dict, builds a
         SettingsMS2Deepscore object and saves it to the output dataset. -->
    <configfiles>
        <configfile name="python_wrapper">
import numpy as np
from typing import Optional
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore

# The seed is a free-text field. It is read from a string literal and converted here,
# so it is never pasted into this script as Python source. Blank or None means no seed;
# anything that is not an integer stops the job with a ValueError.
raw_random_seed = "$compound_pairs_selection_settings.random_seed".strip()
random_seed: Optional[int] = None if raw_random_seed in ("", "None") else int(raw_random_seed)

params = {
    "base_dims": tuple(int(dim.strip()) for dim in "$model_structure.base_dims".split(",")),
    "embedding_dim": $model_structure.embedding_dim,
    "ionisation_mode": "$model_structure.ionisation_mode",
    "dropout_rate": $training_settings.dropout_rate,
    "learning_rate": $training_settings.learning_rate,
    "epochs": $training_settings.epochs,
    "patience": $training_settings.patience,
    "loss_function": "$training_settings.loss_function",
    "weighting_factor": $training_settings.weighting_factor,
    "min_mz": $tensorization_settings.min_mz,
    "max_mz": $tensorization_settings.max_mz,
    "mz_bin_width": $tensorization_settings.mz_bin_width,
    "intensity_scaling": $tensorization_settings.intensity_scaling,
    "batch_size": $data_generator_settings.batch_size,
    "average_pairs_per_bin": $compound_pairs_selection_settings.average_pairs_per_bin,
    "same_prob_bins": np.array([(0, 0.2), (0.2, 1.0)]),
    "random_seed": random_seed,
    "fingerprint_type": "$tanimoto_score_settings.fingerprint_type",
    "fingerprint_nbits": $tanimoto_score_settings.fingerprint_nbits
}

settings = SettingsMS2Deepscore(**params)
settings.save_to_file("$output_file")
        </configfile>
    </configfiles>

    <inputs>
        <expand macro="config_generator" />
    </inputs>

    <outputs>
        <data name="output_file" format="json" label="Model Parameter JSON" />
    </outputs>

    <tests>
        <test expect_num_outputs="1">
            <param name="base_dims" value="200,200" />
            <param name="embedding_dim" value="100" />
            <param name="ionisation_mode" value="negative" />
            <param name="epochs" value="2" />
            <param name="batch_size" value="2" />
            <param name="average_pairs_per_bin" value="2" />
            <param name="random_seed" value="42"/>
            <output name="output_file" value="Model_Parameter_JSON.json" ftype="json" compare="diff" lines_diff="2"/>
        </test>
    </tests>

    <help>
        This tool generates model parameters for MS2DeepScore in JSON format based on the provided settings.
    </help>
    <expand macro="citations"/>
</tool>
74 changes: 74 additions & 0 deletions tools/ms2deepscore/ms2deepscore_similarity.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
<tool id="ms2deepscore_similarity" name="MS2DeepScore Similarity" version="@TOOL_VERSION@+galaxy0">
    <description>Compute similarity scores using a pre-trained MS2DeepScore model</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="edam" />

    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement>
        <requirement type="package" version="1.16.2">onnx</requirement>
    </requirements>

    <!-- The script path is single-quoted so the shell always receives it as one argument. -->
    <command detect_errors="exit_code"><![CDATA[
        python3 '${python_wrapper}'
    ]]></command>
    <!-- Script: rebuilds the Siamese model from the ONNX initializers and the settings JSON,
         computes a score layer for the scores object and writes the result as JSON. -->
    <configfiles>
        <configfile name="python_wrapper">
@init_logger@

import onnx
import torch
from onnx import numpy_helper
from ms2deepscore import MS2DeepScore
from ms2deepscore.models.SiameseSpectralModel import SiameseSpectralModel
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore

onnx_model = onnx.load("$model")

# Extract the initializers (weights and biases)
initializers = {init.name: numpy_helper.to_array(init) for init in onnx_model.graph.initializer}

# Convert NumPy arrays to PyTorch tensors
state_dict = {name: torch.tensor(np_array) for name, np_array in initializers.items()}

@json_load@

model = SiameseSpectralModel(settings=SettingsMS2Deepscore(**model_params))
model.load_state_dict(state_dict)
model.eval()

similarity = MS2DeepScore(model)
name="MS2DeepScore_similarity_scores"

@init_scores@

# NOTE(review): the row and col index arrays come from the scores object itself. For a Scores
# object freshly built from spectra they may be empty, which would leave no pairs to score.
# Confirm that the FALSE branch of the input conditional produces results.
layer = similarity.sparse_array(
    references=scores.references,
    queries=scores.queries,
    idx_row = scores._scores.row,
    idx_col = scores._scores.col,
    is_symmetric=False)

scores._scores.add_sparse_data(scores._scores.row, scores._scores.col, layer, name)

scores.filter_by_range(name=name, low=0)
scores.to_json("$similarity_scores")
        </configfile>
    </configfiles>

    <inputs>
        <expand macro="input_param" />
    </inputs>

    <outputs>
        <data label="ms2deepscore scores of ${on_string}" name="similarity_scores" format="json"/>
    </outputs>

    <help>
        ms2deepscore provides a Siamese neural network that is trained to predict molecular structural
        similarities (Tanimoto scores) from pairs of mass spectrometry spectra.
    </help>

    <expand macro="citations"/>
</tool>
84 changes: 84 additions & 0 deletions tools/ms2deepscore/ms2deepscore_training.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
<tool id="ms2deepscore_training" name="MS2DeepScore training" version="@TOOL_VERSION@+galaxy0">
    <description>Train a MS2DeepScore model from mass spectra and export it to ONNX</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="edam" />

    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement>
        <requirement type="package" version="1.16.2">onnx</requirement>
    </requirements>

    <!-- The spectra are copied to processing/input.EXT so the file handed to the training wrapper
         carries its format as extension. Steps are chained with && so a failed mkdir or cp stops
         the job, and every path is single-quoted so it reaches the shell as one argument. -->
    <command detect_errors="exit_code"><![CDATA[
        mkdir processing &&
        cp '$spectra' 'processing/input.${spectra.ext}' &&
        python3 '${python_wrapper}'
    ]]></command>
    <!-- Script: trains a model with train_ms2deepscore_wrapper, reloads the stored model
         and exports it to the ONNX output dataset using random dummy inputs. -->
    <configfiles>
        <configfile name="python_wrapper">
import onnx
import os
import torch
from ms2deepscore.models import load_model
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore
from ms2deepscore.wrapper_functions.training_wrapper_functions import train_ms2deepscore_wrapper, StoreTrainingData

@json_load@

settings = SettingsMS2Deepscore(**model_params)
# Same path as the copy made in the command section.
file = "processing/input.$spectra.ext"
directory = train_ms2deepscore_wrapper(file, settings, $validation_split_fraction)

# Locate the stored model: trained models folder / run directory / model file name.
expected_file_names = StoreTrainingData(file)
pt_model_path = os.path.join(expected_file_names.trained_models_folder, directory, settings.model_file_name)

model = load_model(pt_model_path)
model.eval()

batch_size = 1
number_of_bins = settings.number_of_bins()
additional_inputs = len(settings.additional_metadata)

# Create dummy inputs
spectra_tensors_1 = torch.randn(batch_size, number_of_bins)
spectra_tensors_2 = torch.randn(batch_size, number_of_bins)
metadata_1 = torch.randn(batch_size, additional_inputs)
metadata_2 = torch.randn(batch_size, additional_inputs)

# Export the model to ONNX
torch.onnx.export(
    model,
    (spectra_tensors_1, spectra_tensors_2, metadata_1, metadata_2),
    "$onnx_trained_model",
    verbose=True
)

        </configfile>
    </configfiles>

    <inputs>
        <expand macro="training_param" />
    </inputs>

    <outputs>
        <data label="Trained model" name="onnx_trained_model" format="onnx"/>
    </outputs>

    <tests>
        <test expect_num_outputs="1">
            <param name="spectra" value="clean_spectra.mgf" ftype="mgf"/>
            <param name="model_param" value="Model_Parameter_JSON.json" ftype="json" />
            <param name="validation_split_fraction" value="5"/>
            <output name="onnx_trained_model" value="Trained_model.onnx" ftype="onnx" compare="sim_size"/>
        </test>
    </tests>

    <help>
        ms2deepscore provides a Siamese neural network that is trained to predict molecular structural
        similarities (Tanimoto scores) from pairs of mass spectrometry spectra.
    </help>

    <expand macro="citations"/>
</tool>
Loading

0 comments on commit bc5b20a

Please sign in to comment.