-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #568 from zargham-ahmad/ms2deepscore
Added ms2deepscore tool suite
- Loading branch information
Showing
13 changed files
with
12,261 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Tool Shed metadata for the ms2deepscore Galaxy tool suite. | ||
name: ms2deepscore | ||
owner: recetox | ||
remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/ms2deepscore" | ||
homepage_url: "https://github.com/matchms/ms2deepscore" | ||
categories: | ||
- Metabolomics | ||
description: "Mass spectra similarity scoring using a trained ms2deepscore model." | ||
long_description: "ms2deepscore provides a Siamese neural network that is trained to predict molecular structural similarities (Tanimoto scores) from pairs of mass spectrometry spectra." | ||
# Templates filled in per tool: the repository name is the tool id, the description embeds the tool name. | ||
auto_tool_repositories: | ||
name_template: "{{ tool_id }}" | ||
description_template: "{{ tool_name }} tool from the ms2deepscore package." | ||
# Definition of the suite repository that groups the individual tools. | ||
suite: | ||
name: suite_ms2deepscore | ||
description: tools from the ms2deepscore suite are used for training a siamese model, and computing the similarities between pairs of spectra. | ||
type: repository_suite_definition | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
<macros> | ||
<!-- Version of the ms2deepscore package; used for the tools' version attribute and the package requirement. -->
<token name="@TOOL_VERSION@">2.0.0</token> | ||
<!-- Version of the onnx package required by the similarity tool. -->
<token name="@ONNX_VERSION@">1.16.2</token> | ||
|
||
<!-- Creator metadata shared by all ms2deepscore tools: the wrapper author and the hosting organization. --> | ||
<xml name="creator"> | ||
    <creator> | ||
        <person familyName="Ahmad" givenName="Zargham" | ||
                identifier="0000-0002-6096-224X" | ||
                url="https://github.com/zargham-ahmad" /> | ||
        <organization name="RECETOX MUNI" | ||
                      email="[email protected]" | ||
                      url="https://www.recetox.muni.cz/" /> | ||
    </creator> | ||
</xml> | ||
|
||
<!-- Cross-reference to the bio.tools entry for ms2deepscore. -->
<!-- NOTE(review): the macro is named "edam" but only contains an xref, no EDAM terms; confirm the name is intentional. -->
<xml name="edam"> | ||
<xrefs> | ||
<xref type="bio.tools">ms2deepscore</xref> | ||
</xrefs> | ||
</xml> | ||
|
||
<!-- Inputs of the similarity tool: either a previously computed Scores object or a pair of --> | ||
<!-- query/reference spectra files, plus the ONNX model and its JSON configuration. --> | ||
<xml name="input_param"> | ||
    <conditional name="scores"> | ||
        <param name="use_scores" label="Use Scores Object" type="select"> | ||
            <option value="False" selected="true">FALSE</option> | ||
            <option value="True">TRUE</option> | ||
        </param> | ||
        <when value="True"> | ||
            <param label="Scores object" name="scores_in" type="data" format="json" | ||
                   help="Scores objects calculated previously using one of the matchms similarity tools." /> | ||
        </when> | ||
        <when value="False"> | ||
            <param label="Query spectra" name="queries" type="data" format="msp" | ||
                   help="Query mass spectra to match against references."/> | ||
            <param label="Reference spectra" name="references" type="data" format="msp" | ||
                   help="Reference mass spectra to match against as library."/> | ||
        </when> | ||
    </conditional> | ||
    <param name="model" type="data" format="onnx" label="Model" | ||
           help="Select the trained MS2DeepScore model file in ONNX format, as created by the 'MS2DeepScore Training' tool."/> | ||
    <param name="model_param" type="data" format="json" label="Configuration" | ||
           help="Select the MS2DeepScore model configuration in JSON format. Can be created using the 'MS2DeepScore Config Generator' tool."/> | ||
</xml> | ||
|
||
<!-- Inputs of the training tool: the spectra to train on, the model settings JSON and the --> | ||
<!-- size of the validation/test split, given as an integer percentage. --> | ||
<xml name="training_param"> | ||
    <param label="Training Dataset" name="spectra" type="data" format="msp,mgf" | ||
           help="Spectra file that should be used for training. It will be split into training, validation and test sets."/> | ||
    <param name="model_param" type="data" format="json" label="Model Settings" help="JSON file with the MS2DeepScore model settings."/> | ||
    <param name="validation_split_fraction" type="integer" min="0" max="100" value="20" label="Validation split fraction [%]" | ||
           help="Percentage (0-100) of the inchikeys that will be used for validation and test."/> | ||
</xml> | ||
|
||
<!-- Form of the configuration generator: model structure, tensorization and training settings. --> | ||
<!-- The values are substituted into a generated Python script that builds the settings object. --> | ||
<xml name="config_generator"> | ||
    <section name="model_structure" title="Model Structure" expanded="true"> | ||
        <repeat name="layers" title="Layer" min="1" default="1"> | ||
            <param name="dims" type="integer" label="Dimensions" min="0" value="2000" help="Size of the in-between layer to add." /> | ||
        </repeat> | ||
        <param name="embedding_dim" type="integer" label="Embedding Dimension" value="400" help="The dimension of the final embedding layer." /> | ||
        <param name="ionisation_mode" type="select" label="Ionisation Mode"> | ||
            <option value="positive" selected="true">Positive</option> | ||
            <option value="negative">Negative</option> | ||
            <option value="both">Both</option> | ||
        </param> | ||
    </section> | ||
|
||
    <section name="tensorization_settings" title="Tensorization Settings" expanded="true"> | ||
        <param name="min_mz" type="integer" label="Min m/z" value="10" /> | ||
        <param name="max_mz" type="integer" label="Max m/z" value="1000" /> | ||
        <param name="mz_bin_width" type="float" label="m/z Bin Width" value="0.1" /> | ||
        <param name="intensity_scaling" type="float" label="Intensity Scaling" value="0.5" /> | ||
        <param name="fingerprint_type" type="text" value="daylight" label="Fingerprint Type" help="The fingerprint type that should be used for tanimoto score calculations." /> | ||
        <param name="fingerprint_nbits" type="integer" label="Fingerprint Number of Bits" value="2048" help="The number of bits to use for the fingerprint." /> | ||
    </section> | ||
|
||
    <section name="training_settings" title="Training Settings" expanded="false"> | ||
        <param name="dropout_rate" type="float" label="Dropout Rate" value="0.0" /> | ||
        <param name="learning_rate" type="float" label="Learning Rate" value="0.00025" /> | ||
        <param name="epochs" type="integer" label="Epochs" value="250" /> | ||
        <param name="patience" type="integer" label="Patience" value="20" help="How long the model should keep training if validation does not improve" /> | ||
        <param name="loss_function" type="select" label="Loss Function"> | ||
            <option value="mse" selected="true">Mean Squared Error (mse)</option> | ||
            <option value="mae">Mean Absolute Error (mae)</option> | ||
            <option value="rmse">Root Mean Squared Error (rmse)</option> | ||
            <option value="risk_mae">Risk Aware MAE (risk_aware_mae)</option> | ||
            <option value="risk_mse">Risk Aware MSE (risk_aware_mse)</option> | ||
        </param> | ||
        <param name="weighting_factor" type="integer" label="Weighting Factor" value="0" /> | ||
        <param name="batch_size" type="integer" value="32" label="Batch Size" help="Number of pairs per batch" /> | ||
        <param name="average_pairs_per_bin" type="integer" value="20" label="Average pairs per bin" help="The aimed average number of pairs of spectra per spectrum in each bin." /> | ||
        <param name="random_seed" type="text" label="Random seed" value="None" help="Specify random seed for reproducible random number generation. Enter a non-negative integer or None."> | ||
            <!-- The value ends up in a generated Python script, so only the literal None or plain digits are accepted. --> | ||
            <validator type="regex" message="Random seed must be a non-negative integer or None.">^(None|[0-9]+)$</validator> | ||
        </param> | ||
    </section> | ||
</xml> | ||
|
||
<!-- Publications to cite for MS2DeepScore. DOI citations hold the bare DOI, not a resolver URL. --> | ||
<xml name="citations"> | ||
    <citations> | ||
        <citation type="doi">10.1186/s13321-021-00558-4</citation> | ||
        <citation type="doi">10.1101/2024.03.25.586580</citation> | ||
    </citations> | ||
</xml> | ||
|
||
|
||
<!-- Shared "About" paragraph that the tools insert into their help sections. --> | ||
<token name="@HELP@"> | ||
ms2deepscore provides a Siamese neural network that is trained to predict molecular structural similarities (Tanimoto scores) from pairs of mass spectrometry spectra. | ||
The library provides intuitive classes to prepare data, train a siamese model, and compute similarities between pairs of spectra. | ||
In addition to the prediction of a structural similarity, MS2DeepScore can also make use of Monte-Carlo dropout to assess the model uncertainty. | ||
MS2DeepScore is able to identify highly-reliable structural matches and to predict Tanimoto scores for pairs of molecules based on their fragment spectra with a root mean squared error of about 0.15. | ||
Furthermore, the prediction uncertainty estimate can be used to select a subset of predictions with a root mean squared error of about 0.1. | ||
MS2DeepScore can also be used to create chemically meaningful mass spectral embeddings that could be used to cluster large numbers of spectra. | ||
</token> | ||
|
||
|
||
<!-- Python/Cheetah snippet that builds a matchms Scores object, either from a stored JSON --> | ||
<!-- scores file or from reference and query MSP files. Members of the "scores" conditional --> | ||
<!-- are referenced through their parent so the lookup does not depend on flattened names. --> | ||
<token name="@init_scores@"> | ||
from matchms.importing import load_from_msp, scores_from_json | ||
from matchms import Scores | ||
#if $scores.use_scores == "True" | ||
scores = scores_from_json("${scores.scores_in}") | ||
#else | ||
scores = Scores(references=list(load_from_msp("${scores.references}")), queries=list(load_from_msp("${scores.queries}")), is_symmetric=False) | ||
#end if | ||
</token> | ||
|
||
<!-- Python snippet that sets the matchms logger level to WARNING. -->
<token name="@init_logger@"> | ||
from matchms import set_matchms_logger_level | ||
set_matchms_logger_level("WARNING") | ||
</token> | ||
|
||
<!-- Python snippet that loads the model settings JSON into the dict "model_params" and --> | ||
<!-- converts the values that JSON stores as plain lists back to tuple / numpy array form. --> | ||
<token name="@json_load@"> | ||
import numpy as np | ||
import json | ||
|
||
with open("$model_param", 'r') as json_file: | ||
    model_params = json.load(json_file) | ||
|
||
# Conditionally convert specific keys if they are present | ||
if 'base_dims' in model_params: | ||
    model_params['base_dims'] = tuple(model_params['base_dims']) | ||
|
||
if 'same_prob_bins' in model_params: | ||
    model_params['same_prob_bins'] = np.array(model_params['same_prob_bins']) | ||
|
||
# Each metadata entry is turned into a 2-tuple built from its first two items | ||
if 'additional_metadata' in model_params: | ||
    model_params['additional_metadata'] = [ | ||
        (entry[0], entry[1]) for entry in model_params['additional_metadata'] | ||
    ] | ||
</token> | ||
</macros> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
<tool id="ms2deepscore_config_generator" name="MS2DeepScore Configuration Generator" version="@TOOL_VERSION@+galaxy0"> | ||
<description>Generates model parameters for MS2DeepScore in JSON format</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="creator"/> | ||
<expand macro="edam" /> | ||
|
||
<requirements> | ||
<requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement> | ||
</requirements> | ||
|
||
<!-- Runs the generated Python script below; the script path is quoted for the shell. --> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python3 '${python_wrapper}' | ||
]]></command> | ||
<configfiles> | ||
<configfile name="python_wrapper"> | ||
import numpy as np | ||
from typing import Optional | ||
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore | ||
|
||
# The seed arrives as free text (the word None, or digits). It is parsed as data instead of | ||
# being pasted in as code, so unexpected input raises a ValueError rather than being executed. | ||
seed_text = "$training_settings.random_seed".strip() | ||
random_seed: Optional[int] = None if seed_text in ("", "None") else int(seed_text) | ||
|
||
# One integer per Layer repeat, in form order | ||
#set dims = tuple([int($layer.dims) for $layer in $model_structure.layers]) | ||
|
||
params = { | ||
    "base_dims": $dims, | ||
    "embedding_dim": $model_structure.embedding_dim, | ||
    "ionisation_mode": "$model_structure.ionisation_mode", | ||
    "dropout_rate": $training_settings.dropout_rate, | ||
    "learning_rate": $training_settings.learning_rate, | ||
    "epochs": $training_settings.epochs, | ||
    "patience": $training_settings.patience, | ||
    "loss_function": "$training_settings.loss_function", | ||
    "weighting_factor": $training_settings.weighting_factor, | ||
    "min_mz": $tensorization_settings.min_mz, | ||
    "max_mz": $tensorization_settings.max_mz, | ||
    "mz_bin_width": $tensorization_settings.mz_bin_width, | ||
    "intensity_scaling": $tensorization_settings.intensity_scaling, | ||
    "batch_size": $training_settings.batch_size, | ||
    "average_pairs_per_bin": $training_settings.average_pairs_per_bin, | ||
    "same_prob_bins": np.array([(0, 0.2), (0.2, 1.0)]), | ||
    "random_seed": random_seed, | ||
    "fingerprint_type": "$tensorization_settings.fingerprint_type", | ||
    "fingerprint_nbits": $tensorization_settings.fingerprint_nbits | ||
} | ||
|
||
settings = SettingsMS2Deepscore(**params) | ||
settings.save_to_file("$output_file") | ||
</configfile> | ||
</configfiles> | ||
|
||
<inputs> | ||
<expand macro="config_generator" /> | ||
</inputs> | ||
|
||
<outputs> | ||
<data name="output_file" format="json" label="Model Parameter JSON" /> | ||
</outputs> | ||
|
||
<tests> | ||
<!-- Two layers of size 20 with a small embedding and short training settings; the result is -->
<!-- compared with the stored JSON, allowing up to two differing lines. -->
<!-- NOTE(review): parameters are addressed without their section prefix; confirm this resolves on the targeted Galaxy profile. -->
<test expect_num_outputs="1"> | ||
<param name="layers_0|dims" value="20"/> | ||
<param name="layers_1|dims" value="20"/> | ||
<param name="embedding_dim" value="15" /> | ||
<param name="ionisation_mode" value="negative" /> | ||
<param name="epochs" value="2" /> | ||
<param name="batch_size" value="2" /> | ||
<param name="average_pairs_per_bin" value="2" /> | ||
<param name="random_seed" value="42"/> | ||
<output name="output_file" value="Model_Parameter_JSON.json" ftype="json" compare="diff" lines_diff="2"/> | ||
</test> | ||
</tests> | ||
|
||
<help> | ||
<![CDATA[ | ||
Info | ||
==== | ||
This tool generates a configuration file needed to train a MS2DeepScore model using the 'MS2DeepScore Training' tool. | ||
The generated JSON file contains all the parameters necessary for model training - this includes the model architecture as well as information regarding how to discretize the input tensors. | ||
If you trained a model offline, you can also upload the JSON file configuration of that trained model into Galaxy and use it for the 'MS2DeepScore' similarity or training module. | ||
About | ||
===== | ||
@HELP@ | ||
]]> | ||
</help> | ||
<expand macro="citations"/> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
<tool id="ms2deepscore_similarity" name="MS2DeepScore Similarity" version="@TOOL_VERSION@+galaxy0"> | ||
<description>Compute similarity scores using a pre-trained MS2DeepScore model</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="creator"/> | ||
<expand macro="edam" /> | ||
|
||
<requirements> | ||
<requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement> | ||
<requirement type="package" version="@ONNX_VERSION@">onnx</requirement> | ||
</requirements> | ||
|
||
<!-- Runs the generated Python script below; the script path is quoted for the shell. --> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python3 '${python_wrapper}' | ||
]]></command> | ||
<configfiles> | ||
<configfile name="python_wrapper"> | ||
@init_logger@ | ||
|
||
import onnx | ||
import torch | ||
# Imported explicitly so the script does not rely on the onnx package loading its submodule | ||
from onnx import numpy_helper | ||
from ms2deepscore import MS2DeepScore | ||
from matchms import calculate_scores | ||
from matchms.importing import load_from_msp, scores_from_json | ||
from ms2deepscore.models.SiameseSpectralModel import SiameseSpectralModel | ||
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore | ||
|
||
onnx_model = onnx.load("$model") | ||
|
||
# Extract the initializers (weights and biases) | ||
initializers = {init.name: numpy_helper.to_array(init) for init in onnx_model.graph.initializer} | ||
|
||
# Convert NumPy arrays to PyTorch tensors | ||
state_dict = {name: torch.tensor(np_array) for name, np_array in initializers.items()} | ||
|
||
@json_load@ | ||
|
||
# Rebuild the network from the JSON settings and load the weights taken from the ONNX file | ||
model = SiameseSpectralModel(settings=SettingsMS2Deepscore(**model_params)) | ||
model.load_state_dict(state_dict) | ||
model.eval() | ||
|
||
# Spectra come either from a previously computed Scores object or from two MSP files | ||
#if $scores.use_scores == "True" | ||
scores_in = scores_from_json("${scores.scores_in}") | ||
references=scores_in.references | ||
queries=scores_in.queries | ||
#else | ||
references=list(load_from_msp("${scores.references}")) | ||
queries=list(load_from_msp("${scores.queries}")) | ||
#end if | ||
|
||
similarity = MS2DeepScore(model) | ||
scores = calculate_scores(references, queries, similarity) | ||
|
||
# With an existing Scores object the new matrix is added under the name MS2DeepScore; | ||
# otherwise the freshly computed scores are written out directly | ||
#if $scores.use_scores == "True" | ||
scores_in._scores.add_dense_matrix(scores.to_array(), "MS2DeepScore") | ||
scores_in.to_json("$similarity_scores") | ||
#else | ||
scores.to_json("$similarity_scores") | ||
#end if | ||
</configfile> | ||
</configfiles> | ||
|
||
<inputs> | ||
<expand macro="input_param" /> | ||
</inputs> | ||
|
||
<outputs> | ||
<data label="ms2deepscore scores of ${on_string}" name="similarity_scores" format="json"/> | ||
</outputs> | ||
|
||
<tests> | ||
<!-- Case 1: scores computed directly from reference and query MSP files. -->
<test expect_num_outputs="1"> | ||
<param name="use_scores" value="False"/> | ||
<param name="references" value="inp_filtered_library.msp" ftype="msp"/> | ||
<param name="queries" value="inp_filtered_spectra.msp" ftype="msp"/> | ||
<param name="model" value="Trained_model.onnx" ftype="onnx"/> | ||
<param name="model_param" value="Model_Parameter_JSON.json" ftype="json"/> | ||
<output name="similarity_scores" value="msp_json_score_out.json" ftype="json" compare="sim_size" /> | ||
</test> | ||
<!-- Case 2: scores added to an existing Scores JSON. Both cases compare the output by size only (sim_size). -->
<test expect_num_outputs="1"> | ||
<param name="use_scores" value="True"/> | ||
<param name="scores_in" value="ri_match_60.json" ftype="json"/> | ||
<param name="model" value="Trained_model.onnx" ftype="onnx"/> | ||
<param name="model_param" value="Model_Parameter_JSON.json" ftype="json"/> | ||
<output name="similarity_scores" value="usescore_json_score_out.json" ftype="json" compare="sim_size" /> | ||
</test> | ||
</tests> | ||
|
||
<help> | ||
<![CDATA[ | ||
Info | ||
==== | ||
Use a MS2DeepScore model for spectral similarity calculation. | ||
For security reasons, only models in the ONNX format are supported. | ||
If you want to use your own model, you can convert it to ONNX using the code contained in the 'MS2DeepScore Training' tool. | ||
About | ||
===== | ||
@HELP@ | ||
]]> | ||
</help> | ||
<expand macro="citations"/> | ||
</tool> |
Oops, something went wrong.