Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ms2deepscore tool suite #568

Merged
merged 21 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions tools/ms2deepscore/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Tool Shed metadata for the ms2deepscore tools.
name: ms2deepscore
owner: recetox
remote_repository_url: "https://github.com/RECETOX/galaxytools/tree/master/tools/ms2deepscore"
homepage_url: "https://github.com/matchms/ms2deepscore"
categories:
  - Metabolomics
description: "Mass spectra similarity scoring using a trained ms2deepscore model."
long_description: "ms2deepscore provides a Siamese neural network that is trained to predict molecular structural similarities (Tanimoto scores) from pairs of mass spectrometry spectra."
# Templates for the per-tool repositories; they must be nested under this key,
# otherwise they are read as unrelated top-level keys.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "{{ tool_name }} tool from the ms2deepscore package."
# Suite definition; its name/description are nested so they do not clash with
# the top-level name/description above.
suite:
  name: suite_ms2deepscore
  description: tools from the ms2deepscore suite are used for training a siamese model, and computing the similarities between pairs of spectra.
  type: repository_suite_definition

150 changes: 150 additions & 0 deletions tools/ms2deepscore/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
<macros>
<!-- Version of the ms2deepscore package; used for the package requirement and as the prefix of each tool's version attribute. -->
<token name="@TOOL_VERSION@">2.0.0</token>
<!-- Version used for the onnx package requirement. -->
<token name="@ONNX_VERSION@">1.16.2</token>

<xml name="creator">
    <!-- Creator metadata (one person, one organization) pulled into the tools via the "creator" macro. -->
    <creator>
        <person familyName="Ahmad" givenName="Zargham"
                identifier="0000-0002-6096-224X"
                url="https://github.com/zargham-ahmad" />
        <!-- NOTE(review): the email value below looks like an obfuscation placeholder; confirm the real address. -->
        <organization name="RECETOX MUNI"
                      email="[email protected]"
                      url="https://www.recetox.muni.cz/" />
    </creator>
</xml>

<xml name="edam">
    <!-- Despite the macro name, this only declares the bio.tools cross-reference. -->
    <xrefs><xref type="bio.tools">ms2deepscore</xref></xrefs>
</xml>

<xml name="input_param">
    <!-- Scoring inputs: either a previously computed Scores object or a pair of
         query/reference spectra files, plus the trained model and its settings. -->
    <conditional name="scores">
        <param name="use_scores" label="Use Scores Object" type="select">
            <option value="False" selected="true">FALSE</option>
            <option value="True">TRUE</option>
        </param>
        <when value="True">
            <param label="Scores object" name="scores_in" type="data" format="json"
                   help="Scores objects calculated previously using one of the matchms similarity tools." />
        </when>
        <when value="False">
            <param label="Queries spectra" name="queries" type="data" format="msp"
                   help="Query mass spectra to match against references."/>
            <param label="Reference spectra" name="references" type="data" format="msp"
                   help="Reference mass spectra to match against as library."/>
        </when>
    </conditional>
    <param name="model" type="data" format="onnx" label="Model"
           help="Select the trained MS2DeepScore model file in the ONNX format as created by the 'MS2DeepScore Training' tool."/>
    <!-- The tool name in the help matches the name attribute of the configuration generator tool. -->
    <param name="model_param" type="data" format="json" label="Configuration"
           help="Select the MS2DeepScore model configurations in JSON format. Can be created using the 'MS2DeepScore Configuration Generator' tool."/>
</xml>

<xml name="training_param">
    <!-- Training inputs: the spectra file, the model settings JSON and the validation split size. -->
    <param name="spectra" type="data" format="msp,mgf"
           label="Training Dataset"
           help="Spectra file that should be used for training. (it will be split in training, validation and test sets)."/>
    <param name="model_param" type="data" format="json"
           label="Model Settings"
           help="json file with the MS2Deepscore model settings."/>
    <param name="validation_split_fraction" type="integer" value="20" min="0" max="100"
           label="Validation split fraction [%]"
           help="The fraction of the inchikeys that will be used for validation and test"/>
</xml>

<xml name="config_generator">
    <!-- Parameters of the MS2DeepScore settings file, grouped into model structure,
         tensorization and training sections. -->
    <section name="model_structure" title="Model Structure" expanded="true">
        <repeat name="layers" title="Layer" min="1" default="1">
            <param name="dims" type="integer" label="Dimensions" min="0" value="2000" help="Size of the in-between layer to add." />
        </repeat>
        <param name="embedding_dim" type="integer" label="Embedding Dimension" value="400" help="The dimension of the final embedding layer." />
        <param name="ionisation_mode" type="select" label="Ionisation Mode">
            <option value="positive" selected="true">Positive</option>
            <option value="negative">Negative</option>
            <option value="both">Both</option>
        </param>
    </section>

    <section name="tensorization_settings" title="Tensorization Settings" expanded="true">
        <param name="min_mz" type="integer" label="Min m/z" value="10" />
        <param name="max_mz" type="integer" label="Max m/z" value="1000" />
        <param name="mz_bin_width" type="float" label="m/z Bin Width" value="0.1" />
        <param name="intensity_scaling" type="float" label="Intensity Scaling" value="0.5" />
        <param name="fingerprint_type" type="text" value="daylight" label="Fingerprint Type" help="The fingerprint type that should be used for tanimoto score calculations." />
        <param name="fingerprint_nbits" type="integer" label="Fingerprint Number of Bits" value="2048" help="The number of bits to use for the fingerprint." />
    </section>

    <section name="training_settings" title="Training Settings" expanded="false">
        <param name="dropout_rate" type="float" label="Dropout Rate" value="0.0" />
        <param name="learning_rate" type="float" label="Learning Rate" value="0.00025" />
        <param name="epochs" type="integer" label="Epochs" value="250" />
        <param name="patience" type="integer" label="Patience" value="20" help="How long the model should keep training if validation does not improve" />
        <param name="loss_function" type="select" label="Loss Function">
            <option value="mse" selected="true">Mean Squared Error (mse)</option>
            <option value="mae">Mean Absolute Error (mae)</option>
            <option value="rmse">Root Mean Squared Error (rmse)</option>
            <option value="risk_mae">Risk Aware MAE (risk_aware_mae)</option>
            <option value="risk_mse">Risk Aware MSE (risk_aware_mse)</option>
        </param>
        <param name="weighting_factor" type="integer" label="Weighting Factor" value="0" />
        <param name="batch_size" type="integer" value="32" label="Batch Size" help="Number of pairs per batch" />
        <param name="average_pairs_per_bin" type="integer" value="20" label="Average pairs per bin" help="The aimed average number of pairs of spectra per spectrum in each bin." />
        <!-- The seed stays a text parameter so that the literal None remains a valid value;
             the validator restricts it to None or an integer because the value ends up
             inside a generated Python script. -->
        <param name="random_seed" type="text" label="Random seed" value="None" help="Specify random seed for reproducible random number generation. Use 'None' or an integer.">
            <validator type="regex" message="Random seed must be 'None' or an integer.">^(None|-?[0-9]+)$</validator>
        </param>
    </section>
</xml>

<xml name="citations">
    <citations>
        <!-- A citation of type "doi" takes the bare DOI, not the resolver URL. -->
        <citation type="doi">10.1186/s13321-021-00558-4</citation>
        <citation type="doi">10.1101/2024.03.25.586580</citation>
    </citations>
</xml>


<!-- Shared "About" text that the tools place at the end of their help section. -->
<token name="@HELP@">
ms2deepscore provides a Siamese neural network that is trained to predict molecular structural similarities (Tanimoto scores) from pairs of mass spectrometry spectra.
The library provides intuitive classes to prepare data, train a siamese model, and compute similarities between pairs of spectra.
In addition to the prediction of a structural similarity, MS2DeepScore can also make use of Monte-Carlo dropout to assess the model uncertainty.
MS2DeepScore is able to identify highly-reliable structural matches and to predict Tanimoto scores for pairs of molecules based on their fragment spectra with a root mean squared error of about 0.15.
Furthermore, the prediction uncertainty estimate can be used to select a subset of predictions with a root mean squared error of about 0.1.
MS2DeepScore can also be used to create chemically meaningful mass spectral embeddings that could be used to cluster large numbers of spectra.
</token>


<!-- Python/Cheetah snippet that builds a matchms Scores object, either from a stored
     Scores JSON or from the reference and query spectra files. All inputs live inside
     the "scores" conditional and are therefore referenced fully qualified. -->
<token name="@init_scores@">
from matchms.importing import load_from_msp, scores_from_json
from matchms import Scores
#if $scores.use_scores == "True"
scores = scores_from_json("${scores.scores_in}")
#else
scores = Scores(references=list(load_from_msp("${scores.references}")), queries=list(load_from_msp("${scores.queries}")), is_symmetric=False)
#end if
</token>

<!-- Python snippet that sets the matchms logger level to WARNING. -->
<token name="@init_logger@">
from matchms import set_matchms_logger_level
set_matchms_logger_level("WARNING")
</token>

<!-- Python snippet that reads the model settings JSON into model_params and converts
     selected entries from plain lists back to a tuple, a NumPy array and a list of pairs. -->
<token name="@json_load@">
import numpy as np
import json

with open("$model_param", 'r') as json_file:
    model_params = json.load(json_file)

# Convert each of these keys only when it is present in the loaded settings
for settings_key, restore in (
    ('base_dims', tuple),
    ('same_prob_bins', np.array),
    ('additional_metadata', lambda entries: [(entry[0], entry[1]) for entry in entries]),
):
    if settings_key in model_params:
        model_params[settings_key] = restore(model_params[settings_key])
</token>
</macros>
89 changes: 89 additions & 0 deletions tools/ms2deepscore/ms2deepscore_config_generator.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
<tool id="ms2deepscore_config_generator" name="MS2DeepScore Configuration Generator" version="@TOOL_VERSION@+galaxy0">
    <description>Generates model parameters for MS2DeepScore in JSON format</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="edam" />

    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement>
    </requirements>

    <!-- The script path is quoted so that a working directory containing spaces does not split the argument. -->
    <command detect_errors="exit_code"><![CDATA[
        python3 '${python_wrapper}'
    ]]></command>
    <configfiles>
        <!-- Cheetah-rendered Python script: collects the form values into a
             SettingsMS2Deepscore object and saves it to the output file. -->
        <configfile name="python_wrapper">
import numpy as np
from typing import Optional
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore

# The seed is a free-text input. It is turned into None or parsed with int() from a
# quoted string, so the raw text is never pasted into this script as Python code.
#set $seed_text = str($training_settings.random_seed).strip()
#if $seed_text in ("", "None")
random_seed: Optional[int] = None
#else
random_seed: Optional[int] = int("$seed_text")
#end if

#set $dims = tuple([int($layer.dims) for $layer in $model_structure.layers])

params = {
    "base_dims": $dims,
    "embedding_dim": $model_structure.embedding_dim,
    "ionisation_mode": "$model_structure.ionisation_mode",
    "dropout_rate": $training_settings.dropout_rate,
    "learning_rate": $training_settings.learning_rate,
    "epochs": $training_settings.epochs,
    "patience": $training_settings.patience,
    "loss_function": "$training_settings.loss_function",
    "weighting_factor": $training_settings.weighting_factor,
    "min_mz": $tensorization_settings.min_mz,
    "max_mz": $tensorization_settings.max_mz,
    "mz_bin_width": $tensorization_settings.mz_bin_width,
    "intensity_scaling": $tensorization_settings.intensity_scaling,
    "batch_size": $training_settings.batch_size,
    "average_pairs_per_bin": $training_settings.average_pairs_per_bin,
    "same_prob_bins": np.array([(0, 0.2), (0.2, 1.0)]),
    "random_seed": random_seed,
    "fingerprint_type": "$tensorization_settings.fingerprint_type",
    "fingerprint_nbits": $tensorization_settings.fingerprint_nbits
}

settings = SettingsMS2Deepscore(**params)
settings.save_to_file("$output_file")
        </configfile>
    </configfiles>

    <inputs>
        <expand macro="config_generator" />
    </inputs>

    <outputs>
        <data name="output_file" format="json" label="Model Parameter JSON" />
    </outputs>

    <tests>
        <test expect_num_outputs="1">
            <param name="layers_0|dims" value="20"/>
            <param name="layers_1|dims" value="20"/>
            <param name="embedding_dim" value="15" />
            <param name="ionisation_mode" value="negative" />
            <param name="epochs" value="2" />
            <param name="batch_size" value="2" />
            <param name="average_pairs_per_bin" value="2" />
            <param name="random_seed" value="42"/>
            <output name="output_file" value="Model_Parameter_JSON.json" ftype="json" compare="diff" lines_diff="2"/>
        </test>
    </tests>

    <help>
<![CDATA[
Info
====
This tool generates a configuration file needed to train a MS2DeepScore model using the 'MS2DeepScore Training' tool.
The generated JSON file contains all the parameters necessary for model training - this includes the model architecture as well as information regarding how to discretize the input tensors.
If you trained a model offline, you can also upload the JSON file configuration of that trained model into Galaxy and use it for the 'MS2DeepScore' similarity or training module.

About
=====
@HELP@
]]>
    </help>
    <expand macro="citations"/>
</tool>
104 changes: 104 additions & 0 deletions tools/ms2deepscore/ms2deepscore_similarity.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
<tool id="ms2deepscore_similarity" name="MS2DeepScore Similarity" version="@TOOL_VERSION@+galaxy0">
    <description>Compute similarity scores using a pre-trained MS2DeepScore model</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="creator"/>
    <expand macro="edam" />

    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ms2deepscore</requirement>
        <requirement type="package" version="@ONNX_VERSION@">onnx</requirement>
    </requirements>

    <!-- The script path is quoted so that a working directory containing spaces does not split the argument. -->
    <command detect_errors="exit_code"><![CDATA[
        python3 '${python_wrapper}'
    ]]></command>
    <configfiles>
        <!-- Cheetah-rendered Python script: rebuilds the model from the ONNX initializers
             and the settings JSON, scores references against queries and writes the scores
             as JSON. Inputs inside the "scores" conditional are referenced fully qualified. -->
        <configfile name="python_wrapper">
@init_logger@

import onnx
import torch
from onnx import numpy_helper
from ms2deepscore import MS2DeepScore
from matchms import calculate_scores
from matchms.importing import load_from_msp, scores_from_json
from ms2deepscore.models.SiameseSpectralModel import SiameseSpectralModel
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore

onnx_model = onnx.load("$model")

# Extract the initializers (weights and biases)
initializers = {init.name: numpy_helper.to_array(init) for init in onnx_model.graph.initializer}

# Convert NumPy arrays to PyTorch tensors
state_dict = {name: torch.tensor(np_array) for name, np_array in initializers.items()}

@json_load@

model = SiameseSpectralModel(settings=SettingsMS2Deepscore(**model_params))
model.load_state_dict(state_dict)
model.eval()

#if $scores.use_scores == "True"
scores_in = scores_from_json("${scores.scores_in}")
references=scores_in.references
queries=scores_in.queries
#else
references=list(load_from_msp("${scores.references}"))
queries=list(load_from_msp("${scores.queries}"))
#end if

similarity = MS2DeepScore(model)
scores = calculate_scores(references, queries, similarity)

#if $scores.use_scores == "True"
# Add the new scores to the loaded Scores object so that its existing scores are kept
scores_in._scores.add_dense_matrix(scores.to_array(), "MS2DeepScore")
scores_in.to_json("$similarity_scores")
#else
scores.to_json("$similarity_scores")
#end if
        </configfile>
    </configfiles>

    <inputs>
        <expand macro="input_param" />
    </inputs>

    <outputs>
        <data label="ms2deepscore scores of ${on_string}" name="similarity_scores" format="json"/>
    </outputs>

    <tests>
        <test expect_num_outputs="1">
            <param name="use_scores" value="False"/>
            <param name="references" value="inp_filtered_library.msp" ftype="msp"/>
            <param name="queries" value="inp_filtered_spectra.msp" ftype="msp"/>
            <param name="model" value="Trained_model.onnx" ftype="onnx"/>
            <param name="model_param" value="Model_Parameter_JSON.json" ftype="json"/>
            <output name="similarity_scores" value="msp_json_score_out.json" ftype="json" compare="sim_size" />
        </test>
        <test expect_num_outputs="1">
            <param name="use_scores" value="True"/>
            <param name="scores_in" value="ri_match_60.json" ftype="json"/>
            <param name="model" value="Trained_model.onnx" ftype="onnx"/>
            <param name="model_param" value="Model_Parameter_JSON.json" ftype="json"/>
            <output name="similarity_scores" value="usescore_json_score_out.json" ftype="json" compare="sim_size" />
        </test>
    </tests>

    <help>
<![CDATA[
Info
====
Use a MS2DeepScore model for spectral similarity calculation.
For security reasons, only ONNX models are supported to be used.
If you want to use your own model, you can convert it to ONNX using the code contained in the 'MS2DeepScore Training' tool.

About
=====
@HELP@
]]>
    </help>
    <expand macro="citations"/>
</tool>
Loading