From 3b12f6a8e8f4e9180f01ca303393ffadf7a75bca Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Sat, 28 Aug 2021 14:55:31 -0700 Subject: [PATCH 1/4] wip for resolution of variables --- massql/msql_engine.py | 86 ++++++++++++++++++++++++++++++++++--------- massql/msql_parser.py | 4 +- tests/test_parse.py | 8 +++- tests/test_query.py | 21 ++++++++++- 4 files changed, 98 insertions(+), 21 deletions(-) diff --git a/massql/msql_engine.py b/massql/msql_engine.py index 98b36f7..9689cca 100644 --- a/massql/msql_engine.py +++ b/massql/msql_engine.py @@ -168,27 +168,48 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa presearch_parse = copy.deepcopy(parsed_dict) non_variable_conditions = [] for condition in presearch_parse["conditions"]: - for value in condition["value"]: + if "value" in condition: + for value in condition["value"]: + try: + # Checking if X is in any string + if "X" in value: + continue + except TypeError: + # This is when the target is actually a float + pass + non_variable_conditions.append(condition) + elif "min" in condition: + min_val = condition["min"] + max_val = condition["max"] + try: + if "X" in min_val: + continue + except TypeError: + # This is when the target is actually a float + pass + try: - # Checking if X is in any string - if "X" in value: + if "X" in max_val: continue except TypeError: # This is when the target is actually a float pass + non_variable_conditions.append(condition) + presearch_parse["conditions"] = non_variable_conditions ms1_df, ms2_df = _executeconditions_query(presearch_parse, input_filename, cache=cache) variable_x_ms1_df = ms1_df - # TODO: Checking if we can prefilter the X variable, if there are conditions + # Here we are trying to pre-filter conditions based upon the qualifiers to make the variable search space smaller for condition in parsed_dict["conditions"]: if not condition["conditiontype"] == "where": continue - if not "X" in condition["value"]: - continue + if "value" in 
condition: + if not "X" in condition["value"]: + continue # Filtering MS1 peaks only to consider contention for X if condition["type"] == "ms1mzcondition": @@ -198,6 +219,8 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa (ms1_df["i_norm"] > min_intpercent) & (ms1_df["i_tic_norm"] > min_tic_percent_intensity)] + # TODO: Do this for other types of variables + # Here we will start with the smallest mass and then go up masses_considered_df_list = [] if variable_properties["query_ms1"]: @@ -227,27 +250,56 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa mz_val = masses_obj["mz"] for condition in substituted_parse["conditions"]: - for i, value in enumerate(condition["value"]): - # Rewriting the condition value + # This is for standard conditions + if "value" in condition: + for i, value in enumerate(condition["value"]): + # Rewriting the condition value + try: + if "X" in value: + new_value = math_parser.parse(value).evaluate({ + "X" : mz_val + }) + condition["value"][i] = new_value + except TypeError: + # This is when the target is actually a float + pass + + # Rewriting the qualifier values + try: + if "qualifiers" in condition: + for qualifier in condition["qualifiers"]: + if "qualifier" in qualifier: + if "value" in condition["qualifiers"][qualifier]: + old_value = condition["qualifiers"][qualifier]["value"] + condition["qualifiers"][qualifier]["value"] = old_value.replace("X", str(mz_val)) + except AttributeError: + pass + + # TODO: For other types of conditions that might include variables + if "min" in condition: + # Rewriting the condition min + value = condition["min"] try: if "X" in value: new_value = math_parser.parse(value).evaluate({ "X" : mz_val }) - condition["value"][i] = new_value + condition["min"] = new_value except TypeError: # This is when the target is actually a float pass - # Rewriting the qualifier values + if "max" in condition: + # Rewriting the condition max + value = 
condition["max"] try: - if "qualifiers" in condition: - for qualifier in condition["qualifiers"]: - if "qualifier" in qualifier: - if "value" in condition["qualifiers"][qualifier]: - old_value = condition["qualifiers"][qualifier]["value"] - condition["qualifiers"][qualifier]["value"] = old_value.replace("X", str(mz_val)) - except AttributeError: + if "X" in value: + new_value = math_parser.parse(value).evaluate({ + "X" : mz_val + }) + condition["max"] = new_value + except TypeError: + # This is when the target is actually a float pass # Let's consider this mz diff --git a/massql/msql_parser.py b/massql/msql_parser.py index 9561cf8..33a08b4 100644 --- a/massql/msql_parser.py +++ b/massql/msql_parser.py @@ -191,8 +191,8 @@ def condition(self, items): condition_dict["type"] = condition_type if function == "mobilityrange": - condition_dict["min"] = float(items[-2]) - condition_dict["max"] = float(items[-1]) + condition_dict["min"] = items[-2] + condition_dict["max"] = items[-1] return condition_dict diff --git a/tests/test_parse.py b/tests/test_parse.py index 45ae6b8..fc6a683 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -183,6 +183,11 @@ def test_mobility(): assert(parsed_output["conditions"][0]["min"] == 100) +def test_mobility_variables(): + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=X/100, max=2*X/100)" + parsed_output = msql_parser.parse_msql(query) + print(parsed_output) + def main(): #test_xrange_parse() #test_parse() @@ -200,7 +205,8 @@ def main(): #test_ms2_or() #test_ms1_multiple_or() #test_ms1_multiple_or_with_variable() - test_mobility() + #test_mobility() + test_mobility_variables() if __name__ == "__main__": diff --git a/tests/test_query.py b/tests/test_query.py index 7fd020f..d904a0c 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -547,6 +547,24 @@ def test_ms2_mobility(): assert(len(results_df) == 8682) +def test_ms2_mobility_variable(): + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND 
MOBILITY=range(min=X/1000, max=1.5*X/1000)" + results_df = msql_engine.process_query(query, "tests/data/meoh_water_ms2_1_31_1_395.mzML") + + print(results_df) + + assert(len(results_df) == 8682) + +def test_ms2_mobility_variable2(): + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=1, max=2)" + results_df = msql_engine.process_query(query, "tests/data/meoh_water_ms2_1_31_1_395.mzML") + + print(results_df) + + assert(len(results_df) == 8682) + + + def main(): #msql_engine.init_ray() @@ -607,8 +625,9 @@ def main(): #test_topdown() #test_defect() #test_or_against_iron() - test_quad_brominated() + #test_quad_brominated() #test_ms2_mobility() + test_ms2_mobility_variable() if __name__ == "__main__": main() From 963e96c3b61c60c908d2a1901705a3a1723374ac Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Sat, 28 Aug 2021 15:03:55 -0700 Subject: [PATCH 2/4] refining query --- massql/msql_cmd.py | 12 ++++++++---- tests/test_query.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/massql/msql_cmd.py b/massql/msql_cmd.py index c9d34e9..5a0638d 100644 --- a/massql/msql_cmd.py +++ b/massql/msql_cmd.py @@ -1,14 +1,18 @@ #!/usr/bin/env python -from massql import msql_parser -from massql import msql_engine -from massql import msql_extract - import argparse import os +import sys import json import pandas as pd +# Making sure the root is in the path, kind of a hack +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from massql import msql_parser +from massql import msql_engine +from massql import msql_extract + def main(): parser = argparse.ArgumentParser(description="MSQL CMD") parser.add_argument('filename', help='Input filename') diff --git a/tests/test_query.py b/tests/test_query.py index d904a0c..c9bc595 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -548,7 +548,7 @@ def test_ms2_mobility(): assert(len(results_df) == 8682) def test_ms2_mobility_variable(): - query = "QUERY scaninfo(MS2DATA) WHERE 
MS2PREC=X AND MOBILITY=range(min=X/1000, max=1.5*X/1000)" + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=X*0.0011+0.5-0.1, max=X*0.0011+0.5+0.1)" results_df = msql_engine.process_query(query, "tests/data/meoh_water_ms2_1_31_1_395.mzML") print(results_df) From a740c6aa988f51d27c634c918f6d9257a8c623ff Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Sat, 28 Aug 2021 15:39:51 -0700 Subject: [PATCH 3/4] adding additional testing --- massql/msql_engine.py | 19 +++++++++++-------- massql/msql_engine_filters.py | 7 +++++-- tests/test_query.py | 11 ++++++----- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/massql/msql_engine.py b/massql/msql_engine.py index 9689cca..e775124 100644 --- a/massql/msql_engine.py +++ b/massql/msql_engine.py @@ -240,15 +240,23 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa running_max_mz = 0 for masses_obj in tqdm(masses_list): - if running_max_mz > masses_obj["mz"]: + mz_val = masses_obj["mz"] + + if running_max_mz > mz_val: + continue + + # Checking the validity of the mz_val + if mz_val < variable_properties["min"] or mz_val > variable_properties["max"]: + continue + mz_val_defect = mz_val - int(mz_val) + if mz_val_defect < variable_properties["mindefect"] or mz_val_defect > variable_properties["maxdefect"]: continue ####################### # Writing new query ####################### substituted_parse = copy.deepcopy(parsed_dict) - mz_val = masses_obj["mz"] - + for condition in substituted_parse["conditions"]: # This is for standard conditions if "value" in condition: @@ -307,11 +315,6 @@ def _evalute_variable_query(parsed_dict, input_filename, cache=True, parallel=Fa # Checking the x conditions substituted_parse["comment"] = str(mz_val) - if mz_val < variable_properties["min"] or mz_val > variable_properties["max"]: - continue - mz_val_defect = mz_val - int(mz_val) - if mz_val_defect < variable_properties["mindefect"] or mz_val_defect > 
variable_properties["maxdefect"]: - continue all_concrete_queries.append(substituted_parse) else: diff --git a/massql/msql_engine_filters.py b/massql/msql_engine_filters.py index 4974fec..6c101c8 100644 --- a/massql/msql_engine_filters.py +++ b/massql/msql_engine_filters.py @@ -273,8 +273,11 @@ def ms2prec_condition(condition, ms1_df, ms2_df, reference_conditions_register): ] # Filtering the MS1 data now - ms1_scans = set(ms2_df["ms1scan"]) - ms1_filtered_df = ms1_df[ms1_df["scan"].isin(ms1_scans)] + if len(ms1_df) > 0: + ms1_scans = set(ms2_df["ms1scan"]) + ms1_filtered_df = ms1_df[ms1_df["scan"].isin(ms1_scans)] + else: + ms1_filtered_df = ms1_df ms1_list.append(ms1_filtered_df) ms2_list.append(ms2_filtered_df) diff --git a/tests/test_query.py b/tests/test_query.py index c9bc595..11d8d23 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -548,20 +548,20 @@ def test_ms2_mobility(): assert(len(results_df) == 8682) def test_ms2_mobility_variable(): - query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=X*0.0011+0.5-0.1, max=X*0.0011+0.5+0.1)" + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=X*0.0011+0.5-0.1, max=X*0.0011+0.5+0.1) AND X=range(min=854.5, max=854.7)" results_df = msql_engine.process_query(query, "tests/data/meoh_water_ms2_1_31_1_395.mzML") print(results_df) - assert(len(results_df) == 8682) + assert(len(results_df) == 4) def test_ms2_mobility_variable2(): - query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=1, max=2)" + query = "QUERY scaninfo(MS2DATA) WHERE MS2PREC=X AND MOBILITY=range(min=1, max=2) AND X=range(min=400, max=500)" results_df = msql_engine.process_query(query, "tests/data/meoh_water_ms2_1_31_1_395.mzML") print(results_df) - assert(len(results_df) == 8682) + assert(len(results_df) == 1654) @@ -627,7 +627,8 @@ def main(): #test_or_against_iron() #test_quad_brominated() #test_ms2_mobility() - test_ms2_mobility_variable() + #test_ms2_mobility_variable() + 
test_ms2_mobility_variable2() if __name__ == "__main__": main() From 284861537c917bc46c211e0754afcea1d950168e Mon Sep 17 00:00:00 2001 From: Mingxun Wang Date: Sat, 28 Aug 2021 15:41:18 -0700 Subject: [PATCH 4/4] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0480211..1722d8e 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="massql", - version="0.0.7", + version="0.0.8", author="Mingxun Wang", author_email="mwang87@gmail.com", description="Mass spectrometry query language python implementation",