Merge pull request #14 from mpi2/12-solr_request-should-handle-errors…

…-more-gracefully [Solr_request] - 12 solr request should handle errors more gracefully
mpi2 · Oct 14, 2024 · 89df667 · 89df667
2 parents d6f02a0 + 66d26ee
commit 89df667
Show file tree

Hide file tree

Showing 13 changed files with 203 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,7 @@ dist/
 *.pytest*
 *.pytest_cache
 __pycache__
+
+
+# Local notes
+notes.md
diff --git a/impc_api_helper/MANIFEST.in b/impc_api_helper/MANIFEST.in
@@ -0,0 +1 @@
+include impc_api_helper/utils/core_fields.json
diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md
@@ -25,6 +25,35 @@ num_found, df = solr_request( core='genotype-phenotype', params={
 )
 ```
 
+#### Solr request validation
+A common pitfall when writing a query is misspelling the `core` and `fields` arguments. To help catch this, we have included a `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue.
+
+##### Core validation
+```
+num_found, df = solr_request( core='invalid_core', params={
+        'q': '*:*',
+        'rows': 10
+    },
+    validate=True
+)
+
+> InvalidCoreWarning: Invalid core: "invalid_core", select from the available cores:
+> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result']))
+```
+
+##### Field list validation
+```
+num_found, df = solr_request( core='genotype-phenotype', params={
+        'q': '*:*',
+        'rows': 10,
+        'fl': 'invalid_field,marker_symbol,allele_symbol'
+    },
+    validate=True
+)
+> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields.
+> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/
+```
+
 ### Batch request
 For larger requests, use the batch request function to query the API responsibly.
 ```

diff --git a/impc_api_helper/impc_api_helper/__init__.py b/impc_api_helper/impc_api_helper/__init__.py
@@ -1,5 +1,6 @@
 from .solr_request import solr_request, batch_request
 from .iterator_solr_request import iterator_solr_request
+from .utils import validators, warnings
 
 # Control what gets imported by client
 __all__ = ["solr_request", "batch_request", "iterator_solr_request"]
diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py
@@ -1,25 +1,27 @@
 from IPython.display import display
 from tqdm import tqdm
-
-
 import pandas as pd
 import requests
+from impc_api_helper.utils.validators import CoreParamsValidator
 
 # Display the whole dataframe <15
 pd.set_option("display.max_rows", 15)
 pd.set_option("display.max_columns", None)
 
 
 # Create helper function
-def solr_request(core, params, silent=False):
+def solr_request(core, params, silent=False, validate=False):
     """Performs a single Solr request to the IMPC Solr API.
     
     Args:
         core (str): name of IMPC solr core.
         params (dict): dictionary containing the API call parameters.
         silent (bool, optional): default False
             If True, displays: URL of API call, the number of found docs 
-            and a portion of the DataFrame. 
+            and a portion of the DataFrame.
+        validate (bool, optional): default False
+            If True, validates the parameters against the core schema and raises warnings
+            if any parameter seems invalid.
 
 
     Returns:
@@ -61,6 +63,12 @@ def solr_request(core, params, silent=False):
         )
     """
 
+    if validate:
+        CoreParamsValidator(
+            core=core,
+            params=params
+        )
+
     base_url = "https://www.ebi.ac.uk/mi/impc/solr/"
     solr_url = base_url + core + "/select"
 

diff --git a/impc_api_helper/impc_api_helper/utils/__init__.py b/impc_api_helper/impc_api_helper/utils/__init__.py
diff --git a/impc_api_helper/impc_api_helper/utils/core_fields.json b/impc_api_helper/impc_api_helper/utils/core_fields.json
@@ -0,0 +1,15 @@
+{
+  "experiment": [
+    "id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "increment_value", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "age_in_days", 
"age_in_weeks"
+  ],
+  "genotype-phenotype": [
+    "doc_id", "ontology_db_id", "assertion_type", "assertion_type_id", "mpath_term_id", "mpath_term_name", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id", "mp_term_name", "alt_mp_term_id", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "marker_symbol", "marker_accession_id", "colony_id", "allele_name", "allele_symbol", "allele_accession_id", "strain_name", "strain_accession_id", "phenotyping_center", "project_external_id", "project_name", "project_fullname", "resource_name", "resource_fullname", "sex", "zygosity", "pipeline_name", "pipeline_stable_id", "pipeline_stable_key", "procedure_name", "procedure_stable_id", "procedure_stable_key", "parameter_name", "parameter_stable_id", "parameter_stable_key", "statistical_method", "percentage_change", "p_value", "effect_size", "external_id", "life_stage_acc", "life_stage_name"
+  ],
+  "impc_images": [
+    "id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "mp_id", "mp_term", "top_level_mp_id", "top_level_mp_term", "intermediate_mp_id", "intermediate_mp_term", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", 
"developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "sequence_id", "age_in_days", "age_in_weeks", "download_url", "jpeg_url", "thumbnail_url", "omero_id"
+  ],
+  "phenodigm": [
+    "type", "disease_id", "disease_source", "disease_term", "disease_alts", "disease_locus", "disease_classes", "disease_phenotypes", "gene_id", "gene_symbol", "gene_symbols_withdrawn", "gene_locus", "hgnc_gene_id", "hgnc_gene_symbol", "hgnc_gene_symbols_withdrawn", "hgnc_gene_locus", "mouse_model", "impc_model", "model_id", "model_source", "model_description", "model_genetic_background", "marker_id", "marker_symbol", "marker_locus", "marker_num_models", "model_phenotypes", "ontology", "phenotype_id", "phenotype_term", "phenotype_synonym", "hp_id", "hp_term", "mp_id", "mp_term", "association_curated", "association_ortholog", "marker_symbols_withdrawn", "disease_matched_phenotypes", "model_matched_phenotypes", "disease_model_avg_raw", "disease_model_avg_norm", "disease_model_max_raw", "disease_model_max_norm", "search_qf", "human_curated_gene", "impc_model_with_curated_gene", "mgi_model_with_curated_gene", "impc_model_with_computed_association", "mgi_model_with_computed_association"
+  ],
+  "statistical-result": ["doc_id", "db_id", "data_type", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id_options", "mp_term_name_options", "mp_term_id", "mp_term_name", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "male_mp_term_id", "male_mp_term_name", "male_top_level_mp_term_id", "male_top_level_mp_term_name", "male_intermediate_mp_term_id", "male_intermediate_mp_term_name", "female_mp_term_id", "female_mp_term_name", "female_top_level_mp_term_id", "female_top_level_mp_term_name", "female_intermediate_mp_term_id", "female_intermediate_mp_term_name", "resource_name", "resource_fullname", "resource_id", "project_name", "phenotyping_center", "pipeline_stable_id", "pipeline_stable_key", "pipeline_name", "pipeline_id", "procedure_stable_id", "procedure_stable_key", "procedure_name", "procedure_id", "parameter_stable_id", "parameter_stable_key", "parameter_name", "parameter_id", "colony_id", "marker_symbol", "marker_accession_id", "allele_symbol", "allele_name", "allele_accession_id", "strain_name", "strain_accession_id", "sex", "zygosity", "control_selection_method", "dependent_variable", "metadata_group", "data_frame", "genetic_background", "production_center", "external_db_id", "id", "organisation_id", "phenotyping_center_id", "project_id", "male_control_mean", "male_mutant_mean", "female_control_mean", "female_mutant_mean", "genotype_p_value_low_vs_normal_high", "genotype_p_value_low_normal_vs_high", "genotype_effect_size_low_vs_normal_high", "genotype_effect_size_low_normal_vs_high", "female_p_value_low_vs_normal_high", "female_p_value_low_normal_vs_high", "female_effect_size_low_vs_normal_high", "female_effect_size_low_normal_vs_high", "male_p_value_low_vs_normal_high", "male_p_value_low_normal_vs_high", "male_effect_size_low_vs_normal_high", 
"male_effect_size_low_normal_vs_high", "categories", "categorical_p_value", "categorical_effect_size", "batch_significant", "variance_significant", "null_test_p_value", "genotype_effect_p_value", "genotype_effect_stderr_estimate", "genotype_effect_parameter_estimate", "male_percentage_change", "female_percentage_change", "sex_effect_p_value", "sex_effect_stderr_estimate", "sex_effect_parameter_estimate", "weight_effect_p_value", "weight_effect_stderr_estimate", "weight_effect_parameter_estimate", "group1_genotype", "group1_residuals_normality_test", "group2_genotype", "group2_residuals_normality_test", "blups_test", "rotated_residuals_test", "intercept_estimate", "intercept_estimate_stderr_estimate", "interaction_significant", "interaction_effect_p_value", "female_ko_effect_p_value", "female_ko_effect_stderr_estimate", "female_ko_parameter_estimate", "female_effect_size", "male_ko_effect_p_value", "male_ko_effect_stderr_estimate", "male_ko_parameter_estimate", "male_effect_size", "classification_tag", "phenotype_sex", "life_stage_acc", "life_stage_name", "significant", "soft_windowing_bandwidth", "soft_windowing_shape", "soft_windowing_peaks", "soft_windowing_min_obs_required", "soft_windowing_total_obs_or_weight", "soft_windowing_threshold", "soft_windowing_number_of_doe", "soft_windowing_doe_note", "metadata"]
+}
diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py
@@ -0,0 +1,78 @@
+from pydantic import BaseModel, model_validator
+import json
+from typing import List, Dict
+from pathlib import Path
+import warnings
+from dataclasses import dataclass, field
+from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning
+
+# Initialise warning config
+warning_config()
+
+# Dataclass for the json validator
+@dataclass
+class ValidationJson:
+    CORE_FILE: Path = Path(__file__).resolve().parent / 'core_fields.json'
+    _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False)
+
+    # Eager initialisation
+    def __post_init__(self):
+        self._validation_json = self.load_core_fields(self.CORE_FILE)
+
+    def load_core_fields(self, filename: Path) -> Dict[str, List[str]]:
+            with open(filename, "r") as f:
+                return json.load(f)
+
+    def valid_cores(self):
+        return self._validation_json.keys()
+
+    def valid_fields(self, core: str) -> List[str]:
+        return self._validation_json.get(core, [])
+
+# Function to parse the fields (fl) params in params
+def get_fields(fields: str) -> List[str]:
+    return fields.split(",")
+
+
+class CoreParamsValidator(BaseModel):
+    core: str
+    params: Dict
+
+    @model_validator(mode='before')
+    @classmethod
+    def validate_core_and_fields(cls, values):
+        invalid_core: bool = False
+        core = values.get("core")
+        params = values.get("params")
+
+        # Call the Validator Object
+        jv = ValidationJson()
+
+        # Validate core
+        if core not in jv.valid_cores():
+            invalid_core = True
+            warnings.warn(
+                message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n',
+                category=InvalidCoreWarning)
+
+        # Compare passed fl values vs the allowed fl values for a given core
+        fields: str = params.get("fl")
+
+        # If no fields were specified, pass
+        if fields is None:
+            print("No fields passed, skipping field validation...")
+            return values
+
+        # Get the fields passed to params and the expected fields for the core
+        field_list: List[str] = get_fields(fields)
+
+
+        # Validate each field in params
+        # TODO: perhaps pass all invalid fields as a list, instead of many warning messages
+        if invalid_core is not True:
+            for fl in field_list:
+                if fl not in jv.valid_fields(core):
+                    warnings.warn(message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""",
+                    category=InvalidFieldWarning)
+        # Return validated values
+        return values
diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py
@@ -0,0 +1,23 @@
+"""Module for warnings and exception utils"""
+
+import warnings
+
+
+# Custom warnings
+class InvalidCoreWarning(Warning):
+    """Warning issued when the core is not in the expected core names"""
+
+
+class InvalidFieldWarning(Warning):
+    """Warning issued when the field name is not in the expected fields"""
+
+
+# Custom warning function
+def warning_config():
+    """Customises formatting and filters for warnings"""
+
+    def custom_warning(message, category, filename, lineno, line=None):
+        return f'{category.__name__}: {message}\n'
+
+    warnings.formatwarning = custom_warning
+    warnings.simplefilter("always", Warning)
diff --git a/impc_api_helper/pyproject.toml b/impc_api_helper/pyproject.toml
@@ -14,7 +14,8 @@ authors = [
 dependencies = [
     "pandas>=2.2.0",
     "requests>=2.31.0",
-    "tqdm>=4.66.4"
+    "tqdm>=4.66.4",
+    "pydantic>=2.9"
 ]
 
 readme = "README.md"
@@ -25,9 +26,9 @@ dev = [
     "pytest>=8.2.2"
 ]
 
+[tool.setuptools.packages.find]
+include = ["impc_api_helper", "impc_api_helper.*"]
 
 [project.urls]
 "Homepage" = "https://github.com/mpi2/impc-data-api-workshop"
 
-[tool.setuptools]
-packages = ["impc_api_helper"]
diff --git a/impc_api_helper/setup.py b/impc_api_helper/setup.py
@@ -1,16 +1,19 @@
 from setuptools import setup, find_packages
 
+
 setup(
     name='impc_api_helper',
     version='0.1.0',
     description='A package to facilitate making API request to the IMPC Solr API',
     author='MPI2, Marina Kan, Diego Pava',
     url='https://github.com/mpi2/impc-data-api-workshop',
-    packages=find_packages(),
+    packages=find_packages(include=["impc_api_helper", "impc_api_helper.*"]),
+    include_package_data=True,
     install_requires=[
         'pandas>=2.2.0',
         'requests>=2.31.0',
         'tqdm>=4.66.4',
+        'pydantic>=2.9'
     ],
 
     extras_require={

diff --git a/impc_api_helper/tests/test_solr_request.py b/impc_api_helper/tests/test_solr_request.py
@@ -2,6 +2,7 @@
 from unittest.mock import patch
 from solr_request import solr_request, _process_faceting
 from .test_helpers import check_url_status_code_and_params
+from impc_api_helper.utils.warnings import InvalidCoreWarning, InvalidFieldWarning
 
 
 class TestSolrRequest:
@@ -286,3 +287,33 @@ def test_process_faceting(self, params, data):
         assert df.iloc[1, 1] == 9
         assert df.iloc[2, 0] == "banana"
         assert df.iloc[2, 1] == 24
+
+    # Validation tests
+    def _validation_response():
+        return {
+            "status_code": 200,
+            "json": {
+                "response": {
+                    "numFound": 101,
+                    "docs": [],
+                }
+            },
+        }
+
+    @pytest.mark.parametrize(
+        "mock_response", [_validation_response()], indirect=["mock_response"]
+    )
+    def test_solr_request_core_validation(self, common_params, mock_response):
+        with pytest.warns(InvalidCoreWarning):
+            _ = solr_request(core="invalid_core", params=common_params, validate=True)
+
+    @pytest.mark.parametrize(
+        "mock_response", [_validation_response()], indirect=["mock_response"]
+    )
+    def test_solr_request_fields_validation(self, mock_response):
+        with pytest.warns(InvalidFieldWarning):
+            _ = solr_request(
+                core="experiment",
+                params={"q": "*:*", "fl": "invalid_field,another_invalid_field"},
+                validate=True,
+            )
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 pandas>=2.2.0
 requests>=2.31.0
 tqdm>=4.66.4
+pydantic>=2.9