Skip to content

Commit

Permalink
Merge pull request #14 from mpi2/12-solr_request-should-handle-errors…
Browse files Browse the repository at this point in the history
…-more-gracefully

[Solr_request] - 12 solr request should handle errors more gracefully
  • Loading branch information
dpavam authored Oct 14, 2024
2 parents d6f02a0 + 66d26ee commit 89df667
Show file tree
Hide file tree
Showing 13 changed files with 203 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ dist/
*.pytest*
*.pytest_cache
__pycache__


# Local notes
notes.md
1 change: 1 addition & 0 deletions impc_api_helper/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include impc_api_helper/utils/core_fields.json
29 changes: 29 additions & 0 deletions impc_api_helper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,35 @@ num_found, df = solr_request( core='genotype-phenotype', params={
)
```

#### Solr request validation
A common pitfall when writing a query is misspelling the `core` argument or the field names passed in the `fl` parameter. To help with this, we have included a `validate` argument that raises a warning when these values are not as expected. Note that this does not prevent you from executing a query; it just alerts you to a potential issue.

##### Core validation
```
num_found, df = solr_request( core='invalid_core', params={
'q': '*:*',
'rows': 10
},
validate=True
)
> InvalidCoreWarning: Invalid core: "invalid_core", select from the available cores:
> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result']))
```

##### Field list validation
```
num_found, df = solr_request( core='genotype-phenotype', params={
'q': '*:*',
'rows': 10,
'fl': 'invalid_field,marker_symbol,allele_symbol'
},
validate=True
)
> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields.
> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/
```

### Batch request
For larger requests, use the batch request function to query the API responsibly.
```
Expand Down
1 change: 1 addition & 0 deletions impc_api_helper/impc_api_helper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .solr_request import solr_request, batch_request
from .iterator_solr_request import iterator_solr_request
from .utils import validators, warnings

# Control what gets imported by client
__all__ = ["solr_request", "batch_request", "iterator_solr_request"]
16 changes: 12 additions & 4 deletions impc_api_helper/impc_api_helper/solr_request.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
from IPython.display import display
from tqdm import tqdm


import pandas as pd
import requests
from impc_api_helper.utils.validators import CoreParamsValidator

# Display the whole dataframe <15
pd.set_option("display.max_rows", 15)
pd.set_option("display.max_columns", None)


# Create helper function
def solr_request(core, params, silent=False):
def solr_request(core, params, silent=False, validate=False):
"""Performs a single Solr request to the IMPC Solr API.
Args:
core (str): name of IMPC solr core.
params (dict): dictionary containing the API call parameters.
silent (bool, optional): default False
If True, displays: URL of API call, the number of found docs
and a portion of the DataFrame.
and a portion of the DataFrame.
validate (bool, optional): default False
If True, validates the parameters against the core schema and raises warnings
if any parameter seems invalid.
Returns:
Expand Down Expand Up @@ -61,6 +63,12 @@ def solr_request(core, params, silent=False):
)
"""

if validate:
CoreParamsValidator(
core=core,
params=params
)

base_url = "https://www.ebi.ac.uk/mi/impc/solr/"
solr_url = base_url + core + "/select"

Expand Down
Empty file.
15 changes: 15 additions & 0 deletions impc_api_helper/impc_api_helper/utils/core_fields.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"experiment": [
"id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "increment_value", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "age_in_days", 
"age_in_weeks"
],
"genotype-phenotype": [
"doc_id", "ontology_db_id", "assertion_type", "assertion_type_id", "mpath_term_id", "mpath_term_name", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id", "mp_term_name", "alt_mp_term_id", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "marker_symbol", "marker_accession_id", "colony_id", "allele_name", "allele_symbol", "allele_accession_id", "strain_name", "strain_accession_id", "phenotyping_center", "project_external_id", "project_name", "project_fullname", "resource_name", "resource_fullname", "sex", "zygosity", "pipeline_name", "pipeline_stable_id", "pipeline_stable_key", "procedure_name", "procedure_stable_id", "procedure_stable_key", "parameter_name", "parameter_stable_id", "parameter_stable_key", "statistical_method", "percentage_change", "p_value", "effect_size", "external_id", "life_stage_acc", "life_stage_name"
],
"impc_images": [
"id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "mp_id", "mp_term", "top_level_mp_id", "top_level_mp_term", "intermediate_mp_id", "intermediate_mp_term", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", 
"text_value", "sub_term_id", "sub_term_name", "sub_term_description", "sequence_id", "age_in_days", "age_in_weeks", "download_url", "jpeg_url", "thumbnail_url", "omero_id"
],
"phenodigm": [
"type", "disease_id", "disease_source", "disease_term", "disease_alts", "disease_locus", "disease_classes", "disease_phenotypes", "gene_id", "gene_symbol", "gene_symbols_withdrawn", "gene_locus", "hgnc_gene_id", "hgnc_gene_symbol", "hgnc_gene_symbols_withdrawn", "hgnc_gene_locus", "mouse_model", "impc_model", "model_id", "model_source", "model_description", "model_genetic_background", "marker_id", "marker_symbol", "marker_locus", "marker_num_models", "model_phenotypes", "ontology", "phenotype_id", "phenotype_term", "phenotype_synonym", "hp_id", "hp_term", "mp_id", "mp_term", "association_curated", "association_ortholog", "marker_symbols_withdrawn", "disease_matched_phenotypes", "model_matched_phenotypes", "disease_model_avg_raw", "disease_model_avg_norm", "disease_model_max_raw", "disease_model_max_norm", "search_qf", "human_curated_gene", "impc_model_with_curated_gene", "mgi_model_with_curated_gene", "impc_model_with_computed_association", "mgi_model_with_computed_association"
],
"statistical-result": ["doc_id", "db_id", "data_type", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id_options", "mp_term_name_options", "mp_term_id", "mp_term_name", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "male_mp_term_id", "male_mp_term_name", "male_top_level_mp_term_id", "male_top_level_mp_term_name", "male_intermediate_mp_term_id", "male_intermediate_mp_term_name", "female_mp_term_id", "female_mp_term_name", "female_top_level_mp_term_id", "female_top_level_mp_term_name", "female_intermediate_mp_term_id", "female_intermediate_mp_term_name", "resource_name", "resource_fullname", "resource_id", "project_name", "phenotyping_center", "pipeline_stable_id", "pipeline_stable_key", "pipeline_name", "pipeline_id", "procedure_stable_id", "procedure_stable_key", "procedure_name", "procedure_id", "parameter_stable_id", "parameter_stable_key", "parameter_name", "parameter_id", "colony_id", "marker_symbol", "marker_accession_id", "allele_symbol", "allele_name", "allele_accession_id", "strain_name", "strain_accession_id", "sex", "zygosity", "control_selection_method", "dependent_variable", "metadata_group", "data_frame", "genetic_background", "production_center", "external_db_id", "id", "organisation_id", "phenotyping_center_id", "project_id", "male_control_mean", "male_mutant_mean", "female_control_mean", "female_mutant_mean", "genotype_p_value_low_vs_normal_high", "genotype_p_value_low_normal_vs_high", "genotype_effect_size_low_vs_normal_high", "genotype_effect_size_low_normal_vs_high", "female_p_value_low_vs_normal_high", "female_p_value_low_normal_vs_high", "female_effect_size_low_vs_normal_high", "female_effect_size_low_normal_vs_high", "male_p_value_low_vs_normal_high", "male_p_value_low_normal_vs_high", "male_effect_size_low_vs_normal_high", 
"male_effect_size_low_normal_vs_high", "categories", "categorical_p_value", "categorical_effect_size", "batch_significant", "variance_significant", "null_test_p_value", "genotype_effect_p_value", "genotype_effect_stderr_estimate", "genotype_effect_parameter_estimate", "male_percentage_change", "female_percentage_change", "sex_effect_p_value", "sex_effect_stderr_estimate", "sex_effect_parameter_estimate", "weight_effect_p_value", "weight_effect_stderr_estimate", "weight_effect_parameter_estimate", "group1_genotype", "group1_residuals_normality_test", "group2_genotype", "group2_residuals_normality_test", "blups_test", "rotated_residuals_test", "intercept_estimate", "intercept_estimate_stderr_estimate", "interaction_significant", "interaction_effect_p_value", "female_ko_effect_p_value", "female_ko_effect_stderr_estimate", "female_ko_parameter_estimate", "female_effect_size", "male_ko_effect_p_value", "male_ko_effect_stderr_estimate", "male_ko_parameter_estimate", "male_effect_size", "classification_tag", "phenotype_sex", "life_stage_acc", "life_stage_name", "significant", "soft_windowing_bandwidth", "soft_windowing_shape", "soft_windowing_peaks", "soft_windowing_min_obs_required", "soft_windowing_total_obs_or_weight", "soft_windowing_threshold", "soft_windowing_number_of_doe", "soft_windowing_doe_note", "metadata"]
}
78 changes: 78 additions & 0 deletions impc_api_helper/impc_api_helper/utils/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pydantic import BaseModel, model_validator
import json
from typing import List, Dict
from pathlib import Path
import warnings
from dataclasses import dataclass, field
from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning

# Initialise warning config
warning_config()

# Dataclass for the json validator
@dataclass
class ValidationJson:
    """Lookup table of the known Solr cores and the field names of each core.

    The table is read from ``core_fields.json`` (located next to this module
    by default) as soon as an instance is created.

    Attributes:
        CORE_FILE: path of the JSON file mapping core name -> list of fields.
    """

    CORE_FILE: Path = Path(__file__).resolve().parent / 'core_fields.json'
    # Parsed content of CORE_FILE; filled in by __post_init__, not by callers.
    _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False)

    # Eager initialisation
    def __post_init__(self):
        self._validation_json = self.load_core_fields(self.CORE_FILE)

    def load_core_fields(self, filename: Path) -> Dict[str, List[str]]:
        """Read and parse the JSON file at ``filename``.

        Args:
            filename: path of the JSON file to load.

        Returns:
            The decoded JSON document.
        """
        # JSON is UTF-8 by specification; without an explicit encoding the
        # platform locale decides how the file is decoded.
        with open(filename, "r", encoding="utf-8") as f:
            return json.load(f)

    def valid_cores(self):
        """Return a view of the core names present in the loaded table."""
        return self._validation_json.keys()

    def valid_fields(self, core: str) -> List[str]:
        """Return the field names listed for ``core``.

        Args:
            core: name of the core to look up.

        Returns:
            The list stored for ``core``, or an empty list when the core is
            not in the table.
        """
        return self._validation_json.get(core, [])

# Function to parse the fields (fl) params in params
def get_fields(fields: str) -> List[str]:
    """Split the value of a Solr ``fl`` parameter into individual field names.

    Solr accepts field lists separated by commas, by whitespace, or by both
    (``"a,b"``, ``"a b"``, ``"a, b"``), so each of these is treated as a
    separator. Empty entries (e.g. from a trailing comma) are dropped.

    Args:
        fields: raw ``fl`` value as passed in the request parameters.

    Returns:
        The field names in their original order, without surrounding
        whitespace.
    """
    # Turning commas into spaces lets str.split() handle every separator
    # style at once and discard empty pieces.
    return fields.replace(",", " ").split()


class CoreParamsValidator(BaseModel):
    """Check a Solr ``core`` name and the ``fl`` fields of ``params``.

    Instantiating the model runs ``validate_core_and_fields``, which only
    emits warnings: ``InvalidCoreWarning`` for an unknown core and
    ``InvalidFieldWarning`` for each unexpected field name.

    Attributes:
        core: name of the Solr core being queried.
        params: request parameters; the ``fl`` entry, if present, is checked.
    """

    core: str
    params: Dict

    @model_validator(mode='before')
    @classmethod
    def validate_core_and_fields(cls, values):
        """Warn about an unknown core or unexpected ``fl`` field names.

        Args:
            values: raw input data of the model (``core`` and ``params``).

        Returns:
            ``values`` unchanged.
        """
        core = values.get("core")
        params = values.get("params")

        # Load the table of expected cores and fields
        jv = ValidationJson()

        # Validate core
        core_is_valid = core in jv.valid_cores()
        if not core_is_valid:
            warnings.warn(
                message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n',
                category=InvalidCoreWarning)

        # Without params there is nothing to inspect; return so the model's
        # own field validation reports the problem instead of an
        # AttributeError being raised here.
        if params is None:
            return values

        # Compare passed fl values vs the allowed fl values for a given core
        fields: str = params.get("fl")

        # If no fields were specified, pass
        if fields is None:
            print("No fields passed, skipping field validation...")
            return values

        # Get the fields passed to params
        field_list: List[str] = get_fields(fields)

        # Validate each field in params; skipped for an unknown core because
        # there is no expected field list to compare against.
        # TODO: perhaps pass all invalid fields as a list, instead of many warning messages
        if core_is_valid:
            # Fetch the expected fields once; a set makes each membership
            # test cheap instead of rescanning the list for every field.
            expected_fields = set(jv.valid_fields(core))
            for fl in field_list:
                if fl not in expected_fields:
                    warnings.warn(message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""",
                                  category=InvalidFieldWarning)
        # Return validated values
        return values
23 changes: 23 additions & 0 deletions impc_api_helper/impc_api_helper/utils/warnings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Module for warnings and exception utils"""

import warnings


# Custom warnings
class InvalidCoreWarning(Warning):
    """Warning category for a core name that is not among the expected core names."""


class InvalidFieldWarning(Warning):
    """Warning category for a field name that is not among the expected fields."""


# Custom warning function
def warning_config():
    """Install a compact warning format and always show warnings.

    Replaces ``warnings.formatwarning`` so a warning renders as
    ``<CategoryName>: <message>`` followed by a newline, and adds an
    ``"always"`` filter for ``Warning`` and its subclasses.
    """

    def _category_and_message(message, category, filename, lineno, line=None):
        # filename, lineno and line are accepted to match the signature
        # expected of warnings.formatwarning but are left out of the output.
        return "{}: {}\n".format(category.__name__, message)

    warnings.simplefilter("always", Warning)
    warnings.formatwarning = _category_and_message
7 changes: 4 additions & 3 deletions impc_api_helper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ authors = [
dependencies = [
"pandas>=2.2.0",
"requests>=2.31.0",
"tqdm>=4.66.4"
"tqdm>=4.66.4",
"pydantic>=2.9"
]

readme = "README.md"
Expand All @@ -25,9 +26,9 @@ dev = [
"pytest>=8.2.2"
]

[tool.setuptools.packages.find]
include = ["impc_api_helper", "impc_api_helper.*"]

[project.urls]
"Homepage" = "https://github.com/mpi2/impc-data-api-workshop"

[tool.setuptools]
packages = ["impc_api_helper"]
5 changes: 4 additions & 1 deletion impc_api_helper/setup.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from setuptools import setup, find_packages


setup(
name='impc_api_helper',
version='0.1.0',
description='A package to facilitate making API request to the IMPC Solr API',
author='MPI2, Marina Kan, Diego Pava',
url='https://github.com/mpi2/impc-data-api-workshop',
packages=find_packages(),
packages=find_packages(include=["impc_api_helper", "impc_api_helper.*"]),
include_package_data=True,
install_requires=[
'pandas>=2.2.0',
'requests>=2.31.0',
'tqdm>=4.66.4',
'pydantic>=2.9'
],

extras_require={
Expand Down
31 changes: 31 additions & 0 deletions impc_api_helper/tests/test_solr_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest.mock import patch
from solr_request import solr_request, _process_faceting
from .test_helpers import check_url_status_code_and_params
from impc_api_helper.utils.warnings import InvalidCoreWarning, InvalidFieldWarning


class TestSolrRequest:
Expand Down Expand Up @@ -286,3 +287,33 @@ def test_process_faceting(self, params, data):
assert df.iloc[1, 1] == 9
assert df.iloc[2, 0] == "banana"
assert df.iloc[2, 1] == 24

# Validation tests
def _validation_response():
return {
"status_code": 200,
"json": {
"response": {
"numFound": 101,
"docs": [],
}
},
}

@pytest.mark.parametrize(
    "mock_response", [_validation_response()], indirect=["mock_response"]
)
def test_solr_request_core_validation(self, common_params, mock_response):
    """An unknown core with ``validate=True`` must emit ``InvalidCoreWarning``."""
    # common_params and mock_response are fixtures defined outside this view;
    # mock_response receives the canned payload through indirect parametrisation.
    with pytest.warns(InvalidCoreWarning):
        _ = solr_request(core="invalid_core", params=common_params, validate=True)

@pytest.mark.parametrize(
    "mock_response", [_validation_response()], indirect=["mock_response"]
)
def test_solr_request_fields_validation(self, mock_response):
    """Unknown ``fl`` names on a known core must emit ``InvalidFieldWarning``."""
    # mock_response is a fixture defined outside this view; it receives the
    # canned payload through indirect parametrisation.
    with pytest.warns(InvalidFieldWarning):
        _ = solr_request(
            core="experiment",
            params={"q": "*:*", "fl": "invalid_field,another_invalid_field"},
            validate=True,
        )
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pandas>=2.2.0
requests>=2.31.0
tqdm>=4.66.4
pydantic>=2.9

0 comments on commit 89df667

Please sign in to comment.