From 25facd1c067ba15769681e30e0928f57c2a37daa Mon Sep 17 00:00:00 2001
From: dpavam
Date: Fri, 23 Aug 2024 22:39:43 +0100
Subject: [PATCH 01/64] Draft unified batch_solr_request function

---
 .../iterator_solr_request_2.py | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 impc_api_helper/impc_api_helper/iterator_solr_request_2.py

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
new file mode 100644
index 0000000..201573b
--- /dev/null
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -0,0 +1,145 @@
from IPython.display import display
import json
import pandas as pd
import requests
from tqdm import tqdm
from solr_request import solr_request
import logging

# Set logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
FORMAT = "%(levelname)s: %(asctime)s - %(message)s"
logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S')


def batch_solr_request(core, params, download=False, batch_size=5000):
    # Set params for batch request
    params["start"] = 0  # Start at the first result
    params["rows"] = batch_size  # Fetch results in chunks of batch_size

    # If the user did not specify a format, default to json.
    if params.get("wt") is None:
        params["wt"] = "json"

    # Check if it's a multiple-field request
    if params.get("field_list") is not None:
        # Extract field_list and field_type from params
        field_list = params.pop("field_list")
        field_type = params.pop("field_type")

        # Construct the filter query with grouped model IDs
        fq = "{}:({})".format(
            field_type, " OR ".join(["{}".format(id) for id in field_list])
        )
        # Show users the field and field values they passed to the function
        print("Queried field:", fq)
        # Set internal params the users should not change
        params["fq"] = fq

    # Determine the total number of rows. Note that we do not request any data (rows = 0).
    num_results, _ = solr_request(
        core=core, params={**params, "start": 0, "rows": 0, "wt": "json"}, silent=True
    )
    print(f"Number of found documents: {num_results}")

    # Download/display only logic
    # If the user decides to download, a generator is used to fetch data in batches without storing results in memory.
    if download:
        # Implement loop behaviour
        filename = f"{core}.{params['wt']}"
        gen = _batch_solr_generator(core, params, num_results)
        _solr_downloader(params, filename, gen)
        logger.warning("Showing the first batch of results only. See downloaded file for the whole data.")
        match params["wt"]:
            case "json":
                return pd.read_json(filename, nrows=batch_size, lines=True)
            case "csv":
                return pd.read_csv(filename, nrows=batch_size, header=None)

    # If the number of results is small enough and download is off, it's okay to show as df
    if num_results < 1000000 and not download:
        return _batch_to_df(core, params, num_results)

    # If it's too big, warn the user and ask if they want to proceed.
    else:
        logger.warning(
            "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches"
        )
        prompt = input(
            "Do you wish to proceed anyway? press ('y' or enter to proceed) / type('n' or 'exit' to cancel)"
        )
        match prompt:
            case "n" | "exit":
                print("Exiting gracefully")
                exit()
            case "y":
                print("Fetching data...")
                return _batch_to_df(core, params, num_results)


# Helper batch_to_df
def _batch_to_df(core, params, num_results):
    start = params["start"]
    batch_size = params["rows"]
    chunks = []
    # If the 'wt' param was changed by mistake, we set it to 'json'
    params["wt"] = "json"

    # Request chunks until we have complete data.
    with tqdm(total=num_results) as pbar:
        while start < num_results:
            # Update progress bar with the number of rows requested.
            pbar.update(batch_size)

            # Request chunk. We don't need num_results anymore because it does not change.
            _, df_chunk = solr_request(
                core=core,
                params={**params, "start": start, "rows": batch_size},
                silent=True,
            )

            # Record chunk.
            chunks.append(df_chunk)
            # Increment start.
            start += batch_size
    # Prepare final dataframe.
    return pd.concat(chunks, ignore_index=True)


def _batch_solr_generator(core, params, num_results):
    base_url = "https://www.ebi.ac.uk/mi/impc/solr/"
    solr_url = base_url + core + "/select"
    start = params["start"]
    batch_size = params["rows"]

    with tqdm(total=num_results) as pbar:
        while start < num_results:
            params["start"] = start
            response = requests.get(solr_url, params=params, timeout=10)

            if response.status_code == 200:
                if params.get("wt") == "json":
                    data = response.json()["response"]["docs"]
                else:
                    data = response.text
                yield data
            else:
                raise Exception(f"Request failed. Status code: {response.status_code}")

            pbar.update(batch_size)
            start += batch_size


# File writer
def _solr_downloader(params, filename, solr_generator):
    with open(filename, "w", encoding="UTF-8") as f:
        for chunk in solr_generator:
            if params.get("wt") == "json":
                for item in chunk:
                    json.dump(item, f)
                    f.write("\n")
            else:
                f.write(chunk)
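[Editorial note] For context, a minimal sketch (not part of the patch) of the filter query the field_list branch above produces. The expression mirrors the one in batch_solr_request; the example values are hypothetical:

    field_type = "marker_symbol"
    field_list = ['"Cthrc1"', '*11']
    # Join the values with OR and wrap them in the field name
    fq = "{}:({})".format(field_type, " OR ".join(["{}".format(id) for id in field_list]))
    print(fq)  # marker_symbol:("Cthrc1" OR *11)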
From 5c7bbf545361ede8b85156b1dd41666c08beb098 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Fri, 23 Aug 2024 22:40:06 +0100
Subject: [PATCH 02/64] adds temporary file to test batch_solr_request

---
 impc_api_helper/impc_api_helper/temp.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 impc_api_helper/impc_api_helper/temp.py

diff --git a/impc_api_helper/impc_api_helper/temp.py b/impc_api_helper/impc_api_helper/temp.py
new file mode 100644
index 0000000..1a7fe66
--- /dev/null
+++ b/impc_api_helper/impc_api_helper/temp.py
@@ -0,0 +1,19 @@
from iterator_solr_request_2 import batch_solr_request
import pandas as pd

markers = ['"Cthrc1"', '*11']
df = batch_solr_request(
    core="genotype-phenotype",
    params={
        "q": "*:*",
        "fl": "marker_symbol,mp_term_name,p_value",
        'field_list': markers,
        'field_type': 'marker_symbol'
    },
    download=True,
)

df = pd.read_json('genotype-phenotype.json', nrows=80000, lines=True)
# df = pd.read_csv('genotype-phenotype.csv', nrows=80000)
# df = pd.read_xml('genotype-phenotype.xml', parser='etree')
print(df.shape)
From b6fd64d74924de70e9a2b7600603a71d3d0427bc Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 27 Aug 2024 17:26:27 +0100
Subject: [PATCH 03/64] adds path customisation for downloads

---
 impc_api_helper/impc_api_helper/iterator_solr_request_2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index 201573b..0d5a84b 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -5,6 +5,7 @@ from tqdm import tqdm
 from solr_request import solr_request
 import logging
+from pathlib import Path

 # Set logger
 logger = logging.getLogger(__name__)
@@ -13,7 +14,7 @@ logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S')

-def batch_solr_request(core, params, download=False, batch_size=5000):
+def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'):
     # Set params for batch request
     params["start"] = 0  # Start at the first result
     params["rows"] = batch_size  # Fetch results in chunks of batch_size
@@ -47,7 +48,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
         # Implement loop behaviour
-        filename = f"{core}.{params['wt']}"
+        filename = Path(path_to_download) / f"{core}.{params['wt']}"
         gen = _batch_solr_generator(core, params, num_results)
         _solr_downloader(params, filename, gen)

From 7341e010baf9d1f5b528d0f7807a94c1eecffeb2 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 27 Aug 2024 17:28:47 +0100
Subject: [PATCH 04/64] replaced logger with print statements for testing
 simplicity

---
 .../impc_api_helper/iterator_solr_request_2.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index 0d5a84b..b35a8c1 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -7,13 +7,6 @@ import logging
 from pathlib import Path

-# Set logger
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-FORMAT = "%(levelname)s: %(asctime)s - %(message)s"
-logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S')
-
-
 def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'):
     # Set params for batch request
     params["start"] = 0  # Start at the first result
@@ -51,7 +44,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
         filename = Path(path_to_download) / f"{core}.{params['wt']}"
         gen = _batch_solr_generator(core, params, num_results)
         _solr_downloader(params, filename, gen)
-        logger.warning("Showing the first batch of results only. See downloaded file for the whole data.")
+        print("Showing the first batch of results only. See downloaded file for the whole data.")
         match params["wt"]:
             case "json":
                 return pd.read_json(filename, nrows=batch_size, lines=True)
@@ -64,7 +57,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
     # If it's too big, warn the user and ask if they want to proceed.
     else:
-        logger.warning(
+        print(
             "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches"
         )
         prompt = input(

From e989379f9f2cce869517a689b09e276152ad5b22 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 27 Aug 2024 17:30:22 +0100
Subject: [PATCH 05/64] removes header arg from pd.read in batch download

---
 impc_api_helper/impc_api_helper/iterator_solr_request_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index b35a8c1..1b3c78a 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -49,7 +49,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
             case "json":
                 return pd.read_json(filename, nrows=batch_size, lines=True)
             case "csv":
-                return pd.read_csv(filename, nrows=batch_size, header=None)
+                return pd.read_csv(filename, nrows=batch_size)

From ea82667bb8ffeaa883d0b5d21a07a0651bc7d9e1 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 27 Aug 2024 17:31:01 +0100
Subject: [PATCH 06/64] adds 'enter' as input to continue with request

---
 impc_api_helper/impc_api_helper/iterator_solr_request_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index 1b3c78a..cb421d5 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -67,7 +67,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
             case "n" | "exit":
                 print("Exiting gracefully")
                 exit()
-            case "y":
+            case "y" | '':
                 print("Fetching data...")
                 return _batch_to_df(core, params, num_results)
From 72d5d228c2c412d2ebeed5753bf13a9e43b6697e Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 27 Aug 2024 18:57:04 +0100
Subject: [PATCH 07/64] Initial unit tests for batch_solr_request

---
 .../tests/test_iterator_solr_request.py | 237 ++++++++++++++++++
 1 file changed, 237 insertions(+)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index e69de29..a0ae503 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -0,0 +1,237 @@
import pytest
from pathlib import Path
from unittest.mock import patch, call
from impc_api_helper.iterator_solr_request_2 import batch_solr_request, _batch_solr_generator, solr_request, _batch_to_df
import io
import pandas as pd

class TestBatchSolrRequest():

    # Fixture containing the params of a normal batch_solr_request
    @pytest.fixture
    def common_params(self):
        return {"start": 0, "rows": 10000, "wt": "json"}

    # Fixture containing the core
    @pytest.fixture
    def core(self):
        return 'test_core'

    # Fixture containing the solr_request function mock
    # We will be mocking solr_request with different numbers of numFound, therefore it is passed as param
    @pytest.fixture
    def mock_solr_request(self, request):
        with patch('impc_api_helper.iterator_solr_request_2.solr_request') as mock:
            # Mock expected return content of the solr_request (numFound and _)
            mock.return_value = (request.param, pd.DataFrame())
            yield mock

    # Pytest fixture mocking _batch_to_df
    @pytest.fixture
    def mock_batch_to_df(self):
        with patch('impc_api_helper.iterator_solr_request_2._batch_to_df') as mock:
            yield mock

    # Parameters to determine the numFound of mock_solr_request
    @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True)
    def test_batch_solr_request_no_download_small_request(self, mock_solr_request, core, common_params, capsys, mock_batch_to_df):

        # Call your function that uses the mocked request
        # Set up mock_solr_request values
        result = batch_solr_request(core, params=common_params,
                                    download=False)

        # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values.
        mock_solr_request.assert_called_with(
            core=core,
            params={**common_params, "start": 0, "rows": 0, "wt": "json"},
            silent=True
        )

        # Capture stdout
        num_found = mock_solr_request.return_value[0]
        captured = capsys.readouterr()
        assert captured.out == f"Number of found documents: {num_found}\n"

        # Check _batch_to_df was called
        mock_batch_to_df.assert_called_once()


    # Set mock_solr_request to return a large numFound
    @pytest.mark.parametrize("mock_solr_request", [1000001], indirect=True)
    # Parameter to test 4 cases: when user selects 'y' or 'n' upon large download warning.
    @pytest.mark.parametrize("user_input,expected_outcome", [
        ('y', 'continue'),
        ('', 'continue'),
        ('n', 'exit'),
        ('exit', 'exit')
    ])
    def test_batch_solr_request_no_download_large_request(self, core, common_params, capsys, monkeypatch, mock_batch_to_df, mock_solr_request, user_input, expected_outcome):
        # Monkeypatch the input() function with parametrized user input
        monkeypatch.setattr('builtins.input', lambda _: user_input)

        # When user selects 'n', exit should be triggered.
        if expected_outcome == 'exit':
            with pytest.raises(SystemExit):
                batch_solr_request(core, params=common_params, download=False, batch_size=5000)
        else:
            result = batch_solr_request(core, params=common_params, download=False, batch_size=5000)

        # Capture the exit messages
        captured = capsys.readouterr()

        # Assertions for continue case
        num_found = mock_solr_request.return_value[0]

        assert f"Number of found documents: {num_found}" in captured.out

        if expected_outcome == 'continue':
            assert "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" in captured.out
            mock_batch_to_df.assert_called_once_with('test_core', {'start': 0, 'rows': 5000, 'wt': 'json'}, 1000001)

        # Assertion for exit case
        elif expected_outcome == 'exit':
            assert "Exiting gracefully" in captured.out
            mock_batch_to_df.assert_not_called()


    # TEST DOWNLOAD TRUE
    # Fixture mocking the activity of _batch_solr_generator
    @pytest.fixture
    def mock_batch_solr_generator(self):
        with patch('impc_api_helper.iterator_solr_request_2._batch_solr_generator') as mock:
            yield mock

    @pytest.fixture
    def mock_solr_downloader(self, tmp_path):
        with patch('impc_api_helper.iterator_solr_request_2._solr_downloader') as mock:
            temp_dir = Path(tmp_path) / 'temp_dir'
            temp_dir.mkdir()
            yield mock


    # Mock response for test containing more than 2,000,000 docs
    @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True)
    # Parametrized decorator to simulate reading json and csv files
    @pytest.mark.parametrize(
        "params_format, format, file_content, expected_columns",
        [
            ({"start": 0, "rows": 2000000, "wt": "json"}, "json", '{"id": "1", "city": "Houston"}\n{"id": "2", "city": "Prague"}\n', ['id', 'city']),
            ({"start": 0, "rows": 2000000, "wt": "csv"}, "csv", 'id,city\n1,Houston\n2,Prague\n', ['id', 'city'])
        ]
    )
    # Test download = True
    def test_batch_solr_request_download_true(self, core, capsys, mock_solr_request, mock_batch_solr_generator, mock_solr_downloader, tmp_path, common_params, params_format, format, file_content, expected_columns):

        # Create temporary files for the test
        temp_dir = tmp_path / 'temp_dir'
        filename = f"{core}.{format}"
        temp_file = temp_dir / filename
        temp_file.write_text(file_content)


        # First we call the function
        # We patch solr_request to get the number of docs
        result = batch_solr_request(core, params=params_format, download=True, path_to_download=temp_dir)
        num_found = mock_solr_request.return_value[0]
        # TODO: Check batch_solr_generator gets called at least twice with increasing start

        # Check _batch_solr_generator gets called once with correct args
        mock_batch_solr_generator.assert_called_with(core, params_format, num_found)

        # Check _solr_downloader gets called once with correct args
        mock_solr_downloader.assert_called_once_with(params_format, temp_file, mock_batch_solr_generator.return_value)

        # Check the print statements
        captured = capsys.readouterr()
        assert f"Number of found documents: {num_found}" in captured.out
        assert "Showing the first batch of results only" in captured.out

        # Check a pd was created and its contents
        assert isinstance(result, pd.DataFrame)
        assert len(result) == 2
        assert list(result.columns) == expected_columns
        assert result.loc[0, 'id'] == 1
        assert result.loc[0, 'city'] == 'Houston'
        assert result.loc[1, 'id'] == 2
        assert result.loc[1, 'city'] == 'Prague'


    # TODO:
    # _batch_to_df.


    # Mock params
    @pytest.fixture
    def multiple_field_params(self):
        return {
            "q": "*:*",
            "rows": 0,
            "start": 0,
            "field_list": ['"orange"', 'apple', '*berry'],
            "field_type": "fruits",
            "wt": "json"

        }

    # Mock response for test containing more than 2,000,000 docs
    @pytest.mark.parametrize("mock_solr_request", [(2000000), (10000)], indirect=True)

    @pytest.mark.parametrize(
        "download_bool",
        [
            (True),
            (False)
        ],
    )
    # @pytest.mark.skip(reason="no way of currently testing this")
    def test_batch_solr_request_multiple_fields(self, core, multiple_field_params, capsys, mock_solr_request, mock_batch_solr_generator, download_bool, monkeypatch, mock_batch_to_df, mock_solr_downloader, tmp_path):
        # This test should make sure the request is formatted properly, regardless of going to downloads or to _batch_to_df

        # Get num_found
        num_found = mock_solr_request.return_value[0]
        # In the case where download=False and numFound is > 1,000,000 we pass 'y' in this test case.
        if not download_bool and num_found == 2000000:
            monkeypatch.setattr('builtins.input', lambda _: 'y')

        # Call test function
        # If downloads, create a temporary file and call with the path_to_download
        if download_bool:
            temp_dir = tmp_path / 'temp_dir'
            temp_file = temp_dir / f"{core}.json"
            temp_file.write_text('{"id": "1", "city": "Cape Town"}\n')
            result = batch_solr_request(core, params=multiple_field_params, download=download_bool, path_to_download=temp_dir)
        else:
            # Otherwise, call without the path_to_download
            result = batch_solr_request(core, params=multiple_field_params, download=download_bool)

        # Check output which should be equal for both.
        captured = capsys.readouterr()
        assert f"Number of found documents: {num_found}" in captured.out
        assert 'Queried field: fruits:("orange" OR apple OR *berry)' in captured.out

        # If download was true, check subsequent functions were executed
        if download_bool:
            assert "Showing the first batch of results only" in captured.out
            # Check _batch_solr_generator gets called once with correct args
            mock_batch_solr_generator.assert_called_with(core, multiple_field_params, num_found)

            # Check _solr_downloader gets called once with correct args
            mock_solr_downloader.assert_called_once_with(multiple_field_params, temp_file, mock_batch_solr_generator.return_value)


        # Otherwise, use the 'y' input at the start of the test and make sure the required function is executed.
        if not download_bool and num_found == 2000000:
            assert "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" in captured.out
            # Check _batch_to_df was called with correct params
            mock_batch_to_df.assert_called_once_with(core, multiple_field_params, num_found)
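[Editorial note] The tests in the patch above lean heavily on pytest's indirect parametrization: a @pytest.mark.parametrize value is routed into a fixture through request.param rather than into the test function itself. A minimal, self-contained sketch of the mechanism (names are illustrative, not from the patch):

    import pytest

    @pytest.fixture
    def num_found(request):
        # request.param carries the value from the parametrize decorator
        return request.param

    @pytest.mark.parametrize("num_found", [10000, 1000001], indirect=True)
    def test_num_found(num_found):
        # The fixture, not the decorator, delivers the value to the test
        assert num_found in (10000, 1000001)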
From a8a55c32bf894854dd1ed3b3b754e2e011574c88 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Thu, 19 Sep 2024 16:49:59 +0100
Subject: [PATCH 08/64] Adjusted number of calls of mock_batch_solr_generator

---
 impc_api_helper/tests/test_iterator_solr_request.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index a0ae503..91f00a1 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -135,10 +135,9 @@ def test_batch_solr_request_download_true(self,
         result = batch_solr_request(core, params=params_format, download=True, path_to_download=temp_dir)
         num_found = mock_solr_request.return_value[0]
-        # TODO: Check batch_solr_generator gets called at least twice with increasing start

         # Check _batch_solr_generator gets called once with correct args
-        mock_batch_solr_generator.assert_called_with(core, params_format, num_found)
+        mock_batch_solr_generator.assert_called_once_with(core, params_format, num_found)

From 7f153bf0ab4b35c075b842743d7442b0dab72c9d Mon Sep 17 00:00:00 2001
From: dpavam
Date: Thu, 19 Sep 2024 16:50:57 +0100
Subject: [PATCH 09/64] Adds tests for _batch_to_df in a new set of classes

---
 .../tests/test_iterator_solr_request.py | 92 ++++++++++++++++---
 1 file changed, 80 insertions(+), 12 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index 91f00a1..564601b 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -4,6 +4,13 @@
 from impc_api_helper.iterator_solr_request_2 import batch_solr_request, _batch_solr_generator, solr_request, _batch_to_df
 import io
 import pandas as pd
+from pandas.testing import assert_frame_equal
+
+
+# Fixture containing the core
+@pytest.fixture
+def core():
+    return 'test_core'

 class TestBatchSolrRequest():
@@ -12,11 +19,6 @@ class TestBatchSolrRequest():
     def common_params(self):
         return {"start": 0, "rows": 10000, "wt": "json"}

-    # Fixture containing the core
-    @pytest.fixture
-    def core(self):
-        return 'test_core'
-
     # Fixture containing the solr_request function mock
@@ -157,13 +159,6 @@ def test_batch_solr_request_download_true(self, core, capsys, mock_solr_request,
         assert result.loc[1,'city'] == 'Prague'

-
-
-
-    # TODO:
-    # _batch_to_df.
-
     # Mock params
     @pytest.fixture
     def multiple_field_params(self):

+# Have helper functions in a different class to separate fixtures and parameters
+class TestHelpersSolrBatchRequest():
+    # Define a generator to produce DF's dynamically
+    def dataframe_generator(self):
+        """ Generator to produce dataframes dynamically (row by row).
+
+        Yields:
+            pd.DataFrame: Dataframe with one row of data
+        """
+        # Values for the dataframes
+        animals = ['Bull', 'Elephant', 'Rhino', 'Monkey', 'Snake']
+
+        # Construct the dataframe and yield it
+        for i, a in enumerate(animals):
+            yield pd.DataFrame(
+                {
+                    'id': [i],
+                    'animal': [a]
+                }
+            )
+
+    # Fixture containing the solr_request function mock
+    # Num_found is passed dynamically as params in the test
+    # The DF is returned dynamically using the generator
+    @pytest.fixture
+    def mock_solr_request_generator(self, request):
+        """ Patches solr_request for _batch_to_df _batch_solr_generator producing a df dynamically.
+        Creates a df in chunks (row by row) mocking incoming batches of responses.
+        """
+        with patch('impc_api_helper.iterator_solr_request_2.solr_request') as mock:
+            df_generator = self.dataframe_generator()
+
+            def side_effect(*args, **kwargs):
+                df = next(df_generator)
+                return request.param, df
+
+            mock.side_effect = side_effect
+            yield mock
+
+    # Fixture containing the params of a normal batch_solr_request
+    @pytest.fixture
+    def batch_params(self, rows):
+        return {"start": 0, "rows": rows, "wt": "json"}
+
+    @pytest.fixture
+    def num_found(self, request):
+        return request.param
+
+    # Parameters to be passed to the test: the num_found for mock_solr_request_generator, the num_found separately, and rows (batch_size).
+    # Note num_found is returned by solr_request; when we access it using the generator function, it causes issues.
+    # Hence, we pass num_found separately as a fixture.
+    @pytest.mark.parametrize("mock_solr_request_generator,num_found,rows", [
+        (50000, 50000, 10000),
+        (5, 5, 1),
+        (25000, 25000, 5000)
+    ],
+        indirect=['mock_solr_request_generator'])
+    def test_batch_to_df(self, core, batch_params, num_found, mock_solr_request_generator, rows):
+        # Call the tested function
+        df = _batch_to_df(core, batch_params, num_found)
+
+        # Assert solr_request was called with the expected params and increasing start
+        expected_calls = [
+            call(core=core, params={**batch_params, 'start': i * rows, 'rows': rows}, silent=True) for i in range(5)
+        ]
+        mock_solr_request_generator.assert_has_calls(expected_calls)
+
+        # Assert the structure of the final df
+        assert_frame_equal(df, pd.DataFrame(
+            {
+                'id': [0, 1, 2, 3, 4],
+                'animal': ['Bull', 'Elephant', 'Rhino', 'Monkey', 'Snake']
+            }
+        ).reset_index(drop=True))
+
+    # TODO:
+    # _batch_solr_generator
+    # _solr_downloader
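[Editorial note] As a sanity check on the paging logic test_batch_to_df exercises: with num_found=25000 and rows=5000, _batch_to_df should issue five requests whose 'start' values step by the batch size, matching the expected_calls above. A quick illustration:

    num_results, rows = 25000, 5000
    starts = list(range(0, num_results, rows))
    print(starts)  # [0, 5000, 10000, 15000, 20000]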
From 949751511876722f39dc7a78ac778bb38b5bbf86 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 24 Sep 2024 12:29:30 +0100
Subject: [PATCH 10/64] tests for _batch_solr_generator

---
 .../tests/test_iterator_solr_request.py | 113 +++++++++++++++++-
 1 file changed, 110 insertions(+), 3 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index 564601b..2c7cb57 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -1,10 +1,11 @@
 import pytest
 from pathlib import Path
-from unittest.mock import patch, call
+from unittest.mock import patch, call, MagicMock
 from impc_api_helper.iterator_solr_request_2 import batch_solr_request, _batch_solr_generator, solr_request, _batch_to_df
 import io
 import pandas as pd
 from pandas.testing import assert_frame_equal
+from .test_helpers import check_url_status_code_and_params


 # Fixture containing the core
@@ -259,7 +260,7 @@ def side_effect(*args, **kwargs):
             mock.side_effect = side_effect
             yield mock

-    # Fixture containing the params of a normal batch_solr_request
+    # Fixture containing the params of a normal batch_solr_request
     @pytest.fixture
     def batch_params(self, rows):
         return {"start": 0, "rows": rows, "wt": "json"}

+    # Test _batch_solr_generator
+    # This function uses the requests module so we need to patch it
+    # Main outcomes:
+    # The data is yielded back as json DONE
+    # The data is yielded back as text DONE
+    # The error raises an exception DONE
+    # The requests module is called multiple times
+    # The start param is increased by batch_size
+    # The URL and compound url should be checked.
+
+    # Fixture to mock the requests module
+    @pytest.fixture
+    def mock_requests_get(self, request):
+        with patch('impc_api_helper.iterator_solr_request_2.requests.get') as mock_get:
+            # Capture the format of the response
+            wt = request.param["wt"]
+            mock_get.return_value.format = wt
+
+            # Get the status code and
+            mock_get.return_value.status_code = request.param["status_code"]
+
+            # Get "response" data according to format
+            if wt == "json":
+                mock_get.return_value.json.return_value = request.param["response"]
+
+            elif wt == "csv":
+                mock_get.return_value.text = request.param["response"]
+
+            yield mock_get
+
+    # Fixture containing the params for batch_solr_generator
+    @pytest.fixture
+    def batch_solr_generator_params(self):
+        return {"start": 0, "rows": 2}
+
+    @pytest.mark.parametrize(
+        "mock_requests_get",
+        [
+            ({
+                "status_code": 200,
+                "response": {"response": {
+                    "docs": [
+                        {"id": "Gibson"},
+                        {"id": "Ibanez"},
+                        {"id": "Schecter"},
+                        {"id": "PRS"}
+                    ]
+                }},
+                "wt": "json"
+            }),
+            ({
+                "status_code": 200,
+                "response": "id\nGibson\nIbanez\nSchecter\nPRS",
+                "wt": "csv"
+            })
+        ],
+        indirect=['mock_requests_get']
+    )
+    def test_batch_solr_generator(self, core, batch_solr_generator_params, mock_requests_get):
+
+        # Define num_found
+        num_results = 4
+        # Define the wt and batch_size param for the test
+        batch_solr_generator_params["wt"] = mock_requests_get.return_value.format
+        batch_solr_generator_params["rows"] = 1
+
+        # Create the generator
+        result = _batch_solr_generator(core, batch_solr_generator_params, num_results)
+
+        # Assertions for json data
+        if batch_solr_generator_params["wt"] == "json":
+            assert next(result) == [
+                {"id": "Gibson"},
+                {"id": "Ibanez"},
+                {"id": "Schecter"},
+                {"id": "PRS"}
+            ]
+        # Assertions for csv
+        elif batch_solr_generator_params["wt"] == "csv":
+            assert next(result) == "id\nGibson\nIbanez\nSchecter\nPRS"
+
+        # General assertions
+        # Checks the url, status code, and params called are as expected.
+        check_url_status_code_and_params(
+            mock_response=mock_requests_get,
+            expected_status_code=200,
+            expected_core=core,
+            expected_params=batch_solr_generator_params,
+        )
+
+    # Simpler approach to test when status code is 404
+    @pytest.fixture
+    def mock_requests_get_error(self, request):
+        with patch('impc_api_helper.iterator_solr_request_2.requests.get') as mock_get:
+            mock_get.return_value.status_code = request.param
+            yield mock_get
+
+    # Set up test for _batch_solr_generator when status code is 404
+    @pytest.mark.parametrize("mock_requests_get_error", [404, 500], indirect=['mock_requests_get_error'])
+    def test_batch_solr_generator_error(self, core, batch_solr_generator_params, mock_requests_get_error):
+        # Get status code:
+        status_code = mock_requests_get_error.return_value.status_code
+        # Call the generator and expect an exception to be raised
+        # Note the num_found is passed but the number itself does not matter
+        # Note list() is needed so that the generator is iterated otherwise exception is never reached.
+        with pytest.raises(Exception, match=f"Request failed. Status code: {status_code}"):
+            _ = list(_batch_solr_generator(core=core, params=batch_solr_generator_params, num_results=4))
+
+    # TODO:
+    # _solr_downloader

From c6c0795484639482974e172fe9a3a600f26bfde6 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 24 Sep 2024 14:03:17 +0100
Subject: [PATCH 11/64] Modifies dataframe_generator to generate data
 dynamically instead of df's for reusability.

---
 .../tests/test_iterator_solr_request.py | 36 +++++++++++--------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index 2c7cb57..d842ec0 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -227,36 +227,44 @@ class TestHelpersSolrBatchRequest():
     # Define a generator to produce DF's dynamically
-    def dataframe_generator(self):
-        """ Generator to produce dataframes dynamically (row by row).
+    def data_generator(self):
+        """ Generator to produce data dynamically (row by row or doc by doc).

         Yields:
-            pd.DataFrame: Dataframe with one row of data
+            Tuple: tuple containing an id number and a value
         """
         # Values for the dataframes
         animals = ['Bull', 'Elephant', 'Rhino', 'Monkey', 'Snake']

-        # Construct the dataframe and yield it
-        for i, a in enumerate(animals):
-            yield pd.DataFrame(
-                {
-                    'id': [i],
-                    'animal': [a]
-                }
-            )
+        # Yield a tuple containing an id number and an animal string
+        for i, a in enumerate(animals):
+            yield (i, a)

     # Fixture containing the solr_request function mock
     # Num_found is passed dynamically as params in the test
-    # The DF is returned dynamically using the generator
+    # Generates df's dynamically using the data generator
     @pytest.fixture
     def mock_solr_request_generator(self, request):
         """ Patches solr_request for _batch_to_df _batch_solr_generator producing a df dynamically.
         Creates a df in chunks (row by row) mocking incoming batches of responses.
         """
         with patch('impc_api_helper.iterator_solr_request_2.solr_request') as mock:
-            df_generator = self.dataframe_generator()
+
+            # Call the generator
+            data_generator = self.data_generator()
+
+            # Use the side_effects to return num_found and the dfs
             def side_effect(*args, **kwargs):
-                df = next(df_generator)
+                # Get the tuple from the data generator
+                idx, animal = next(data_generator)
+                # Create a df
+                df = pd.DataFrame(
+                    {
+                        'id': [idx],
+                        'animal': [animal]
+                    }
+                )
                 return request.param, df
+
             mock.side_effect = side_effect
             yield mock

From a3b58fb1e49b0197eaff28cdb935b1e258c3cc06 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 12:57:55 +0100
Subject: [PATCH 12/64] Modifies _solr_downloader to write regular json
 instead of line-delimited json. Adapts test accordingly

---
 .../iterator_solr_request_2.py | 21 +++++--
 .../tests/test_iterator_solr_request.py | 62 ++++++++++++++++++-
 2 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index cb421d5..e9bc577 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -130,10 +130,21 @@ def _batch_solr_generator(core, params, num_results):
 # File writer
 def _solr_downloader(params, filename, solr_generator):
     with open(filename, "w", encoding="UTF-8") as f:
-        for chunk in solr_generator:
-            if params.get("wt") == "json":
+
+        if params.get("wt") == "json":
+            f.write("[\n")
+            first_item = True
+
+            for chunk in solr_generator:
+                # print('CHUNK',chunk,'\n')
                 for item in chunk:
-                    json.dump(item, f)
-                    f.write("\n")
-            else:
+                    if not first_item:
+                        f.write(",\n")
+                    # print('ITEM',item)
+                    json.dump(item, f, ensure_ascii=False)
+                    first_item = False
+            f.write("\n]\n")
+
+        else:
+            for chunk in solr_generator:
                 f.write(chunk)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index d842ec0..753b7ae 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -1,8 +1,6 @@
 import pytest
 from pathlib import Path
-from unittest.mock import patch, call, MagicMock
-from impc_api_helper.iterator_solr_request_2 import batch_solr_request, _batch_solr_generator, solr_request, _batch_to_df
-import io
+import json
 import pandas as pd
 from pandas.testing import assert_frame_equal
 from .test_helpers import check_url_status_code_and_params

+    @pytest.fixture
+    def mock_solr_generator(self, request):
+        format = request.param
+        if format == "json":
+            def data_chunks():
+                chunk_1 = [{"id": idx, "name": number} for idx, number in enumerate(range(0, 3))]
+                chunk_2 = [{"id": idx, "name": number} for idx, number in enumerate(range(100, 97, -1))]
+
+                yield chunk_1
+                yield chunk_2
+            yield data_chunks()
+        # elif format == "csv":
+        #     yield f"id,\n{animal}"
+
+
+    @pytest.mark.parametrize(
+        "mock_solr_generator, expected_format", [("json", "json")], indirect=["mock_solr_generator"]
+    )
+    def test_solr_downloader(self, mock_solr_generator, batch_solr_generator_params, expected_format, tmp_path):
+        # 1. Test that it gets called with the correct params
+        # 1a. test the filename is appropriate (path + core)
+        # 2. Check it iterates correctly over a generator
+        # 3. If json: check it writes the expected thing
+        # 4. If csv: check it writes the expected thing
+
+
+        # Need a generator to pass it data. We can use the data_generator
+        # A fixture that gives it the adequate data in json or csv format
+        # Call the function with some params, custom filename (from temp_path) and the generator
+        # Check the file contains what is expected (perhaps use parameters here)
+
+        solr_gen = mock_solr_generator
+        path = Path(tmp_path)
+        file = 'test.' + expected_format
+        test_file = path / file
+        # test_file = 'isaa.json'
+        _solr_downloader(params={**batch_solr_generator_params, "wt": expected_format},
+                         # filename= Path(tmp_path,'tmp_file'),
+                         filename=test_file,
+                         solr_generator=solr_gen)
+
+        # TODO: Finish the CSV part
+
+        # Read the downloaded file and check it contains the expected data
+        if expected_format == 'json':
+            with open(test_file, 'r') as f:
+                content = json.load(f)
+
+            assert content == [
+                {"id": 0, "name": 0},
+                {"id": 1, "name": 1},
+                {"id": 2, "name": 2},
+                {"id": 0, "name": 100},
+                {"id": 1, "name": 99},
+                {"id": 2, "name": 98}
+            ]
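[Editorial note] For context on the writer change above: a file written as a single JSON array is read with plain pd.read_json, whereas the earlier line-delimited layout needed lines=True. A small sketch (filenames are illustrative):

    import pandas as pd

    # Array layout produced by the new writer, e.g. [{"a": 1}, {"a": 2}]
    df = pd.read_json("data.json")

    # Line-delimited layout produced by the old writer, e.g. {"a": 1}\n{"a": 2}
    df_lines = pd.read_json("data_lines.json", lines=True)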
From 6c09fb96a18fdbf6934850b7e065c7c860fd0a18 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 14:37:20 +0100
Subject: [PATCH 13/64] fix: _solr_downloader csv writer to write chunks
 without repeated headers

---
 .../iterator_solr_request_2.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index e9bc577..ed770b0 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -133,18 +133,27 @@ def _solr_downloader(params, filename, solr_generator):

         if params.get("wt") == "json":
             f.write("[\n")
-            first_item = True
+            first_chunk = True

             for chunk in solr_generator:
                 # print('CHUNK',chunk,'\n')
                 for item in chunk:
-                    if not first_item:
+                    if not first_chunk:
                         f.write(",\n")
                     # print('ITEM',item)
                     json.dump(item, f, ensure_ascii=False)
-                    first_item = False
+                    first_chunk = False
             f.write("\n]\n")

-        else:
+        elif params.get("wt") == "csv":
+            first_chunk = True
             for chunk in solr_generator:
-                f.write(chunk)
+                lines = chunk.splitlines()
+                if first_chunk:
+                    # Write all lines in the first chunk
+                    f.write(chunk)
+                    first_chunk = False
+                else:
+                    # Skip the first line (header) in subsequent chunks
+                    f.write('\n' + '\n'.join(lines[1:]) + '\n')
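[Editorial note] Why the header line is skipped for every chunk after the first: each Solr CSV batch arrives with its own header row, so naive concatenation would repeat it. A minimal illustration of the splitlines()-based trimming used above:

    chunk_1 = "id,city\n1,Houston"
    chunk_2 = "id,city\n2,Prague"

    # Keep the first chunk whole; drop the header of every later chunk
    merged = chunk_1 + "\n" + "\n".join(chunk_2.splitlines()[1:]) + "\n"
    print(merged)  # id,city\n1,Houston\n2,Prague\n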
From 634adb8203bc21797af7061c7d51df629973f20c Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 14:39:12 +0100
Subject: [PATCH 14/64] test: adds _solr_downloader basic tests

---
 .../tests/test_iterator_solr_request.py | 112 ++++++++++++------
 1 file changed, 73 insertions(+), 39 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index 753b7ae..c0d95b5 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -420,56 +420,90 @@ def test_batch_solr_generator_error(self, core, batch_solr_generator_params, moc
     @pytest.fixture
     def mock_solr_generator(self, request):
         format = request.param
         if format == "json":
+
             def data_chunks():
-                chunk_1 = [{"id": idx, "name": number} for idx, number in enumerate(range(0, 3))]
-                chunk_2 = [{"id": idx, "name": number} for idx, number in enumerate(range(100, 97, -1))]
+                chunk_1 = [
+                    {"id": idx, "number": number}
+                    for idx, number in enumerate(range(0, 3))
+                ]
+                chunk_2 = [
+                    {"id": idx, "number": number}
+                    for idx, number in enumerate(range(100, 97, -1), start=3)
+                ]

                 yield chunk_1
                 yield chunk_2
+
             yield data_chunks()
-        # elif format == "csv":
-        #     yield f"id,\n{animal}"
+        elif format == "csv":

+            def data_chunks():
+                chunk_1 = "id,number\n" + "\n".join(
+                    f"{idx},{number}" for idx, number in enumerate(range(0, 3))
+                )
+                chunk_2 = "id,number\n" + "\n".join(
+                    f"{idx},{number}"
+                    for idx, number in enumerate(range(100, 97, -1), start=3)
+                )
+
+                yield chunk_1
+                yield chunk_2
+
+            yield data_chunks()

     @pytest.mark.parametrize(
-        "mock_solr_generator, expected_format", [("json", "json")], indirect=["mock_solr_generator"]
+        "mock_solr_generator, expected_format",
+        [("json", "json"), ("csv", "csv")],
+        indirect=["mock_solr_generator"],
     )
-    def test_solr_downloader(self, mock_solr_generator, batch_solr_generator_params, expected_format, tmp_path):
-        # 1. Test that it gets called with the correct params
-        # 1a. test the filename is appropriate (path + core)
-        # 2. Check it iterates correctly over a generator
-        # 3. If json: check it writes the expected thing
-        # 4. If csv: check it writes the expected thing
-
-        # Need a generator to pass it data. We can use the data_generator
-        # A fixture that gives it the adequate data in json or csv format
-        # Call the function with some params, custom filename (from temp_path) and the generator
-        # Check the file contains what is expected (perhaps use parameters here)
-
+    def test_solr_downloader(
+        self,
+        mock_solr_generator,
+        batch_solr_generator_params,
+        expected_format,
+        tmp_path,
+    ):
+        # Define the data generator and path to the temporary file to write
         solr_gen = mock_solr_generator
         path = Path(tmp_path)
-        file = 'test.' + expected_format
+        file = "test." + expected_format
         test_file = path / file
-        # test_file = 'isaa.json'
-        _solr_downloader(params={**batch_solr_generator_params, "wt": expected_format},
-                         # filename= Path(tmp_path,'tmp_file'),
-                         filename=test_file,
-                         solr_generator=solr_gen)
-
-        # TODO: Finish the CSV part
-
-        # Read the downloaded file and check it contains the expected data
+
+        # Call the tested function
+        _solr_downloader(
+            params={**batch_solr_generator_params, "wt": expected_format},
+            filename=test_file,
+            solr_generator=solr_gen,
+        )
+
+        # Read the downloaded file and check it contains the expected data for json and csv.
+        with open(test_file, "r", encoding="UTF-8") as f:
+            if expected_format == "json":
+                content = json.load(f)
+                assert content == [
+                    {"id": 0, "number": 0},
+                    {"id": 1, "number": 1},
+                    {"id": 2, "number": 2},
+                    {"id": 3, "number": 100},
+                    {"id": 4, "number": 99},
+                    {"id": 5, "number": 98},
+                ]
+                # Check it loads into a pd.DataFrame
+                test_df = pd.read_json(test_file)
+
+            elif expected_format == "csv":
+                content = f.read()
+
+                assert content == "id,number\n0,0\n1,1\n2,2\n3,100\n4,99\n5,98\n"
+                test_df = pd.read_csv(test_file)
+
+        # Assert the structure of the final df
+        assert_frame_equal(
+            test_df,
+            pd.DataFrame(
+                {
+                    "id": [0, 1, 2, 3, 4, 5],
+                    "number": [0, 1, 2, 100, 99, 98],
+                }
+            ).reset_index(drop=True),
+        )

From 28e2701f83d0384e39760293ef6a6bdde6cfeaa0 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 14:41:19 +0100
Subject: [PATCH 15/64] fix: loading bar _batch_solr_generator, printing URL
 and including all elements in num_results

---
 .../impc_api_helper/iterator_solr_request_2.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index ed770b0..01e57f3 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -110,7 +110,7 @@ def _batch_solr_generator(core, params, num_results):
     batch_size = params["rows"]

     with tqdm(total=num_results) as pbar:
-        while start < num_results:
+        while start <= num_results:
             params["start"] = start
             response = requests.get(solr_url, params=params, timeout=10)

@@ -119,12 +119,18 @@ def _batch_solr_generator(core, params, num_results):
                     data = response.json()["response"]["docs"]
                 else:
                     data = response.text
+
+                # Update and refresh the progress bar after getting the data
+                pbar.update(batch_size)
+                pbar.refresh()
                 yield data
+
             else:
                 raise Exception(f"Request failed. Status code: {response.status_code}")

-            pbar.update(batch_size)
+            # pbar.update(batch_size)
             start += batch_size
+    print(f"Your request URL after the last call:{response.url}")
From f147d587405985e74d834954afc14e378b05886c Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 15:40:30 +0100
Subject: [PATCH 16/64] fix: batch_solr_request download=True no longer reads
 recently generated files, only downloads. Test adapted accordingly.

---
 .../iterator_solr_request_2.py | 8 +-
 .../tests/test_iterator_solr_request.py | 76 +++++++++++--------
 2 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
index 01e57f3..fe1003c 100644
--- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
+++ b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py
@@ -44,12 +44,8 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do
         filename = Path(path_to_download) / f"{core}.{params['wt']}"
         gen = _batch_solr_generator(core, params, num_results)
         _solr_downloader(params, filename, gen)
-        print("Showing the first batch of results only. See downloaded file for the whole data.")
-        match params["wt"]:
-            case "json":
-                return pd.read_json(filename, nrows=batch_size, lines=True)
-            case "csv":
-                return pd.read_csv(filename, nrows=batch_size)
+        print(f"File saved as: {filename}")
+        return None

     # If the number of results is small enough and download is off, it's okay to show as df
     if num_results < 1000000 and not download:

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index c0d95b5..7dd2399 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -95,70 +95,84 @@ def test_batch_solr_request_no_download_large_request(self, core, common_params,
         assert "Exiting gracefully" in captured.out
         mock_batch_to_df.assert_not_called()

    # TEST DOWNLOAD TRUE
    # Fixture mocking the activity of _batch_solr_generator
    @pytest.fixture
    def mock_batch_solr_generator(self):
        with patch(
            "impc_api_helper.iterator_solr_request_2._batch_solr_generator"
        ) as mock:
            yield mock

    @pytest.fixture
    def mock_solr_downloader(self, tmp_path):
        with patch("impc_api_helper.iterator_solr_request_2._solr_downloader") as mock:
            temp_dir = Path(tmp_path) / "temp_dir"
            temp_dir.mkdir()
            yield mock

    # Mock response for test containing more than 2,000,000 docs
    @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True)
    # Parametrized decorator to simulate reading json and csv files
    @pytest.mark.parametrize(
        "params_format, format, file_content",
        [
            (
                {"start": 0, "rows": 2000000, "wt": "json"},
                "json",
                '[{"id": "1", "city": "Houston"},{"id": "2", "city": "Prague"}]',
            ),
            (
                {"start": 0, "rows": 2000000, "wt": "csv"},
                "csv",
                "id,city\n1,Houston\n2,Prague\n",
            ),
        ],
    )
    # This test should check the correct helper functions and print statements are called.
    # Calling the API and writing the file are tested within the helpers.
    def test_batch_solr_request_download_true(
        self,
        core,
        capsys,
        mock_solr_request,
        mock_batch_solr_generator,
        mock_solr_downloader,
        tmp_path,
        params_format,
        format,
        file_content,
    ):
        # Create temporary files for the test
        temp_dir = tmp_path / "temp_dir"
        filename = f"{core}.{format}"
        temp_file = temp_dir / filename
        temp_file.write_text(file_content)

        # First we call the function
        # We patch solr_request to get the number of docs
        result = batch_solr_request(
            core, params=params_format, download=True, path_to_download=temp_dir
        )
        num_found = mock_solr_request.return_value[0]

        # Check _batch_solr_generator gets called once with correct args
        mock_batch_solr_generator.assert_called_once_with(
            core, params_format, num_found
        )

        # Check _solr_downloader gets called once with correct args
        mock_solr_downloader.assert_called_once_with(
            params_format, temp_file, mock_batch_solr_generator.return_value
        )

        # Check the print statements
        captured = capsys.readouterr()
        assert f"Number of found documents: {num_found}" in captured.out
        assert f"File saved as: {temp_file}" in captured.out

        # Check the function returns None
        assert result is None
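[Editorial note] After this change the download path writes the file and returns None rather than re-reading it, so callers load the file themselves. A hedged usage sketch (core and import path follow the earlier temp.py example; the JSON-array layout comes from patch 12):

    import pandas as pd
    from iterator_solr_request_2 import batch_solr_request

    # Writes ./genotype-phenotype.json and returns None
    batch_solr_request(
        core="genotype-phenotype",
        params={"q": "*:*", "wt": "json"},
        download=True,
    )

    # The caller reads the downloaded file explicitly
    df = pd.read_json("genotype-phenotype.json")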
From 32ec2cc3a6ccd8e48207bc95c368b52dc5f6eca1 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 1 Oct 2024 15:51:47 +0100
Subject: [PATCH 17/64] fix: Adapts test_batch_solr_request_multiple_fields to
 changes in batch_solr_request where files are downloaded but not read.

---
 .../tests/test_iterator_solr_request.py | 110 +++++++++++-------
 1 file changed, 71 insertions(+), 39 deletions(-)

diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py
index 7dd2399..21dfd65 100644
--- a/impc_api_helper/tests/test_iterator_solr_request.py
+++ b/impc_api_helper/tests/test_iterator_solr_request.py
@@ -30,23 +30,25 @@ def mock_solr_request(self, request):
    # Pytest fixture mocking _batch_to_df
    @pytest.fixture
    def mock_batch_to_df(self):
        with patch("impc_api_helper.iterator_solr_request_2._batch_to_df") as mock:
            # Mock expected return content of the _batch_to_df (pd.DataFrame)
            mock.return_value = pd.DataFrame()
            yield mock

    # Parameters to determine the numFound of mock_solr_request
    @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True)
    def test_batch_solr_request_no_download_small_request(
        self, mock_solr_request, core, common_params, capsys, mock_batch_to_df
    ):
        # Call your function that uses the mocked request
        # Set up mock_solr_request values
        result = batch_solr_request(core, params=common_params, download=False)

        # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values.
        mock_solr_request.assert_called_with(
            core=core,
            params={**common_params, "start": 0, "rows": 0, "wt": "json"},
            silent=True,
        )

    # Mock params
    @pytest.fixture
    def multiple_field_params(self):
        return {
            "q": "*:*",
            "rows": 0,
            "start": 0,
            "field_list": ['"orange"', "apple", "*berry"],
            "field_type": "fruits",
            "wt": "json",
        }

    # Mock response for test containing more than 2,000,000 docs
    @pytest.mark.parametrize("mock_solr_request", [(2000000), (10000)], indirect=True)
    @pytest.mark.parametrize(
        "download_bool",
        [(True), (False)],
    )
    def test_batch_solr_request_multiple_fields(
        self,
        core,
        multiple_field_params,
        capsys,
        mock_solr_request,
        mock_batch_solr_generator,
        download_bool,
        monkeypatch,
        mock_batch_to_df,
        mock_solr_downloader,
        tmp_path,
    ):
        # This test should make sure the request is formatted properly, regardless of going to downloads or to _batch_to_df
        # Get num_found
        num_found = mock_solr_request.return_value[0]
        # In the case where download=False and numFound is > 1,000,000 we pass 'y' in this test case.
        if not download_bool and num_found == 2000000:
            monkeypatch.setattr("builtins.input", lambda _: "y")

        # Call test function
        # If download is True create a temporary file and call with the path_to_download
        if download_bool:
            temp_dir = tmp_path / "temp_dir"
            temp_file = temp_dir / f"{core}.json"
            temp_file.write_text('{"id": "1", "city": "Cape Town"}\n')
            result = batch_solr_request(
                core,
                params=multiple_field_params,
                download=download_bool,
                path_to_download=temp_dir,
            )
        else:
            # Otherwise, call without the path_to_download
            result = batch_solr_request(
                core, params=multiple_field_params, download=download_bool
            )

        # Check output which should be equal for both.
        captured = capsys.readouterr()
        assert f"Number of found documents: {num_found}" in captured.out
        assert 'Queried field: fruits:("orange" OR apple OR *berry)' in captured.out

        # If download was true, check subsequent functions were executed
        if download_bool:
            # Check _batch_solr_generator gets called once with correct args
            mock_batch_solr_generator.assert_called_with(
                core, multiple_field_params, num_found
            )

            # Check _solr_downloader gets called once with correct args
            mock_solr_downloader.assert_called_once_with(
                multiple_field_params, temp_file, mock_batch_solr_generator.return_value
            )

            # Check the function returns None
            assert result is None

        # Otherwise, use the 'y' input at the start of the test and make sure the required function is executed.
        if not download_bool and num_found == 2000000:
            assert (
                "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches"
                in captured.out
            )
            # Check _batch_to_df was called with correct params
            mock_batch_to_df.assert_called_once_with(
                core, multiple_field_params, num_found
            )

            # Check the function returns a dataframe
            assert result is not None
            assert isinstance(result, pd.DataFrame) is True
We suggest setting 'download=True' and reading the file in batches" + in captured.out + ) # Check _batch_to_df was called with correct params - mock_batch_to_df.assert_called_once_with(core, multiple_field_params, num_found) + mock_batch_to_df.assert_called_once_with( + core, multiple_field_params, num_found + ) + + # Check the function returns a dataframe + assert result is not None + assert isinstance(result, pd.DataFrame) is True + # Have helper functions in a different class to separate fixtures and parameters class TestHelpersSolrBatchRequest(): From 2b93b986b8b558fabdff21c9ba8d1f0128c72b2b Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 1 Oct 2024 16:39:42 +0100 Subject: [PATCH 18/64] test: adds baisc test set for _batch_solr_request --- .../tests/test_iterator_solr_request.py | 166 ++++++++++-------- 1 file changed, 91 insertions(+), 75 deletions(-) diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py index 21dfd65..07f7c0b 100644 --- a/impc_api_helper/tests/test_iterator_solr_request.py +++ b/impc_api_helper/tests/test_iterator_solr_request.py @@ -351,116 +351,132 @@ def test_batch_to_df(self, core, batch_params, num_found, mock_solr_request_gene # Test _batch_solr_generator - # This function uses the requests module so we need to patch it - # Main outcomes: - # The data is yielded back as json DONE - # The data is yielded back as text DONE - # The error raises and exception DONE - # The requests module is called multiple times - # The start param is increased by batch_size - # The URL, and compount url should be checked. - # Fixture to mock the requests module @pytest.fixture def mock_requests_get(self, request): - with patch('impc_api_helper.iterator_solr_request_2.requests.get') as mock_get: + with patch("impc_api_helper.iterator_solr_request_2.requests.get") as mock_get: # Capture the format of the response wt = request.param["wt"] mock_get.return_value.format = wt - + # Get the status code and mock_get.return_value.status_code = request.param["status_code"] - # Get "response" data according to format - if wt == "json": - mock_get.return_value.json.return_value = request.param["response"] - - elif wt == "csv": - mock_get.return_value.text = request.param["response"] - + # Call the generator + data_generator = self.data_generator() + + # Use the side_effects to return num_found and the dfs + def side_effect(*args, **kwargs): + # Create a mock response object + mock_response = Mock() + mock_response.status_code = 200 + + # Get the tuple from the data generator + _, animal = next(data_generator) + + # Create type of response + # if json + if wt == "json": + mock_response.json.return_value = { + "response": {"docs": [{"id": animal}]} + } + # if csv + if wt == "csv": + mock_response.text = f"id,\n{animal}" + + return mock_response + + # Assign the side effect + mock_get.side_effect = side_effect + yield mock_get + # Fixture containing the params for batch_solr_generator @pytest.fixture def batch_solr_generator_params(self): - return {"start": 0, "rows": 2} + return {"start": 0, "rows": 1} + @pytest.mark.parametrize( - "mock_requests_get", - [ - ({ - "status_code": 200, - "response": {"response": { - "docs": [ - {"id": "Gibson"}, - {"id": "Ibanez"}, - {"id": "Schecter"}, - {"id": "PRS"} - ] - }}, - "wt": "json" - }), - ({ - "status_code": 200, - "response": "id\nGibson\nIbanez\nSchecter\nPRS", - "wt": "csv" - }) + "mock_requests_get,expected_results", + [ + ( + {"wt": "json", "status_code": 200}, + [ + [{"id": 
"Bull"}], + [{"id": "Elephant"}], + [{"id": "Rhino"}], + [{"id": "Monkey"}], + [{"id": "Snake"}], + ], + ), + ( + {"wt": "csv", "status_code": 200}, + [ + "id,\nBull", + "id,\nElephant", + "id,\nRhino", + "id,\nMonkey", + "id,\nSnake", + ], + ), ], - indirect=['mock_requests_get'] + indirect=["mock_requests_get"], ) - def test_batch_solr_generator(self, core, batch_solr_generator_params, mock_requests_get): - + def test_batch_solr_generator( + self, core, batch_solr_generator_params, mock_requests_get, expected_results + ): # Define num_found - num_results = 4 + num_results = 5 # Define the wt and batch_size param for the test batch_solr_generator_params["wt"] = mock_requests_get.return_value.format - batch_solr_generator_params["rows"] = 1 + rows = batch_solr_generator_params["rows"] # Create the generator result = _batch_solr_generator(core, batch_solr_generator_params, num_results) - # Assertions for json data - if batch_solr_generator_params["wt"] == "json": - assert next(result) == [ - {"id": "Gibson"}, - {"id": "Ibanez"}, - {"id": "Schecter"}, - {"id": "PRS"} - ] - # Assertions for csv - elif batch_solr_generator_params["wt"] == "csv": - assert next(result) == "id\nGibson\nIbanez\nSchecter\nPRS" - - # General assertions - # Checks the url, status code, and params called are as expected. - check_url_status_code_and_params( - mock_response=mock_requests_get, - expected_status_code=200, - expected_core=core, - expected_params=batch_solr_generator_params, - ) + # # Assertions for json data + # if batch_solr_generator_params["wt"] == "json": + # Loop over the expected results and check corresponding calls + for idx, exp_result in enumerate(expected_results): + # Call the next iteration + + assert next(result) == exp_result + + # Check requests.get was called with the correct url, params [especially, the 'start' param], and timeout. + mock_requests_get.assert_called_with( + "https://www.ebi.ac.uk/mi/impc/solr/test_core/select", + params={**batch_solr_generator_params, "start": idx, "rows": rows}, + timeout=10, + ) - # Simpler approach to test when status code is 404 + # Simpler approach to test when status code is not 200 @pytest.fixture def mock_requests_get_error(self, request): - with patch('impc_api_helper.iterator_solr_request_2.requests.get') as mock_get: + with patch("impc_api_helper.iterator_solr_request_2.requests.get") as mock_get: mock_get.return_value.status_code = request.param yield mock_get - # Set up test for _batch_solr_generator when status code is 404 - @pytest.mark.parametrize("mock_requests_get_error", [404, 500], indirect=['mock_requests_get_error']) - def test_batch_solr_generator_error(self, core, batch_solr_generator_params, mock_requests_get_error): + # Set up test for _batch_solr_generator when status code is not 200 + @pytest.mark.parametrize( + "mock_requests_get_error", [404, 500], indirect=["mock_requests_get_error"] + ) + def test_batch_solr_generator_error( + self, core, batch_solr_generator_params, mock_requests_get_error + ): # Get status code: status_code = mock_requests_get_error.return_value.status_code # Call the generator and expect an exception to be raised # Note the num_found is passed but the number itself does not matter # Note list() is needed so that the generator is iterated otherwise exception is never reached. - with pytest.raises(Exception, match=f"Request failed. 
Status code: {status_code}"): - _ = list(_batch_solr_generator(core=core, params=batch_solr_generator_params, num_results=4)) - - - - # TODO: - # _solr_downloader + with pytest.raises( + Exception, match=f"Request failed. Status code: {status_code}" + ): + _ = list( + _batch_solr_generator( + core=core, params=batch_solr_generator_params, num_results=4 + ) + ) @pytest.fixture def mock_solr_generator(self, request): From aef72971c2c01c0c1bb50be651b3e46fc7198cd4 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 15:21:48 +0100 Subject: [PATCH 19/64] feat: implementation of new batch_solr_request --- impc_api_helper/impc_api_helper/__init__.py | 6 +- .../impc_api_helper/iterator_solr_request.py | 285 +++++++++--------- .../iterator_solr_request_2.py | 161 ---------- 3 files changed, 153 insertions(+), 299 deletions(-) delete mode 100644 impc_api_helper/impc_api_helper/iterator_solr_request_2.py diff --git a/impc_api_helper/impc_api_helper/__init__.py b/impc_api_helper/impc_api_helper/__init__.py index 4531621..f091c93 100644 --- a/impc_api_helper/impc_api_helper/__init__.py +++ b/impc_api_helper/impc_api_helper/__init__.py @@ -1,5 +1,5 @@ -from .solr_request import solr_request, batch_request -from .iterator_solr_request import iterator_solr_request +from .solr_request import solr_request +from .iterator_solr_request import batch_solr_request # Control what gets imported by client -__all__ = ["solr_request", "batch_request", "iterator_solr_request"] +__all__ = ["solr_request", "batch_solr_request"] diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request.py b/impc_api_helper/impc_api_helper/iterator_solr_request.py index 6967384..c02f68c 100644 --- a/impc_api_helper/impc_api_helper/iterator_solr_request.py +++ b/impc_api_helper/impc_api_helper/iterator_solr_request.py @@ -1,145 +1,160 @@ -import csv +from IPython.display import display import json - +import pandas as pd import requests +from tqdm import tqdm +from .solr_request import solr_request +from pathlib import Path +def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'): + # Set params for batch request + params["start"] = 0 # Start at the first result + params["rows"] = batch_size # Fetch results in chunks of 5000 -# Helper function to fetch results. This function is used by the 'iterator_solr_request' function. -def entity_iterator(base_url, params): - """Generator function to fetch results from the SOLR server in chunks using pagination - - Args: - base_url (str): The base URL of the Solr server to fetch documents from. - params (dict): A dictionary of parameters to include in the GET request. Must include - 'start' and 'rows' keys, which represent the index of the first document - to fetch and the number of documents to fetch per request, respectively. - - Yields: - dict: The next document in the response from the Solr server. 
- """ - # Initialise variable to check the first request - first_request = True - - # Call the API in chunks and yield the documents in each chunk - while True: - response = requests.get(base_url, params=params) - data = response.json() - docs = data["response"]["docs"] - - # Print the first request only - if first_request: - print(f"Your first request: {response.url}") - first_request = False - - # Yield the documents in the current chunk - for doc in docs: - yield doc - - # Check if there are more results to fetch - start = params["start"] + params["rows"] - num_found = data["response"]["numFound"] - if start >= num_found: - break - - # Update the start parameter for the next request - params["start"] = start - - # Print last request and total number of documents retrieved - print(f"Your last request: {response.url}") - print(f'Number of found documents: {data["response"]["numFound"]}\n') - - -# Function to iterate over field list and write results to a file. -def iterator_solr_request( - core, params, filename="iteration_solr_request", format="json" -): - """Function to fetch results in batches from the Solr API and write them to a file. - Defaults to fetching 5000 rows at a time. - Avoids cluttering local memory, ideal for large requests. - - Args: - core (str): The name of the Solr core to fetch results from. - params (dict): A dictionary of parameters to use in the filter query. Must include - 'field_list' and 'field_type' keys, which represent the list of field items (i.e., list of MGI model identifiers) - to fetch and the type of the field (i.e., model_id) to filter on, respectively. - filename (str): The name of the file/path to write the results to. Defaults to 'iteration_solr_request'. - format (str): The format of the output file. Can be 'csv' or 'json'. Defaults to 'json'. - - Returns: None - - Example use case: - # List of model IDs. - models = ['MGI:3587188', 'MGI:3587185', 'MGI:3605874', 'MGI:2668213'] - - # Call iterator function - iterator_solr_request( - core='phenodigm', - params = { - 'q': 'type:disease_model_summary', - 'fl': 'model_id,marker_id,disease_id', - 'field_list': models, - 'field_type': 'model_id' - }, - filename='model_ids', - format='csv' - ) - """ + # If user did not specify format, defaults to json. + if params.get("wt") is None: + params["wt"] = "json" - # Validate format - if format not in ["json", "csv"]: - raise ValueError("Invalid format. Please use 'json' or 'csv'") + # Check if it's multiple request + if params.get("field_list") is not None: + # Extract entities_list and entity_type from params + field_list = params.pop("field_list") + field_type = params.pop("field_type") - # Base URL - base_url = "https://www.ebi.ac.uk/mi/impc/solr/" - solr_url = base_url + core + "/select" + # Construct the filter query with grouped model IDs + fq = "{}:({})".format( + field_type, " OR ".join(["{}".format(id) for id in field_list]) + ) + # Show users the field and field values they passed to the function + print("Queried field:", fq) + # Set internal params the users should not change + params["fq"] = fq + + # Determine the total number of rows. Note that we do not request any data (rows = 0). + num_results, _ = solr_request( + core=core, params={**params, "start": 0, "rows": 0, "wt": "json"}, silent=True + ) + print(f"Number of found documents: {num_results}") + + # Download/display only logic + # If user decides to download, a generator is used to fetch data in batches without storing results in memory. 
+ if download: + # Implement loop behaviour + filename = Path(path_to_download) / f"{core}.{params['wt']}" + gen = _batch_solr_generator(core, params, num_results) + _solr_downloader(params, filename, gen) + print(f"File saved as: {filename}") + return None + + # If the number of results is small enough and download is off, it's okay to show as df + if num_results < 1000000 and not download: + return _batch_to_df(core, params, num_results) + + # If it's too big, warn the user and ask if they want to proceed. + else: + print( + "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" + ) + prompt = input( + "Do you wish to proceed anyway? press ('y' or enter to proceed) / type('n' or 'exit' to cancel)" + ) + match prompt: + case "n" | "exit": + print("Exiting gracefully") + exit() + case "y" | '': + print("Fetching data...") + return _batch_to_df(core, params, num_results) - # Extract entities_list and entity_type from params - field_list = params.pop("field_list") - field_type = params.pop("field_type") - # Construct the filter query with grouped model IDs - fq = "{}:({})".format( - field_type, " OR ".join(['"{}"'.format(id) for id in field_list]) - ) - # Show users the field and field values they passed to the function - print("Queried field:", fq) - # Set internal params the users should not change - params["fq"] = fq + +# Helper batch_to_df +def _batch_to_df(core, params, num_results): + start = params["start"] + batch_size = params["rows"] + chunks = [] + # If the 'wt' param was changed by error, we set it to 'json' params["wt"] = "json" - params["start"] = 0 # Start at the first result - params["rows"] = 5000 # Fetch results in chunks of 5000 - - try: - # Fetch results using a generator function - results_generator = entity_iterator(solr_url, params) - except Exception as e: - raise Exception("An error occurred while downloading the data: " + str(e)) - - # Append extension to the filename - filename = f"{filename}.{format}" - - try: - # Open the file in write mode - with open(filename, "w", newline="") as f: - if format == "csv": - writer = None - for item in results_generator: - # Initialize the CSV writer with the keys of the first item as the field names - if writer is None: - writer = csv.DictWriter(f, fieldnames=item.keys()) - writer.writeheader() - # Write the item to the CSV file - writer.writerow(item) - # Write to json without loading to memory. - elif format == "json": - f.write("[") - for i, item in enumerate(results_generator): - if i != 0: - f.write(",") - json.dump(item, f) - f.write("]") - except Exception as e: - raise Exception("An error occurred while writing the file: " + str(e)) - - print(f"File {filename} was created.") + + # Request chunks until we have complete data. + with tqdm(total=num_results) as pbar: + while start < num_results: + # Update progress bar with the number of rows requested. + pbar.update(batch_size) + + # Request chunk. We don't need num_results anymore because it does not change. + _, df_chunk = solr_request( + core=core, + params={**params, "start": start, "rows": batch_size}, + silent=True, + ) + + # Record chunk. + chunks.append(df_chunk) + # Increment start. + start += batch_size + # Prepare final dataframe. 
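Before the final `pd.concat` below, the start/rows arithmetic this loop relies on can be checked in isolation; the numbers here are invented purely for illustration:

```
# Invented values illustrating the pagination above.
num_results, batch_size = 12345, 5000

starts = list(range(0, num_results, batch_size))
print(starts)       # [0, 5000, 10000]
print(len(starts))  # 3 requests cover all 12,345 documents
```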
+ return pd.concat(chunks, ignore_index=True) + + +def _batch_solr_generator(core, params, num_results): + base_url = "https://www.ebi.ac.uk/mi/impc/solr/" + solr_url = base_url + core + "/select" + start = params["start"] + batch_size = params["rows"] + + with tqdm(total=num_results) as pbar: + while start <= num_results: + params["start"] = start + response = requests.get(solr_url, params=params, timeout=10) + + if response.status_code == 200: + if params.get("wt") == "json": + data = response.json()["response"]["docs"] + else: + data = response.text + + # Update and refresh the progress bar after getting the data + pbar.update(batch_size) + pbar.refresh() + yield data + + else: + raise Exception(f"Request failed. Status code: {response.status_code}") + + # pbar.update(batch_size) + start += batch_size + print(f"Your request URL after the last call:{response.url}") + + +# File writer +def _solr_downloader(params, filename, solr_generator): + with open(filename, "w", encoding="UTF-8") as f: + + if params.get("wt") == "json": + f.write("[\n") + first_chunk = True + + for chunk in solr_generator: + # print('CHUNK',chunk,'\n') + for item in chunk: + if not first_chunk: + f.write(",\n") + # print('ITEM',item) + json.dump(item, f, ensure_ascii=False) + first_chunk = False + f.write("\n]\n") + + elif params.get("wt") == "csv": + first_chunk = True + for chunk in solr_generator: + lines = chunk.splitlines() + if first_chunk: + # Write all lines in the first chunk + f.write(chunk) + first_chunk = False + else: + # Skip the first line (header) in subsequent chunks + f.write('\n' + '\n'.join(lines[1:]) + '\n') + diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py b/impc_api_helper/impc_api_helper/iterator_solr_request_2.py deleted file mode 100644 index fe1003c..0000000 --- a/impc_api_helper/impc_api_helper/iterator_solr_request_2.py +++ /dev/null @@ -1,161 +0,0 @@ -from IPython.display import display -import json -import pandas as pd -import requests -from tqdm import tqdm -from solr_request import solr_request -import logging -from pathlib import Path - -def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'): - # Set params for batch request - params["start"] = 0 # Start at the first result - params["rows"] = batch_size # Fetch results in chunks of 5000 - - # If user did not specify format, defaults to json. - if params.get("wt") is None: - params["wt"] = "json" - - # Check if it's multiple request - if params.get("field_list") is not None: - # Extract entities_list and entity_type from params - field_list = params.pop("field_list") - field_type = params.pop("field_type") - - # Construct the filter query with grouped model IDs - fq = "{}:({})".format( - field_type, " OR ".join(["{}".format(id) for id in field_list]) - ) - # Show users the field and field values they passed to the function - print("Queried field:", fq) - # Set internal params the users should not change - params["fq"] = fq - - # Determine the total number of rows. Note that we do not request any data (rows = 0). - num_results, _ = solr_request( - core=core, params={**params, "start": 0, "rows": 0, "wt": "json"}, silent=True - ) - print(f"Number of found documents: {num_results}") - - # Download/display only logic - # If user decides to download, a generator is used to fetch data in batches without storing results in memory. 
- if download: - # Implement loop behaviour - filename = Path(path_to_download) / f"{core}.{params['wt']}" - gen = _batch_solr_generator(core, params, num_results) - _solr_downloader(params, filename, gen) - print(f"File saved as: {filename}") - return None - - # If the number of results is small enough and download is off, it's okay to show as df - if num_results < 1000000 and not download: - return _batch_to_df(core, params, num_results) - - # If it's too big, warn the user and ask if they want to proceed. - else: - print( - "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" - ) - prompt = input( - "Do you wish to proceed anyway? press ('y' or enter to proceed) / type('n' or 'exit' to cancel)" - ) - match prompt: - case "n" | "exit": - print("Exiting gracefully") - exit() - case "y" | '': - print("Fetching data...") - return _batch_to_df(core, params, num_results) - - - - -# Helper batch_to_df -def _batch_to_df(core, params, num_results): - start = params["start"] - batch_size = params["rows"] - chunks = [] - # If the 'wt' param was changed by error, we set it to 'json' - params["wt"] = "json" - - # Request chunks until we have complete data. - with tqdm(total=num_results) as pbar: - while start < num_results: - # Update progress bar with the number of rows requested. - pbar.update(batch_size) - - # Request chunk. We don't need num_results anymore because it does not change. - _, df_chunk = solr_request( - core=core, - params={**params, "start": start, "rows": batch_size}, - silent=True, - ) - - # Record chunk. - chunks.append(df_chunk) - # Increment start. - start += batch_size - # Prepare final dataframe. - return pd.concat(chunks, ignore_index=True) - - -def _batch_solr_generator(core, params, num_results): - base_url = "https://www.ebi.ac.uk/mi/impc/solr/" - solr_url = base_url + core + "/select" - start = params["start"] - batch_size = params["rows"] - - with tqdm(total=num_results) as pbar: - while start <= num_results: - params["start"] = start - response = requests.get(solr_url, params=params, timeout=10) - - if response.status_code == 200: - if params.get("wt") == "json": - data = response.json()["response"]["docs"] - else: - data = response.text - - # Update and refresh the progress bar after getting the data - pbar.update(batch_size) - pbar.refresh() - yield data - - else: - raise Exception(f"Request failed. 
Status code: {response.status_code}") - - # pbar.update(batch_size) - start += batch_size - print(f"Your request URL after the last call:{response.url}") - - -# File writer -def _solr_downloader(params, filename, solr_generator): - with open(filename, "w", encoding="UTF-8") as f: - - if params.get("wt") == "json": - f.write("[\n") - first_chunk = True - - for chunk in solr_generator: - # print('CHUNK',chunk,'\n') - for item in chunk: - if not first_chunk: - f.write(",\n") - # print('ITEM',item) - json.dump(item, f, ensure_ascii=False) - first_chunk = False - f.write("\n]\n") - - elif params.get("wt") == "csv": - first_chunk = True - for chunk in solr_generator: - lines = chunk.splitlines() - if first_chunk: - # Write all lines in the first chunk - f.write(chunk) - first_chunk = False - else: - # Skip the first line (header) in subsequent chunks - f.write('\n' + '\n'.join(lines[1:]) + '\n') - From e545bb258cdf53b64139bcf206085bf298400042 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 15:27:45 +0100 Subject: [PATCH 20/64] chore: changes module name iterator_solr_request to batch_solr_request and linted test module --- impc_api_helper/impc_api_helper/__init__.py | 2 +- ..._solr_request.py => batch_solr_request.py} | 0 ..._request.py => test_batch_solr_request.py} | 149 ++++++++++-------- 3 files changed, 88 insertions(+), 63 deletions(-) rename impc_api_helper/impc_api_helper/{iterator_solr_request.py => batch_solr_request.py} (100%) rename impc_api_helper/tests/{test_iterator_solr_request.py => test_batch_solr_request.py} (84%) diff --git a/impc_api_helper/impc_api_helper/__init__.py b/impc_api_helper/impc_api_helper/__init__.py index f091c93..3c83952 100644 --- a/impc_api_helper/impc_api_helper/__init__.py +++ b/impc_api_helper/impc_api_helper/__init__.py @@ -1,5 +1,5 @@ from .solr_request import solr_request -from .iterator_solr_request import batch_solr_request +from .batch_solr_request import batch_solr_request # Control what gets imported by client __all__ = ["solr_request", "batch_solr_request"] diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py similarity index 100% rename from impc_api_helper/impc_api_helper/iterator_solr_request.py rename to impc_api_helper/impc_api_helper/batch_solr_request.py diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py similarity index 84% rename from impc_api_helper/tests/test_iterator_solr_request.py rename to impc_api_helper/tests/test_batch_solr_request.py index 07f7c0b..45cb220 100644 --- a/impc_api_helper/tests/test_iterator_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -1,5 +1,13 @@ import pytest from pathlib import Path +from unittest.mock import patch, call, Mock +from impc_api_helper.batch_solr_request import ( + batch_solr_request, + _batch_solr_generator, + solr_request, + _batch_to_df, + _solr_downloader, +) import json import pandas as pd from pandas.testing import assert_frame_equal @@ -9,20 +17,20 @@ # Fixture containing the core @pytest.fixture def core(): - return 'test_core' + return "test_core" -class TestBatchSolrRequest(): +class TestBatchSolrRequest: # Fixture containing the params of a normal batch_solr_request @pytest.fixture def common_params(self): return {"start": 0, "rows": 10000, "wt": "json"} - + # Fixture containing the solr_request function mock # We will be mocking solr_request with different numbers of numFound, therefore it is passed as param 
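The fixture that follows depends on pytest's indirect parametrization, which is easy to misread; a minimal self-contained sketch of the mechanism, with invented names:

```
import pytest

# request.param receives whatever value indirect parametrization routes in.
@pytest.fixture
def num_found_fixture(request):
    return request.param

# indirect=True sends each value through the fixture, not the test itself.
@pytest.mark.parametrize("num_found_fixture", [10000, 1000001], indirect=True)
def test_num_found(num_found_fixture):
    assert num_found_fixture in (10000, 1000001)
```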
@pytest.fixture def mock_solr_request(self, request): - with patch('impc_api_helper.iterator_solr_request_2.solr_request') as mock: + with patch("impc_api_helper.batch_solr_request.solr_request") as mock: # Mock expected return content of the solr_request (numFound and _) mock.return_value = (request.param, pd.DataFrame()) yield mock @@ -30,7 +38,7 @@ def mock_solr_request(self, request): # Pytest fixture mocking _batch_to_df @pytest.fixture def mock_batch_to_df(self): - with patch("impc_api_helper.iterator_solr_request_2._batch_to_df") as mock: + with patch("impc_api_helper.batch_solr_request._batch_to_df") as mock: # Mock expected return content of the _batch_to_df (pd.DataFrame) mock.return_value = pd.DataFrame() yield mock @@ -59,41 +67,57 @@ def test_batch_solr_request_no_download_small_request( # Check _batch_to_df was called mock_batch_to_df.assert_called_once() - # Set mock_solr_request to return a large numFound @pytest.mark.parametrize("mock_solr_request", [1000001], indirect=True) # Parameter to test 4 cases: when user selects 'y' or 'n' upon large download warning. - @pytest.mark.parametrize("user_input,expected_outcome", [ - ('y', 'continue'), - ('', 'continue'), - ('n', 'exit'), - ('exit', 'exit') - ]) - def test_batch_solr_request_no_download_large_request(self, core, common_params, capsys, monkeypatch, mock_batch_to_df, mock_solr_request, user_input, expected_outcome): + @pytest.mark.parametrize( + "user_input,expected_outcome", + [("y", "continue"), ("", "continue"), ("n", "exit"), ("exit", "exit")], + ) + def test_batch_solr_request_no_download_large_request( + self, + core, + common_params, + capsys, + monkeypatch, + mock_batch_to_df, + mock_solr_request, + user_input, + expected_outcome, + ): # Monkeypatch the input() function with parametrized user input - monkeypatch.setattr('builtins.input', lambda _: user_input) + monkeypatch.setattr("builtins.input", lambda _: user_input) # When user selects 'n', exit should be triggered. - if expected_outcome == 'exit': + if expected_outcome == "exit": with pytest.raises(SystemExit): - batch_solr_request(core, params=common_params, download=False, batch_size=5000) + batch_solr_request( + core, params=common_params, download=False, batch_size=5000 + ) else: - result = batch_solr_request(core, params=common_params, download=False, batch_size=5000) + result = batch_solr_request( + core, params=common_params, download=False, batch_size=5000 + ) - # Capture the exit messages + # Capture the exit messages captured = capsys.readouterr() - + # Assertions for continue case num_found = mock_solr_request.return_value[0] assert f"Number of found documents: {num_found}" in captured.out - if expected_outcome == 'continue': - assert "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" in captured.out - mock_batch_to_df.assert_called_once_with('test_core', {'start': 0, 'rows': 5000, 'wt': 'json'}, 1000001) + if expected_outcome == "continue": + assert ( + "Your request might exceed the available memory. 
We suggest setting 'download=True' and reading the file in batches" + in captured.out + ) + mock_batch_to_df.assert_called_once_with( + "test_core", {"start": 0, "rows": 5000, "wt": "json"}, 1000001 + ) # Assertion for exit case - elif expected_outcome == 'exit': + elif expected_outcome == "exit": assert "Exiting gracefully" in captured.out mock_batch_to_df.assert_not_called() @@ -102,13 +126,13 @@ def test_batch_solr_request_no_download_large_request(self, core, common_params, @pytest.fixture def mock_batch_solr_generator(self): with patch( - "impc_api_helper.iterator_solr_request_2._batch_solr_generator" + "impc_api_helper.batch_solr_request._batch_solr_generator" ) as mock: yield mock @pytest.fixture def mock_solr_downloader(self, tmp_path): - with patch("impc_api_helper.iterator_solr_request_2._solr_downloader") as mock: + with patch("impc_api_helper.batch_solr_request._solr_downloader") as mock: temp_dir = Path(tmp_path) / "temp_dir" temp_dir.mkdir() yield mock @@ -269,30 +293,29 @@ def test_batch_solr_request_multiple_fields( # Have helper functions in a different class to separate fixtures and parameters -class TestHelpersSolrBatchRequest(): +class TestHelpersSolrBatchRequest: # Define a generator to produce DF's dynamically def data_generator(self): - """ Generator to produce data dynamically (row by row or doc by doc)/ + """Generator to produce data dynamically (row by row or doc by doc)/ Yields: Tuple: tuple containing an id number and a value """ # Values for the dataframes - animals = ['Bull', 'Elephant', 'Rhino', 'Monkey', 'Snake'] + animals = ["Bull", "Elephant", "Rhino", "Monkey", "Snake"] # Yield a tuple containing an id number and an animal string for i, a in enumerate(animals): yield (i, a) - + # Fixture containing the solr_request function mock # Num_found is passed dynamically as params in the test # Generates df's dynamically using the data generator @pytest.fixture def mock_solr_request_generator(self, request): - """ Patches solr_request for _batch_to_df _batch_solr_generator producing a df dynamically. - Creates a df in chunks (row by row) mocking incoming batches of responses. + """Patches solr_request for _batch_to_df _batch_solr_generator producing a df dynamically. + Creates a df in chunks (row by row) mocking incoming batches of responses. """ - with patch('impc_api_helper.iterator_solr_request_2.solr_request') as mock: - + with patch("impc_api_helper.batch_solr_request.solr_request") as mock: # Call the generator data_generator = self.data_generator() @@ -301,14 +324,9 @@ def side_effect(*args, **kwargs): # Get the tuple from the data generator idx, animal = next(data_generator) # Create a df - df = pd.DataFrame( - { - 'id': [idx], - 'animal': [animal] - } - ) + df = pd.DataFrame({"id": [idx], "animal": [animal]}) return request.param, df - + mock.side_effect = side_effect yield mock @@ -316,45 +334,52 @@ def side_effect(*args, **kwargs): @pytest.fixture def batch_params(self, rows): return {"start": 0, "rows": rows, "wt": "json"} - + @pytest.fixture def num_found(self, request): return request.param # Parameters to be passsed to the test: the num_found for mock_solr_request_generator, the num_found separately, and rows (batch_size). # Note num_found is returned by solr_request, when we access it using the generator function, it causes issues. - # Hence, we pass num_found separately as a fixture. 
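The `side_effect` pattern above, where each call to the mock consumes one item from a generator, looks like this in miniature (names invented for the sketch):

```
from unittest.mock import Mock

def animals():
    yield from ["Bull", "Elephant", "Rhino"]

gen = animals()
mock = Mock(side_effect=lambda *args, **kwargs: next(gen))

# One generator item is consumed per call to the mock.
print(mock(), mock(), mock())  # Bull Elephant Rhino
```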
- @pytest.mark.parametrize("mock_solr_request_generator,num_found,rows", [ - (50000, 50000, 10000), - (5, 5, 1), - (25000, 25000, 5000) - ], - indirect=['mock_solr_request_generator']) - - def test_batch_to_df(self, core, batch_params, num_found, mock_solr_request_generator, rows): + # Hence, we pass num_found separately as a fixture. + @pytest.mark.parametrize( + "mock_solr_request_generator,num_found,rows", + [(50000, 50000, 10000), (5, 5, 1), (25000, 25000, 5000)], + indirect=["mock_solr_request_generator"], + ) + def test_batch_to_df( + self, core, batch_params, num_found, mock_solr_request_generator, rows + ): # Call the tested function df = _batch_to_df(core, batch_params, num_found) - # Assert solr_request was called with the expected params and increasing start + # Assert solr_request was called with the expected params and increasing start expected_calls = [ - call(core=core, params={**batch_params, 'start': i * rows, 'rows': rows}, silent=True) for i in range(5) + call( + core=core, + params={**batch_params, "start": i * rows, "rows": rows}, + silent=True, + ) + for i in range(5) ] mock_solr_request_generator.assert_has_calls(expected_calls) - # Assert the structure of the final df - assert_frame_equal(df, pd.DataFrame( - { - 'id': [0, 1, 2, 3, 4], - 'animal': ['Bull', 'Elephant', 'Rhino', 'Monkey', 'Snake'] - } - ).reset_index(drop=True)) + # Assert the structure of the final df + assert_frame_equal( + df, + pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "animal": ["Bull", "Elephant", "Rhino", "Monkey", "Snake"], + } + ).reset_index(drop=True), + ) - # Test _batch_solr_generator # Fixture to mock the requests module @pytest.fixture def mock_requests_get(self, request): - with patch("impc_api_helper.iterator_solr_request_2.requests.get") as mock_get: + with patch("impc_api_helper.batch_solr_request.requests.get") as mock_get: # Capture the format of the response wt = request.param["wt"] mock_get.return_value.format = wt @@ -453,7 +478,7 @@ def test_batch_solr_generator( # Simpler approach to test when status code is not 200 @pytest.fixture def mock_requests_get_error(self, request): - with patch("impc_api_helper.iterator_solr_request_2.requests.get") as mock_get: + with patch("impc_api_helper.batch_solr_request.requests.get") as mock_get: mock_get.return_value.status_code = request.param yield mock_get From 6a560a7a978ffc005f97555357603a4684881ab1 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 16:24:23 +0100 Subject: [PATCH 21/64] chore: update README.md with the use of batch_solr_request --- impc_api_helper/README.md | 87 ++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 16 deletions(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index aa8635b..a33b7b9 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -28,28 +28,83 @@ num_found, df = solr_request( core='genotype-phenotype', params={ ### Batch request For larger requests, use the batch request function to query the API responsibly. ``` -df = batch_request( - core="genotype-phenotype", +### Batch Solr Request +`batch_solr_request` is available for large queries. This solves issues where a request is too large to fit into memory or where it puts a lot of strain on the API. + +Use `batch_solr_request` for: +- Large queries (>1,000,000) +- Querying multiple items in a list +- Downloading data in `json` or `csv` format. 
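Once downloaded, large files are best read back in chunks rather than all at once; a minimal sketch assuming a csv download saved as `downloads/genotype-phenotype.csv`:

```
import pandas as pd

# Read the downloaded file lazily, 10,000 rows at a time.
for chunk in pd.read_csv('downloads/genotype-phenotype.csv', chunksize=10000):
    print(chunk.shape)
```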
+ +#### Large queries +For large queries you can choose between seeing them in a DataFrame or downloading them in `json` or `csv` format. + +##### Large query - see in DataFrame +This will fetch your data using the API responsibly and return a Pandas DataFrame + +When your request is larger than recommended and you have not opted for downloading the data, a warning will be presented and you should follow the instructions to proceed. + +``` +df = batch_solr_request( + core='genotype-phenotype', params={ - 'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"', - 'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size' + 'q':'*:*' }, - batch_size=100 + download=False ) +print(df.head()) ``` -### Iterator solr request -To pass a list of different fields and download a file with the information +##### Large query - Download +When using the `download=True` option, no DataFrame will be returned, instead a file with the requested information will be saved to the path specified in `path_to_download`. + ``` -# Genes example +batch_solr_request( + core='genotype-phenotype', + params={ + 'q':'*:*', + 'wt':'csv' + }, + download=True, + path_to_download = 'downloads' +) +``` + +#### Query by multiple values +`batch_solr_request` also allows to search multiple items in a list provided they belong to them same field. +Pass the list to the `field_list` param and specify the type of `fl` in `field_type`. + +``` +# List of gene symbols +genes = ["Zfp580","Firrm","Gpld1","Mbip"] + +df = batch_solr_request( + core='genotype-phenotype', + params={ + 'q':'*:*', + 'fl': 'marker_symbol,mp_term_name,p_value', + 'field_list': genes, + 'field_type': 'marker_symbol' + }, + download = False +print(df.head()) +) +``` +This too can be downloaded + +``` +# List of gene symbols genes = ["Zfp580","Firrm","Gpld1","Mbip"] -# Initial query parameters -params = { - 'q': "*:*", - 'fl': 'marker_symbol,allele_symbol,parameter_stable_id', - 'field_list': genes, - 'field_type': "marker_symbol" -} -iterator_solr_request(core='genotype-phenotype', params=params, filename='marker_symbol', format ='csv') +batch_solr_request( + core='genotype-phenotype', + params={ + 'q':'*:*', + 'fl': 'marker_symbol,mp_term_name,p_value', + 'field_list': genes, + 'field_type': 'marker_symbol' + }, + download = True, + path_to_download='downloads' +) ``` From be3b29ac39f6f1f2e0c146f9bbd9d73099d7a0f4 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 16:24:49 +0100 Subject: [PATCH 22/64] chore: update import path in README.md --- impc_api_helper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index a33b7b9..f7c5ae3 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -12,7 +12,7 @@ The functions in this package are intended for use on a Jupyter Notebook. 
### Available functions The available functions can be imported as: -`from impc_api_helper import solr_request, batch_request, iterator_solr_request` +`from impc_api_helper import solr_request, batch_solr_request,` ### Solr request The most basic request to the IMPC solr API From 8dd0325e43e570628a80a6e92ebd00b49ce9007a Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 16:25:14 +0100 Subject: [PATCH 23/64] chore: add facet usecase from solr_request to README.md --- impc_api_helper/README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index f7c5ae3..eb41247 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -25,9 +25,23 @@ num_found, df = solr_request( core='genotype-phenotype', params={ ) ``` -### Batch request -For larger requests, use the batch request function to query the API responsibly. +#### Facet request +`solr_request` allows facet requests + +``` +num_found, df = solr_request( + core="genotype-phenotype", + params={ + "q": "*:*", + "rows": 0, + "facet": "on", + "facet.field": "zygosity", + "facet.limit": 15, + "facet.mincount": 1, + }, + ) ``` + ### Batch Solr Request `batch_solr_request` is available for large queries. This solves issues where a request is too large to fit into memory or where it puts a lot of strain on the API. @@ -108,3 +122,6 @@ batch_solr_request( path_to_download='downloads' ) ``` + + + From a9dc0973f1b24f4bc8182e54fa9f22e8d13c8d85 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 16:55:58 +0100 Subject: [PATCH 24/64] docs: adds doc string to batch_solr_request functions --- .../impc_api_helper/batch_solr_request.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index c02f68c..58c776e 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -6,7 +6,25 @@ from .solr_request import solr_request from pathlib import Path + def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'): + """Function for large API requests (>1,000,000 results). Fetches the data in batches and + produces a Pandas DataFrame or downloads a file in json or csv formats. + + Additionally, allows to search multiple items in a list provided they belong to them same field. + + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + download (bool, optional): True for download a local file, False to display results as a DataFrame. Defaults to False. + batch_size (int, optional): Size of batches to fetch the data. Defaults to 5000. + path_to_download (str, optional): When download=True, select the path to download the file. Defaults to './'. + + + Returns: + pd.DataFrame: if download=False, displays a DataFrame with the results. + None: if download=True, displays a statement on the console and returns None. + """ # Set params for batch request params["start"] = 0 # Start at the first result params["rows"] = batch_size # Fetch results in chunks of 5000 @@ -71,6 +89,16 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do # Helper batch_to_df def _batch_to_df(core, params, num_results): + """Helper function to fetch data in batches and display them in a DataFrame + + Args: + core (str): name of IMPC solr core. 
+ params (dict): dictionary containing the API call parameters. + num_results (int): Number of docs available + + Returns: + pd.DataFrame: DataFrame with the results. + """ start = params["start"] batch_size = params["rows"] chunks = [] @@ -99,6 +127,19 @@ def _batch_to_df(core, params, num_results): def _batch_solr_generator(core, params, num_results): + """"Generator function to fetch results from the SOLR API in batches using pagination. + + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + num_results (int): Number of docs available + + Raises: + Exception: If a problem occurs during the download, an exception is raised. + + Yields: + ([dict, str]): A JSON object or plain text with the results. + """ base_url = "https://www.ebi.ac.uk/mi/impc/solr/" solr_url = base_url + core + "/select" start = params["start"] @@ -130,6 +171,14 @@ def _batch_solr_generator(core, params, num_results): # File writer def _solr_downloader(params, filename, solr_generator): + """Function to write the data from the generator into the specified format. + Currently supports json and csv only. + + Args: + params (dict): dictionary containing the API call parameters. + filename (str): name for the file to be downloaded. Defaults to core.format as per parent function. + solr_generator ([dict, str]): Generator object with the results. + """ with open(filename, "w", encoding="UTF-8") as f: if params.get("wt") == "json": From 48955b3a97e6544314871a2b84f31245281362c0 Mon Sep 17 00:00:00 2001 From: dpavam Date: Wed, 2 Oct 2024 18:27:08 +0100 Subject: [PATCH 25/64] chore: cleaned up comments and notes on test_batch_solr_request --- .../tests/test_batch_solr_request.py | 81 +++++++++++-------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index 45cb220..709c5fa 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -11,7 +11,6 @@ import json import pandas as pd from pandas.testing import assert_frame_equal -from .test_helpers import check_url_status_code_and_params # Fixture containing the core @@ -26,16 +25,16 @@ class TestBatchSolrRequest: def common_params(self): return {"start": 0, "rows": 10000, "wt": "json"} - # Fixture containing the solr_request function mock - # We will be mocking solr_request with different numbers of numFound, therefore it is passed as param + # Fixture mocking solr_request within the batch_solr_request module. 
+ # solr_request will be mocked with different values for numFound, therefore it is passed as param @pytest.fixture def mock_solr_request(self, request): with patch("impc_api_helper.batch_solr_request.solr_request") as mock: - # Mock expected return content of the solr_request (numFound and _) + # Mock expected return content of the solr_request (numFound and df) mock.return_value = (request.param, pd.DataFrame()) yield mock - # Pytest fixture mocking _batch_to_df + # Fixture mocking _batch_to_df @pytest.fixture def mock_batch_to_df(self): with patch("impc_api_helper.batch_solr_request._batch_to_df") as mock: @@ -43,33 +42,35 @@ def mock_batch_to_df(self): mock.return_value = pd.DataFrame() yield mock + # Test no download - small request # Parameters to determine the numFound of mock_solr_request @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True) def test_batch_solr_request_no_download_small_request( self, mock_solr_request, core, common_params, capsys, mock_batch_to_df ): - # Call your function that uses the mocked request - # Set up mock_solr_request values + # Call tested function result = batch_solr_request(core, params=common_params, download=False) - # # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values. + # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values. mock_solr_request.assert_called_with( core=core, params={**common_params, "start": 0, "rows": 0, "wt": "json"}, silent=True, ) - # Capture stoud + # Retrieve the numFound num_found = mock_solr_request.return_value[0] + # Capture stoud captured = capsys.readouterr() assert captured.out == f"Number of found documents: {num_found}\n" # Check _batch_to_df was called mock_batch_to_df.assert_called_once() + # Test no download - large request # Set mock_solr_request to return a large numFound @pytest.mark.parametrize("mock_solr_request", [1000001], indirect=True) - # Parameter to test 4 cases: when user selects 'y' or 'n' upon large download warning. + # Parameter to test 4 cases: when user selects 'y','' or 'n','exit' upon large download warning. @pytest.mark.parametrize( "user_input,expected_outcome", [("y", "continue"), ("", "continue"), ("n", "exit"), ("exit", "exit")], @@ -88,7 +89,7 @@ def test_batch_solr_request_no_download_large_request( # Monkeypatch the input() function with parametrized user input monkeypatch.setattr("builtins.input", lambda _: user_input) - # When user selects 'n', exit should be triggered. + # When user types 'n' or 'exit', exit should be triggered. 
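The exit path tested here combines a monkeypatched `input` with `pytest.raises(SystemExit)`; a minimal sketch of that combination, with invented names:

```
import pytest

def maybe_exit():
    if input("proceed? ") in ("n", "exit"):
        raise SystemExit

def test_maybe_exit(monkeypatch):
    # Replace builtins.input so the test never blocks on a real prompt.
    monkeypatch.setattr("builtins.input", lambda _: "n")
    with pytest.raises(SystemExit):
        maybe_exit()
```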
if expected_outcome == "exit": with pytest.raises(SystemExit): batch_solr_request( @@ -102,9 +103,10 @@ def test_batch_solr_request_no_download_large_request( # Capture the exit messages captured = capsys.readouterr() - # Assertions for continue case + # Retrieve numFound num_found = mock_solr_request.return_value[0] + # Assertions for continue case assert f"Number of found documents: {num_found}" in captured.out if expected_outcome == "continue": @@ -121,8 +123,8 @@ def test_batch_solr_request_no_download_large_request( assert "Exiting gracefully" in captured.out mock_batch_to_df.assert_not_called() - # TEST DOWNLOAD TRUE - # Fixture mocking the activity of _batch_solr_generator + # Test download - large request + # Fixture mocking _batch_solr_generator @pytest.fixture def mock_batch_solr_generator(self): with patch( @@ -130,6 +132,7 @@ def mock_batch_solr_generator(self): ) as mock: yield mock + # Fixture mocking _solr_downloader. Yields a tmp_path to write a file for the duration of the test. @pytest.fixture def mock_solr_downloader(self, tmp_path): with patch("impc_api_helper.batch_solr_request._solr_downloader") as mock: @@ -137,7 +140,7 @@ def mock_solr_downloader(self, tmp_path): temp_dir.mkdir() yield mock - # Mock response for test containing more than 2,000,000 docs + # Mock response for test containing 2,000,000 docs @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True) # Parametrized decorator to simulate reading a json and csv files @pytest.mark.parametrize( @@ -200,7 +203,8 @@ def test_batch_solr_request_download_true( # Check the function returns None assert result is None - # Mock params + # Test download - multiple fields - large and small + # Mock params for a multiple field query @pytest.fixture def multiple_field_params(self): return { @@ -212,7 +216,7 @@ def multiple_field_params(self): "wt": "json", } - # Mock response for test containing more than 2,000,000 docs + # Mock response for test containing a large request and a small request @pytest.mark.parametrize("mock_solr_request", [(2000000), (10000)], indirect=True) @pytest.mark.parametrize( "download_bool", @@ -232,15 +236,15 @@ def test_batch_solr_request_multiple_fields( mock_solr_downloader, tmp_path, ): - # This test should make sure the request is formatted properly. Regardless of going to downloads or to _batch_to_df - # Get num_found + # This test should ensure the request is formatted properly. Regardless of going to downloads or to _batch_to_df + # Retrieve num_found num_found = mock_solr_request.return_value[0] - # In the case where download=False and numFound is > 1,000,001 we pass 'y' in this test case. + # When download=False and numFound is > 1,000,001 we pass 'y' in this test case. 
if not download_bool and num_found == 2000000: monkeypatch.setattr("builtins.input", lambda _: "y") # Call test function - # If download is True create a temporary file and call with the path_to_download + # If download==True, create a temporary file and call with the path_to_download if download_bool: temp_dir = tmp_path / "temp_dir" temp_file = temp_dir / f"{core}.json" @@ -262,9 +266,9 @@ def test_batch_solr_request_multiple_fields( assert f"Number of found documents: {num_found}" in captured.out assert 'Queried field: fruits:("orange" OR apple OR *berry)' in captured.out - # If download was true, check subsequent functions were executed + # If download==True, check subsequent functions were executed if download_bool: - # Check _batch_solr_generator gets called once with correct args + # Check _batch_solr_generator gets called with correct args mock_batch_solr_generator.assert_called_with( core, multiple_field_params, num_found ) @@ -294,7 +298,7 @@ def test_batch_solr_request_multiple_fields( # Have helper functions in a different class to separate fixtures and parameters class TestHelpersSolrBatchRequest: - # Define a generator to produce DF's dynamically + # Define a generator to produce df's dynamically def data_generator(self): """Generator to produce data dynamically (row by row or doc by doc)/ @@ -307,8 +311,8 @@ def data_generator(self): for i, a in enumerate(animals): yield (i, a) - # Fixture containing the solr_request function mock - # Num_found is passed dynamically as params in the test + # Fixture mocking solr_request in the batch_solr_request module + # Num_found is passed dynamically as params during the test # Generates df's dynamically using the data generator @pytest.fixture def mock_solr_request_generator(self, request): @@ -330,16 +334,17 @@ def side_effect(*args, **kwargs): mock.side_effect = side_effect yield mock - # Fixture containing the params of a normal batch_solr_request + # Fixture containing the params of a normal batch_solr_request with flexible number of rows (batch_size). @pytest.fixture def batch_params(self, rows): return {"start": 0, "rows": rows, "wt": "json"} + # Fixture to pass different num_found values per test @pytest.fixture def num_found(self, request): return request.param - # Parameters to be passsed to the test: the num_found for mock_solr_request_generator, the num_found separately, and rows (batch_size). + # Parameters to be passsed to the test: a num_found value for mock_solr_request_generator, a num_found separately, and rows (batch_size). # Note num_found is returned by solr_request, when we access it using the generator function, it causes issues. # Hence, we pass num_found separately as a fixture. 
@pytest.mark.parametrize( @@ -390,7 +395,7 @@ def mock_requests_get(self, request): # Call the generator data_generator = self.data_generator() - # Use the side_effects to return num_found and the dfs + # Use the side_effects to return num_found and the response data def side_effect(*args, **kwargs): # Create a mock response object mock_response = Mock() @@ -422,6 +427,7 @@ def side_effect(*args, **kwargs): def batch_solr_generator_params(self): return {"start": 0, "rows": 1} + # Parameters with the params for fixtures and the expected results @pytest.mark.parametrize( "mock_requests_get,expected_results", [ @@ -457,15 +463,12 @@ def test_batch_solr_generator( batch_solr_generator_params["wt"] = mock_requests_get.return_value.format rows = batch_solr_generator_params["rows"] - # Create the generator + # Call the generator result = _batch_solr_generator(core, batch_solr_generator_params, num_results) - # # Assertions for json data - # if batch_solr_generator_params["wt"] == "json": # Loop over the expected results and check corresponding calls for idx, exp_result in enumerate(expected_results): # Call the next iteration - assert next(result) == exp_result # Check requests.get was called with the correct url, params [especially, the 'start' param], and timeout. @@ -476,13 +479,14 @@ def test_batch_solr_generator( ) # Simpler approach to test when status code is not 200 + # Fixture to mock requests.get returning a status code. @pytest.fixture def mock_requests_get_error(self, request): with patch("impc_api_helper.batch_solr_request.requests.get") as mock_get: mock_get.return_value.status_code = request.param yield mock_get - # Set up test for _batch_solr_generator when status code is not 200 + # Params for _batch_solr_generator when status code is not 200 @pytest.mark.parametrize( "mock_requests_get_error", [404, 500], indirect=["mock_requests_get_error"] ) @@ -503,8 +507,12 @@ def test_batch_solr_generator_error( ) ) + # Fixture to mock _solr_generator. 
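The csv branch of `_solr_downloader`, exercised by the fixture below, keeps the header row of the first batch only; the merging rule in isolation, with toy chunks:

```
# Two toy csv batches, each arriving with its own header row.
chunks = ["id,number\n0,0\n1,1", "id,number\n2,2\n3,100"]

merged = chunks[0]
for chunk in chunks[1:]:
    lines = chunk.splitlines()
    # Drop the repeated header, keep only the data rows.
    merged += "\n" + "\n".join(lines[1:]) + "\n"
print(merged)  # id,number / 0,0 / 1,1 / 2,2 / 3,100
```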
@pytest.fixture def mock_solr_generator(self, request): + """ + Mocks a generator yielding 2 batches/chunks to the tested function + """ format = request.param if format == "json": @@ -538,11 +546,13 @@ def data_chunks(): yield data_chunks() + # Parameters for test function, one for the fixture and one as the expected format @pytest.mark.parametrize( "mock_solr_generator, expected_format", [("json", "json"), ("csv", "csv")], indirect=["mock_solr_generator"], ) + # Test the writer def test_solr_downloader( self, mock_solr_generator, @@ -575,13 +585,14 @@ def test_solr_downloader( {"id": 4, "number": 99}, {"id": 5, "number": 98}, ] - # Check it loads into a pd.DataFrame + # Load data into a df test_df = pd.read_json(test_file) elif expected_format == "csv": content = f.read() assert content == "id,number\n0,0\n1,1\n2,2\n3,100\n4,99\n5,98\n" + # Load data into a df test_df = pd.read_csv(test_file) # Assert the structure of the final df From 94350bb09bd130a27563a593e94983e78d98bee3 Mon Sep 17 00:00:00 2001 From: dpavam Date: Thu, 3 Oct 2024 10:21:15 +0100 Subject: [PATCH 26/64] chore: linted and clarified comments on batch_solr_request --- .../impc_api_helper/batch_solr_request.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index 58c776e..d4fef18 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -7,8 +7,10 @@ from pathlib import Path -def batch_solr_request(core, params, download=False, batch_size=5000, path_to_download='./'): - """Function for large API requests (>1,000,000 results). Fetches the data in batches and +def batch_solr_request( + core, params, download=False, batch_size=5000, path_to_download="./" +): + """Function for large API requests (>1,000,000 results). Fetches the data in batches and produces a Pandas DataFrame or downloads a file in json or csv formats. Additionally, allows to search multiple items in a list provided they belong to them same field. @@ -54,7 +56,7 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do ) print(f"Number of found documents: {num_results}") - # Download/display only logic + # Download only logic # If user decides to download, a generator is used to fetch data in batches without storing results in memory. if download: # Implement loop behaviour @@ -80,13 +82,11 @@ def batch_solr_request(core, params, download=False, batch_size=5000, path_to_do case "n" | "exit": print("Exiting gracefully") exit() - case "y" | '': + case "y" | "": print("Fetching data...") return _batch_to_df(core, params, num_results) - - # Helper batch_to_df def _batch_to_df(core, params, num_results): """Helper function to fetch data in batches and display them in a DataFrame @@ -127,7 +127,7 @@ def _batch_to_df(core, params, num_results): def _batch_solr_generator(core, params, num_results): - """"Generator function to fetch results from the SOLR API in batches using pagination. + """Generator function to fetch results from the SOLR API in batches using pagination. Args: core (str): name of IMPC solr core. @@ -155,12 +155,12 @@ def _batch_solr_generator(core, params, num_results): data = response.json()["response"]["docs"] else: data = response.text - + # Update and refresh the progress bar after getting the data pbar.update(batch_size) pbar.refresh() yield data - + else: raise Exception(f"Request failed. 
Status code: {response.status_code}")
 
@@ -172,15 +172,15 @@
 # File writer
 def _solr_downloader(params, filename, solr_generator):
     """Function to write the data from the generator into the specified format.
-    Currently supports json and csv only.
+    Supports json and csv only.
 
     Args:
         params (dict): dictionary containing the API call parameters.
-        filename (str): name for the file to be downloaded. Defaults to core.format as per parent function.
+        filename (Path): name for the file to be downloaded. Defaults to "core.format" as passed by parent function.
         solr_generator ([dict, str]): Generator object with the results.
     """
     with open(filename, "w", encoding="UTF-8") as f:
-
+
         if params.get("wt") == "json":
             f.write("[\n")
             first_chunk = True
@@ -205,5 +205,4 @@
                 first_chunk = False
             else:
                 # Skip the first line (header) in subsequent chunks
-                f.write('\n' + '\n'.join(lines[1:]) + '\n')
-
+                f.write("\n" + "\n".join(lines[1:]) + "\n")

From a21fe1bba12ba4bd56a578ff40e585bd4429c31c Mon Sep 17 00:00:00 2001
From: dpavam
Date: Thu, 3 Oct 2024 12:50:53 +0100
Subject: [PATCH 27/64] chore: clarification with batch_size and
 params["rows"]

---
 impc_api_helper/impc_api_helper/batch_solr_request.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py
index d4fef18..d9aa930 100644
--- a/impc_api_helper/impc_api_helper/batch_solr_request.py
+++ b/impc_api_helper/impc_api_helper/batch_solr_request.py
@@ -29,7 +29,8 @@ def batch_solr_request(
     """
     # Set params for batch request
     params["start"] = 0  # Start at the first result
-    params["rows"] = batch_size  # Fetch results in chunks of 5000
+    # Override param rows in case there was input. Defines batch size to 5000 by default
+    params["rows"] = batch_size
 
     # If user did not specify format, defaults to json.
     if params.get("wt") is None:
@@ -186,11 +187,9 @@ def _solr_downloader(params, filename, solr_generator):
         first_chunk = True
 
         for chunk in solr_generator:
-            # print('CHUNK',chunk,'\n')
             for item in chunk:
                 if not first_chunk:
                     f.write(",\n")
-                    # print('ITEM',item)
                 json.dump(item, f, ensure_ascii=False)
                 first_chunk = False
         f.write("\n]\n")

From 2a79240d7a08b03b70cf837ad45c8ea4d44bc73b Mon Sep 17 00:00:00 2001
From: dpavam
Date: Thu, 3 Oct 2024 12:52:05 +0100
Subject: [PATCH 28/64] fix: enforce the use of arg batch_size to avoid
 clashes with params["rows"]

---
 .../tests/test_batch_solr_request.py          | 43 ++++++++++++-------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py
index 709c5fa..1ae6e45 100644
--- a/impc_api_helper/tests/test_batch_solr_request.py
+++ b/impc_api_helper/tests/test_batch_solr_request.py
@@ -23,7 +23,7 @@ class TestBatchSolrRequest:
     # Fixture containing the params of a normal batch_solr_request
     @pytest.fixture
     def common_params(self):
-        return {"start": 0, "rows": 10000, "wt": "json"}
+        return {"start": 0, "rows": 0, "wt": "json"}
 
     # Fixture mocking solr_request within the batch_solr_request module. 
# solr_request will be mocked with different values for numFound, therefore it is passed as param @@ -75,7 +75,7 @@ def test_batch_solr_request_no_download_small_request( "user_input,expected_outcome", [("y", "continue"), ("", "continue"), ("n", "exit"), ("exit", "exit")], ) - def test_batch_solr_request_no_download_large_request( + def test_batch_solr_request_download_false_large_request( self, core, common_params, @@ -89,15 +89,18 @@ def test_batch_solr_request_no_download_large_request( # Monkeypatch the input() function with parametrized user input monkeypatch.setattr("builtins.input", lambda _: user_input) + # Set a batch_size for clarity + batch_size = 500000 + # When user types 'n' or 'exit', exit should be triggered. if expected_outcome == "exit": with pytest.raises(SystemExit): batch_solr_request( - core, params=common_params, download=False, batch_size=5000 + core, params=common_params, download=False, batch_size=batch_size ) else: result = batch_solr_request( - core, params=common_params, download=False, batch_size=5000 + core, params=common_params, download=False, batch_size=batch_size ) # Capture the exit messages @@ -114,8 +117,8 @@ def test_batch_solr_request_no_download_large_request( "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" in captured.out ) - mock_batch_to_df.assert_called_once_with( - "test_core", {"start": 0, "rows": 5000, "wt": "json"}, 1000001 + mock_batch_to_df.assert_called_with( + "test_core", {"start": 0, "rows": batch_size, "wt": "json"}, num_found ) # Assertion for exit case @@ -185,6 +188,8 @@ def test_batch_solr_request_download_true( ) num_found = mock_solr_request.return_value[0] + # Assert params rows has been updated to the value of batch_size + # Check _batch_solr_generator gets called once with correct args mock_batch_solr_generator.assert_called_once_with( core, params_format, num_found @@ -258,7 +263,7 @@ def test_batch_solr_request_multiple_fields( else: # Otherwise, call without the path_to_download result = batch_solr_request( - core, params=multiple_field_params, download=download_bool + core, params=multiple_field_params, download=download_bool, batch_size=5000 ) # Check output which should be equal for both. @@ -336,8 +341,8 @@ def side_effect(*args, **kwargs): # Fixture containing the params of a normal batch_solr_request with flexible number of rows (batch_size). @pytest.fixture - def batch_params(self, rows): - return {"start": 0, "rows": rows, "wt": "json"} + def batch_params(self, batch_size): + return {"start": 0, "rows": batch_size, "wt": "json"} # Fixture to pass different num_found values per test @pytest.fixture @@ -348,13 +353,14 @@ def num_found(self, request): # Note num_found is returned by solr_request, when we access it using the generator function, it causes issues. # Hence, we pass num_found separately as a fixture. 
@pytest.mark.parametrize(
-        "mock_solr_request_generator,num_found,rows",
+        "mock_solr_request_generator,num_found,batch_size",
         [(50000, 50000, 10000), (5, 5, 1), (25000, 25000, 5000)],
         indirect=["mock_solr_request_generator"],
     )
     def test_batch_to_df(
-        self, core, batch_params, num_found, mock_solr_request_generator, rows
+        self, core, batch_params, num_found, mock_solr_request_generator, batch_size
     ):
+
         # Call the tested function
         df = _batch_to_df(core, batch_params, num_found)
 
@@ -362,7 +368,7 @@
         expected_calls = [
             call(
                 core=core,
-                params={**batch_params, "start": i * rows, "rows": rows},
+                params={**batch_params, "start": i * batch_size, "rows": batch_size},
                 silent=True,
             )
             for i in range(5)
@@ -425,7 +431,7 @@ def side_effect(*args, **kwargs):
     # Fixture containing the params for batch_solr_generator
     @pytest.fixture
     def batch_solr_generator_params(self):
-        return {"start": 0, "rows": 1}
+        return {"q": "*:*", "start": 0, "rows": 1}
 
     # Parameters with the params for fixtures and the expected results
     @pytest.mark.parametrize(
@@ -461,20 +467,25 @@
         num_results = 5
         # Define the wt and batch_size param for the test
         batch_solr_generator_params["wt"] = mock_requests_get.return_value.format
-        rows = batch_solr_generator_params["rows"]
+        batch_size = 1
+
+        # Override rows as the parent function would
+        batch_solr_generator_params["rows"] = batch_size
 
         # Call the generator
         result = _batch_solr_generator(core, batch_solr_generator_params, num_results)
 
         # Loop over the expected results and check corresponding calls
-        for idx, exp_result in enumerate(expected_results):
+        for idx, exp_result in enumerate(expected_results, start=0):
             # Call the next iteration
             assert next(result) == exp_result
 
             # Check requests.get was called with the correct url, params [especially, the 'start' param], and timeout.
+            # The first call will always be with the params["rows"] value, 1 in this case.
+            # Since the function overrides params["rows"] with batch_size, both are 1 here.
             mock_requests_get.assert_called_with(
                 "https://www.ebi.ac.uk/mi/impc/solr/test_core/select",
-                params={**batch_solr_generator_params, "start": idx, "rows": rows},
+                params={**batch_solr_generator_params, "start": idx, "rows": batch_size},
                 timeout=10,
             )

From 6d6600a9c6a9e5b995b1e9dd9f8336f9effb4d67 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Thu, 3 Oct 2024 12:58:54 +0100
Subject: [PATCH 29/64] test: adds assertion to check params["rows"] takes the
 value of batch_size arg

---
 impc_api_helper/tests/test_batch_solr_request.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py
index 1ae6e45..d24ef8c 100644
--- a/impc_api_helper/tests/test_batch_solr_request.py
+++ b/impc_api_helper/tests/test_batch_solr_request.py
@@ -49,7 +49,10 @@ def test_batch_solr_request_no_download_small_request(
         self, mock_solr_request, core, common_params, capsys, mock_batch_to_df
     ):
         # Call tested function
-        result = batch_solr_request(core, params=common_params, download=False)
+        result = batch_solr_request(core, params=common_params, download=False, batch_size=100)
+
+        # Assert the value of params was changed to batch_size
+        assert common_params["rows"] == 100
 
         # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values. 
mock_solr_request.assert_called_with( @@ -150,12 +153,12 @@ def mock_solr_downloader(self, tmp_path): "params_format, format, file_content", [ ( - {"start": 0, "rows": 2000000, "wt": "json"}, + {"start": 0, "rows": 0, "wt": "json"}, "json", '[{"id": "1", "city": "Houston"},{"id": "2", "city": "Prague"}]', ), ( - {"start": 0, "rows": 2000000, "wt": "csv"}, + {"start": 0, "rows": 0, "wt": "csv"}, "csv", "id,city\n1,Houston\n2,Prague\n", ), @@ -184,11 +187,12 @@ def test_batch_solr_request_download_true( # First we call the function # We patch solr_request to get the number of docs result = batch_solr_request( - core, params=params_format, download=True, path_to_download=temp_dir + core, params=params_format, download=True, path_to_download=temp_dir, batch_size=2000000 ) num_found = mock_solr_request.return_value[0] - # Assert params rows has been updated to the value of batch_size + # Assert params["rows"] == batch size and not the original value (0) + assert params_format["rows"] == 2000000 # Check _batch_solr_generator gets called once with correct args mock_batch_solr_generator.assert_called_once_with( From 660770d161f0d3a9a2a2972741bf31cfe5eab0fd Mon Sep 17 00:00:00 2001 From: dpavam Date: Thu, 3 Oct 2024 15:26:26 +0100 Subject: [PATCH 30/64] fix: update progress bar in _batch_to_df correctly --- impc_api_helper/impc_api_helper/batch_solr_request.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index d9aa930..b3aa845 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -109,8 +109,6 @@ def _batch_to_df(core, params, num_results): # Request chunks until we have complete data. with tqdm(total=num_results) as pbar: while start < num_results: - # Update progress bar with the number of rows requested. - pbar.update(batch_size) # Request chunk. We don't need num_results anymore because it does not change. _, df_chunk = solr_request( @@ -119,6 +117,9 @@ def _batch_to_df(core, params, num_results): silent=True, ) + # Update progress bar with the number of rows requested. + pbar.update(batch_size) + pbar.refresh() # Record chunk. chunks.append(df_chunk) # Increment start. From 3cdc9e203869a632406f008ac6d447de20b00166 Mon Sep 17 00:00:00 2001 From: dpavam Date: Thu, 3 Oct 2024 15:31:07 +0100 Subject: [PATCH 31/64] chore: updates README with usage examples of all functions --- impc_api_helper/README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index eb41247..f72e721 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -12,14 +12,16 @@ The functions in this package are intended for use on a Jupyter Notebook. 
### Available functions The available functions can be imported as: -`from impc_api_helper import solr_request, batch_solr_request,` +``` +from impc_api_helper import solr_request, batch_solr_request +``` ### Solr request The most basic request to the IMPC solr API ``` num_found, df = solr_request( core='genotype-phenotype', params={ - 'q': '*:*' - 'rows': 10 + 'q': '*:*', + 'rows': 10, 'fl': 'marker_symbol,allele_symbol,parameter_stable_id' } ) @@ -64,7 +66,8 @@ df = batch_solr_request( params={ 'q':'*:*' }, - download=False + download=False, + batch_size=30000 ) print(df.head()) ``` @@ -80,7 +83,8 @@ batch_solr_request( 'wt':'csv' }, download=True, - path_to_download = 'downloads' + path_to_download = '.', + batch_size=20000 ) ``` From f7c68e16143efe7d31af101fdd4f3e90f3cd5dc5 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 4 Oct 2024 17:22:36 +0100 Subject: [PATCH 32/64] build: includes pydantic for validation --- impc_api_helper/pyproject.toml | 3 ++- impc_api_helper/setup.py | 1 + requirements.txt | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/impc_api_helper/pyproject.toml b/impc_api_helper/pyproject.toml index 59bd62f..7449db5 100644 --- a/impc_api_helper/pyproject.toml +++ b/impc_api_helper/pyproject.toml @@ -14,7 +14,8 @@ authors = [ dependencies = [ "pandas>=2.2.0", "requests>=2.31.0", - "tqdm>=4.66.4" + "tqdm>=4.66.4", + "pydantic>=2.9" ] readme = "README.md" diff --git a/impc_api_helper/setup.py b/impc_api_helper/setup.py index d4d03f0..452d1f1 100644 --- a/impc_api_helper/setup.py +++ b/impc_api_helper/setup.py @@ -11,6 +11,7 @@ 'pandas>=2.2.0', 'requests>=2.31.0', 'tqdm>=4.66.4', + 'pydantic>=2.9' ], extras_require={ diff --git a/requirements.txt b/requirements.txt index c12e5ab..9157acd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pandas>=2.2.0 requests>=2.31.0 tqdm>=4.66.4 +pydantic>=2.9 From 7b64ffebb1d5296127712fc6595deb6dd5e0d509 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 4 Oct 2024 17:50:14 +0100 Subject: [PATCH 33/64] feat: handles core or param[fl] errors more gracefully for users. NOTE: Issue with PATH in utils.py depending on IDE used. 
--- .../impc_api_helper/solr_request.py | 19 ++++++- .../impc_api_helper/utils/__init__.py | 0 .../impc_api_helper/utils/core_fields.json | 15 +++++ .../impc_api_helper/utils/utils.py | 57 +++++++++++++++++++ 4 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 impc_api_helper/impc_api_helper/utils/__init__.py create mode 100644 impc_api_helper/impc_api_helper/utils/core_fields.json create mode 100644 impc_api_helper/impc_api_helper/utils/utils.py diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py index eea3ee4..cf93135 100644 --- a/impc_api_helper/impc_api_helper/solr_request.py +++ b/impc_api_helper/impc_api_helper/solr_request.py @@ -1,9 +1,11 @@ from IPython.display import display +from pydantic import ValidationError from tqdm import tqdm import pandas as pd import requests +from .utils.utils import CoreParamsValidator # Display the whole dataframe <15 pd.set_option("display.max_rows", 15) @@ -61,6 +63,18 @@ def solr_request(core, params, silent=False): ) """ + # Validate core and params + print("Validating core and params...") + try: + CoreParamsValidator( + core=core, + params=params + ) + print("Validation passed ") + except ValidationError as e: + print(f"Validation failed: {e.errors()[0].get('msg')}") + + base_url = "https://www.ebi.ac.uk/mi/impc/solr/" solr_url = base_url + core + "/select" @@ -95,8 +109,9 @@ def solr_request(core, params, silent=False): return num_found, df else: - print("Error:", response.status_code, response.text) - + # print("Error:", response.status_code, response.text) + print("Error:", response.status_code) + exit() def _process_faceting(data, params): """Processes the faceting data from an API response. diff --git a/impc_api_helper/impc_api_helper/utils/__init__.py b/impc_api_helper/impc_api_helper/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/impc_api_helper/impc_api_helper/utils/core_fields.json b/impc_api_helper/impc_api_helper/utils/core_fields.json new file mode 100644 index 0000000..d3ce064 --- /dev/null +++ b/impc_api_helper/impc_api_helper/utils/core_fields.json @@ -0,0 +1,15 @@ +{ + "experimental": [ + "id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", 
"selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "increment_value", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "age_in_days", "age_in_weeks" + ], + "genotype-phenotype": [ + "doc_id", "ontology_db_id", "assertion_type", "assertion_type_id", "mpath_term_id", "mpath_term_name", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id", "mp_term_name", "alt_mp_term_id", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "marker_symbol", "marker_accession_id", "colony_id", "allele_name", "allele_symbol", "allele_accession_id", "strain_name", "strain_accession_id", "phenotyping_center", "project_external_id", "project_name", "project_fullname", "resource_name", "resource_fullname", "sex", "zygosity", "pipeline_name", "pipeline_stable_id", "pipeline_stable_key", "procedure_name", "procedure_stable_id", "procedure_stable_key", "parameter_name", "parameter_stable_id", "parameter_stable_key", "statistical_method", "percentage_change", "p_value", "effect_size", "external_id", "life_stage_acc", "life_stage_name" + ], + "impc_images": [ + "id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", "date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "mp_id", "mp_term", "top_level_mp_id", "top_level_mp_term", "intermediate_mp_id", "intermediate_mp_term", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", 
"parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "sequence_id", "age_in_days", "age_in_weeks", "download_url", "jpeg_url", "thumbnail_url", "omero_id" + ], + "phenodigm": [ + "type", "disease_id", "disease_source", "disease_term", "disease_alts", "disease_locus", "disease_classes", "disease_phenotypes", "gene_id", "gene_symbol", "gene_symbols_withdrawn", "gene_locus", "hgnc_gene_id", "hgnc_gene_symbol", "hgnc_gene_symbols_withdrawn", "hgnc_gene_locus", "mouse_model", "impc_model", "model_id", "model_source", "model_description", "model_genetic_background", "marker_id", "marker_symbol", "marker_locus", "marker_num_models", "model_phenotypes", "ontology", "phenotype_id", "phenotype_term", "phenotype_synonym", "hp_id", "hp_term", "mp_id", "mp_term", "association_curated", "association_ortholog", "marker_symbols_withdrawn", "disease_matched_phenotypes", "model_matched_phenotypes", "disease_model_avg_raw", "disease_model_avg_norm", "disease_model_max_raw", "disease_model_max_norm", "search_qf", "human_curated_gene", "impc_model_with_curated_gene", "mgi_model_with_curated_gene", "impc_model_with_computed_association", "mgi_model_with_computed_association" + ], + "statistical-result": ["doc_id", "db_id", "data_type", "anatomy_term_id", "anatomy_term_name", "intermediate_anatomy_term_id", "intermediate_anatomy_term_name", "top_level_anatomy_term_id", "top_level_anatomy_term_name", "mp_term_id_options", "mp_term_name_options", "mp_term_id", "mp_term_name", "top_level_mp_term_id", "top_level_mp_term_name", "intermediate_mp_term_id", "intermediate_mp_term_name", "male_mp_term_id", "male_mp_term_name", "male_top_level_mp_term_id", "male_top_level_mp_term_name", "male_intermediate_mp_term_id", "male_intermediate_mp_term_name", "female_mp_term_id", "female_mp_term_name", "female_top_level_mp_term_id", "female_top_level_mp_term_name", "female_intermediate_mp_term_id", "female_intermediate_mp_term_name", "resource_name", "resource_fullname", "resource_id", "project_name", "phenotyping_center", "pipeline_stable_id", "pipeline_stable_key", "pipeline_name", "pipeline_id", "procedure_stable_id", "procedure_stable_key", "procedure_name", "procedure_id", "parameter_stable_id", "parameter_stable_key", "parameter_name", "parameter_id", "colony_id", "marker_symbol", "marker_accession_id", "allele_symbol", "allele_name", "allele_accession_id", "strain_name", "strain_accession_id", "sex", "zygosity", "control_selection_method", "dependent_variable", "metadata_group", "data_frame", "genetic_background", "production_center", "external_db_id", "id", "organisation_id", "phenotyping_center_id", "project_id", "male_control_mean", "male_mutant_mean", "female_control_mean", "female_mutant_mean", "genotype_p_value_low_vs_normal_high", "genotype_p_value_low_normal_vs_high", "genotype_effect_size_low_vs_normal_high", "genotype_effect_size_low_normal_vs_high", "female_p_value_low_vs_normal_high", "female_p_value_low_normal_vs_high", "female_effect_size_low_vs_normal_high", "female_effect_size_low_normal_vs_high", "male_p_value_low_vs_normal_high", "male_p_value_low_normal_vs_high", "male_effect_size_low_vs_normal_high", "male_effect_size_low_normal_vs_high", "categories", "categorical_p_value", "categorical_effect_size", "batch_significant", "variance_significant", "null_test_p_value", 
"genotype_effect_p_value", "genotype_effect_stderr_estimate", "genotype_effect_parameter_estimate", "male_percentage_change", "female_percentage_change", "sex_effect_p_value", "sex_effect_stderr_estimate", "sex_effect_parameter_estimate", "weight_effect_p_value", "weight_effect_stderr_estimate", "weight_effect_parameter_estimate", "group1_genotype", "group1_residuals_normality_test", "group2_genotype", "group2_residuals_normality_test", "blups_test", "rotated_residuals_test", "intercept_estimate", "intercept_estimate_stderr_estimate", "interaction_significant", "interaction_effect_p_value", "female_ko_effect_p_value", "female_ko_effect_stderr_estimate", "female_ko_parameter_estimate", "female_effect_size", "male_ko_effect_p_value", "male_ko_effect_stderr_estimate", "male_ko_parameter_estimate", "male_effect_size", "classification_tag", "phenotype_sex", "life_stage_acc", "life_stage_name", "significant", "soft_windowing_bandwidth", "soft_windowing_shape", "soft_windowing_peaks", "soft_windowing_min_obs_required", "soft_windowing_total_obs_or_weight", "soft_windowing_threshold", "soft_windowing_number_of_doe", "soft_windowing_doe_note", "metadata"] +} diff --git a/impc_api_helper/impc_api_helper/utils/utils.py b/impc_api_helper/impc_api_helper/utils/utils.py new file mode 100644 index 0000000..81f6598 --- /dev/null +++ b/impc_api_helper/impc_api_helper/utils/utils.py @@ -0,0 +1,57 @@ +from pydantic import BaseModel, model_validator +import json +from typing import List, Dict +from pathlib import Path +import warnings + +CORE_FILE = Path('impc_api_helper', 'impc_api_helper', 'utils', 'core_fields.json') + +# Define the dictionary with available options: core:fields +def load_core_fields(filename: Path): + with open(filename, "r") as f: + validation_dict = json.load(f) + return validation_dict + +# Define the validation dict +validation_json = load_core_fields(CORE_FILE) + +# Function to parse the fields (fl) params in params +def get_fields(fields: str) -> List[str]: + return fields.split(",") + + +class CoreParamsValidator(BaseModel): + core: str + params: Dict + + @model_validator(mode='before') + @classmethod + def validate_core_and_fields(cls, values): + core = values.get("core") + params = values.get("params") + + # Validate core + if core not in validation_json.keys(): + raise ValueError(f'Invalid core: "{core}", select from the available:\n{validation_json.keys()})') + + # Compare passed fl values vs the allowed fl values for a given core + fields: str = params.get("fl") + + # If no fields were specified, pass + if fields is None: + print("No fields passed, skipping field validation...") + return values + + # Get the fields passed to params and the expected fields for the core + field_list: List[str] = get_fields(fields) + accepted_core_fields: List[str] = validation_json.get(core, []) + + # Validate each field in params + for fl in field_list: + if fl not in accepted_core_fields: + warnings.warn(f'Invalid field: "{fl}". Check spelling of fields. To see available fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/', + category=UserWarning) + # Return validated values + return values + +# Check and raise error as needed. 
From 04dace84c449a1950979d068e702eb347582d615 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 8 Oct 2024 11:19:54 +0100 Subject: [PATCH 34/64] build: update setup.py and pyproject.toml settings to include the utils module --- impc_api_helper/pyproject.toml | 4 ++-- impc_api_helper/setup.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/impc_api_helper/pyproject.toml b/impc_api_helper/pyproject.toml index 7449db5..ee6fc5e 100644 --- a/impc_api_helper/pyproject.toml +++ b/impc_api_helper/pyproject.toml @@ -26,9 +26,9 @@ dev = [ "pytest>=8.2.2" ] +[tool.setuptools.packages.find] +include = ["impc_api_helper", "impc_api_helper.*"] [project.urls] "Homepage" = "https://github.com/mpi2/impc-data-api-workshop" -[tool.setuptools] -packages = ["impc_api_helper"] \ No newline at end of file diff --git a/impc_api_helper/setup.py b/impc_api_helper/setup.py index 452d1f1..3307bf7 100644 --- a/impc_api_helper/setup.py +++ b/impc_api_helper/setup.py @@ -1,12 +1,14 @@ from setuptools import setup, find_packages + setup( name='impc_api_helper', version='0.1.0', description='A package to facilitate making API request to the IMPC Solr API', author='MPI2, Marina Kan, Diego Pava', url='https://github.com/mpi2/impc-data-api-workshop', - packages=find_packages(), + packages=find_packages(include=["impc_api_helper", "impc_api_helper.*"]), + include_package_data=True, install_requires=[ 'pandas>=2.2.0', 'requests>=2.31.0', From e41f6e70ce43e59046c553fae8a9c99206d7ab27 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 8 Oct 2024 11:20:46 +0100 Subject: [PATCH 35/64] fix: adds utils to __init__.py for correct imports --- impc_api_helper/impc_api_helper/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/impc_api_helper/impc_api_helper/__init__.py b/impc_api_helper/impc_api_helper/__init__.py index 3c83952..7216b9b 100644 --- a/impc_api_helper/impc_api_helper/__init__.py +++ b/impc_api_helper/impc_api_helper/__init__.py @@ -1,5 +1,6 @@ from .solr_request import solr_request from .batch_solr_request import batch_solr_request +from .utils import validators, warnings # Control what gets imported by client __all__ = ["solr_request", "batch_solr_request"] From 9791824d504e8239b3a99708c718417853ecd311 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 8 Oct 2024 11:22:03 +0100 Subject: [PATCH 36/64] feat: changes utils to validators. 
Adds a json validation class --- .../impc_api_helper/utils/utils.py | 57 ------------- .../impc_api_helper/utils/validators.py | 79 +++++++++++++++++++ 2 files changed, 79 insertions(+), 57 deletions(-) delete mode 100644 impc_api_helper/impc_api_helper/utils/utils.py create mode 100644 impc_api_helper/impc_api_helper/utils/validators.py diff --git a/impc_api_helper/impc_api_helper/utils/utils.py b/impc_api_helper/impc_api_helper/utils/utils.py deleted file mode 100644 index 81f6598..0000000 --- a/impc_api_helper/impc_api_helper/utils/utils.py +++ /dev/null @@ -1,57 +0,0 @@ -from pydantic import BaseModel, model_validator -import json -from typing import List, Dict -from pathlib import Path -import warnings - -CORE_FILE = Path('impc_api_helper', 'impc_api_helper', 'utils', 'core_fields.json') - -# Define the dictionary with available options: core:fields -def load_core_fields(filename: Path): - with open(filename, "r") as f: - validation_dict = json.load(f) - return validation_dict - -# Define the validation dict -validation_json = load_core_fields(CORE_FILE) - -# Function to parse the fields (fl) params in params -def get_fields(fields: str) -> List[str]: - return fields.split(",") - - -class CoreParamsValidator(BaseModel): - core: str - params: Dict - - @model_validator(mode='before') - @classmethod - def validate_core_and_fields(cls, values): - core = values.get("core") - params = values.get("params") - - # Validate core - if core not in validation_json.keys(): - raise ValueError(f'Invalid core: "{core}", select from the available:\n{validation_json.keys()})') - - # Compare passed fl values vs the allowed fl values for a given core - fields: str = params.get("fl") - - # If no fields were specified, pass - if fields is None: - print("No fields passed, skipping field validation...") - return values - - # Get the fields passed to params and the expected fields for the core - field_list: List[str] = get_fields(fields) - accepted_core_fields: List[str] = validation_json.get(core, []) - - # Validate each field in params - for fl in field_list: - if fl not in accepted_core_fields: - warnings.warn(f'Invalid field: "{fl}". Check spelling of fields. To see available fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/', - category=UserWarning) - # Return validated values - return values - -# Check and raise error as needed. 
diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py
new file mode 100644
index 0000000..25627d5
--- /dev/null
+++ b/impc_api_helper/impc_api_helper/utils/validators.py
@@ -0,0 +1,79 @@
+from pydantic import BaseModel, model_validator
+import json
+from typing import List, Dict
+from pathlib import Path
+import warnings
+from dataclasses import dataclass, field
+from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning
+
+# Initialise warning config
+warning_config()
+
+# Dataclass for the json validator
+@dataclass
+class ValidationJson:
+    CORE_FILE: Path = Path('impc_api_helper', 'impc_api_helper', 'utils', 'core_fields.json')
+    _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False)
+
+    # Eager initialisation
+    def __post_init__(self):
+        self._validation_json = self.load_core_fields(self.CORE_FILE)
+
+    def load_core_fields(self, filename: Path) -> Dict[str, List[str]]:
+        with open(filename, "r") as f:
+            return json.load(f)
+
+    def valid_cores(self):
+        return self._validation_json.keys()
+
+    def valid_fields(self, core: str) -> List[str]:
+        return self._validation_json.get(core, [])
+
+# Function to parse the fields (fl) params in params
+def get_fields(fields: str) -> List[str]:
+    return fields.split(",")
+
+
+class CoreParamsValidator(BaseModel):
+    core: str
+    params: Dict
+
+    @model_validator(mode='before')
+    @classmethod
+    def validate_core_and_fields(cls, values):
+        core = values.get("core")
+        params = values.get("params")
+
+        # Call the Validator Object
+        jv = ValidationJson()
+
+        # Validate core
+        if core not in jv.valid_cores():
+            warnings.warn(
+                message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n',
+                category=InvalidCoreWarning)
+
+        # Compare passed fl values vs the allowed fl values for a given core
+        fields: str = params.get("fl")
+
+        # If no fields were specified, pass
+        if fields is None:
+            print("No fields passed, skipping field validation...")
+            return values
+
+        # Get the fields passed to params and the expected fields for the core
+        field_list: List[str] = get_fields(fields)
+
+
+        # Validate each field in params
+        # TODO: perhaps pass all invalid fields as a list, instead of many warning messages
+        for fl in field_list:
+            if fl not in jv.valid_fields(core):
+                warnings.warn(message=f"""Unexpected field name: "{fl}". 
Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""",
+                              category=InvalidFieldWarning)
+        # Return validated values
+        return values
+
+
+
+

From f15e305d31626d11f426c42dfce91658598d12e1 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 8 Oct 2024 11:22:18 +0100
Subject: [PATCH 37/64] feat: creates custom warnings to inform the user about
 validation errors

---
 .../impc_api_helper/utils/warnings.py         | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 impc_api_helper/impc_api_helper/utils/warnings.py

diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py
new file mode 100644
index 0000000..d344092
--- /dev/null
+++ b/impc_api_helper/impc_api_helper/utils/warnings.py
@@ -0,0 +1,24 @@
+"""Module for warnings and exception utils"""
+
+import warnings
+
+
+# Custom warnings
+class InvalidCoreWarning(Warning):
+    """Exception raised when the core is not in the expected core names"""
+
+
+class InvalidFieldWarning(Warning):
+    """Exception raised when the field name is not in the expected fields"""
+
+
+# Custom warning function
+def warning_config():
+    """Customises formatting and filters for warnings"""
+
+    def custom_warning(message, category, filename, lineno, line=None):
+        return f'{category.__name__}: {message}\n'
+
+    warnings.formatwarning = custom_warning
+    warnings.simplefilter("always", Warning)
+

From d2731a01ca0aa2264420b1987dc11b822ba9d60f Mon Sep 17 00:00:00 2001
From: dpavam
Date: Tue, 8 Oct 2024 11:22:50 +0100
Subject: [PATCH 38/64] feat: Adds an optional validation argument to check
 params

---
 .../impc_api_helper/solr_request.py           | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py
index cf93135..cc4664a 100644
--- a/impc_api_helper/impc_api_helper/solr_request.py
+++ b/impc_api_helper/impc_api_helper/solr_request.py
@@ -1,11 +1,8 @@
 from IPython.display import display
-from pydantic import ValidationError
 from tqdm import tqdm
-
-
 import pandas as pd
 import requests
-from .utils.utils import CoreParamsValidator
+from impc_api_helper.utils.validators import CoreParamsValidator
 
 # Display the whole dataframe <15
 pd.set_option("display.max_rows", 15)
@@ -13,7 +10,7 @@
 
 
 # Create helper function
-def solr_request(core, params, silent=False):
+def solr_request(core, params, silent=False, validate=False):
     """Performs a single Solr request to the IMPC Solr API.
 
     Args:
@@ -21,7 +18,10 @@
         params (dict): dictionary containing the API call parameters.
         silent (bool, optional): default False
            If False, displays: URL of API call, the number of found docs
-            and a portion of the DataFrame.
+            and a portion of the DataFrame.
+        validate (bool, optional): default False
+            If True, validates the parameters against the core schema and raises warnings
+            if any parameter seems invalid. 
Returns: @@ -63,17 +63,11 @@ def solr_request(core, params, silent=False): ) """ - # Validate core and params - print("Validating core and params...") - try: + if validate: CoreParamsValidator( core=core, params=params ) - print("Validation passed ") - except ValidationError as e: - print(f"Validation failed: {e.errors()[0].get('msg')}") - base_url = "https://www.ebi.ac.uk/mi/impc/solr/" solr_url = base_url + core + "/select" @@ -111,7 +105,7 @@ def solr_request(core, params, silent=False): else: # print("Error:", response.status_code, response.text) print("Error:", response.status_code) - exit() + def _process_faceting(data, params): """Processes the faceting data from an API response. From ea17aa63516ee7918df033e37a916430bd9ea3e5 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 11 Oct 2024 16:10:08 +0100 Subject: [PATCH 39/64] fix: path to core_fields.json changed for relative imports and MANIFEST.ini to be included when building the package --- impc_api_helper/MANIFEST.in | 1 + impc_api_helper/impc_api_helper/utils/validators.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 impc_api_helper/MANIFEST.in diff --git a/impc_api_helper/MANIFEST.in b/impc_api_helper/MANIFEST.in new file mode 100644 index 0000000..e833b2e --- /dev/null +++ b/impc_api_helper/MANIFEST.in @@ -0,0 +1 @@ +include impc_api_helper/utils/core_fields.json \ No newline at end of file diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index 25627d5..64a49cc 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -12,7 +12,7 @@ # Dataclass for the json validator @dataclass class ValidationJson: - CORE_FILE: Path = Path('impc_api_helper', 'impc_api_helper', 'utils', 'core_fields.json') + CORE_FILE: Path = Path(__file__).resolve().parent / 'core_fields.json' _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False) # Eager initialisation From d25b1f2055c8963ff91a1e7c4933bdb5b2a0865b Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 11 Oct 2024 16:10:26 +0100 Subject: [PATCH 40/64] fix: corrects typo in experiment core key --- impc_api_helper/impc_api_helper/utils/core_fields.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impc_api_helper/impc_api_helper/utils/core_fields.json b/impc_api_helper/impc_api_helper/utils/core_fields.json index d3ce064..48454d9 100644 --- a/impc_api_helper/impc_api_helper/utils/core_fields.json +++ b/impc_api_helper/impc_api_helper/utils/core_fields.json @@ -1,5 +1,5 @@ { - "experimental": [ + "experiment": [ "id", "observation_id", "specimen_id", "phenotyping_center_id", "phenotyping_center", "production_center_id", "production_center", "specimen_project_id", "specimen_project_name", "gene_accession_id", "gene_symbol", "allele_accession_id", "allele_symbol", "zygosity", "sex", "biological_model_id", "biological_sample_id", "biological_sample_group", "strain_accession_id", "strain_name", "genetic_background", "allelic_composition", "colony_id", "litter_id", "date_of_birth", "external_sample_id", "life_stage_name", "life_stage_acc", "datasource_id", "datasource_name", "project_id", "project_name", "pipeline_id", "pipeline_name", "pipeline_stable_id", "procedure_id", "procedure_name", "procedure_stable_id", "procedure_group", "parameter_id", "parameter_name", "parameter_stable_id", "procedure_sequence_id", "experiment_id", "observation_type", "data_type", "experiment_source_id", 
"date_of_experiment", "weight_parameter_stable_id", "weight_date", "weight_days_old", "weight", "data_point", "order_index", "dimension", "time_point", "discrete_point", "category", "raw_category", "metadata", "metadata_group", "anatomy_id", "anatomy_term", "anatomy_id_term", "anatomy_term_synonym", "top_level_anatomy_id", "top_level_anatomy_term", "top_level_anatomy_term_synonym", "selected_top_level_anatomy_id", "selected_top_level_anatomy_term", "selected_top_level_anatomy_term_synonym", "intermediate_anatomy_id", "intermediate_anatomy_term", "intermediate_anatomy_term_synonym", "parent_anatomy_id", "parent_anatomy_term", "parent_anatomy_term_synonym", "child_anatomy_id", "child_anatomy_term", "child_anatomy_term_synonym", "download_file_path", "image_link", "file_type", "increment_value", "parameter_association_stable_id", "parameter_association_sequence_id", "parameter_association_dim_id", "parameter_association_name", "parameter_association_value", "developmental_stage_acc", "developmental_stage_name", "text_value", "sub_term_id", "sub_term_name", "sub_term_description", "age_in_days", "age_in_weeks" ], "genotype-phenotype": [ From 3a9bb5fd27b3966b09fb24a3794dc3ea91a723d0 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 11 Oct 2024 16:11:00 +0100 Subject: [PATCH 41/64] feat: adds a boolean value to avoid raising a field warning if the core is invalid --- impc_api_helper/impc_api_helper/utils/validators.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index 64a49cc..ef2d751 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -41,6 +41,7 @@ class CoreParamsValidator(BaseModel): @model_validator(mode='before') @classmethod def validate_core_and_fields(cls, values): + invalid_core: bool = False core = values.get("core") params = values.get("params") @@ -49,6 +50,7 @@ def validate_core_and_fields(cls, values): # Validate core if core not in jv.valid_cores(): + invalid_core = True warnings.warn( message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n', category=InvalidCoreWarning) @@ -67,10 +69,11 @@ def validate_core_and_fields(cls, values): # Validate each field in params # TODO: perhaps pass al invalid fields as a list, instead of many warning messages - for fl in field_list: - if fl not in jv.valid_fields(core): - warnings.warn(message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""", - category=InvalidFieldWarning) + if invalid_core is not True: + for fl in field_list: + if fl not in jv.valid_fields(core): + warnings.warn(message=f"""Unexpected field name: "{fl}". 
Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""",
+                                    category=InvalidFieldWarning)
         # Return validated values
         return values

From 4db24b65d33ae548c002262ddce60dee814440d4 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Fri, 11 Oct 2024 16:12:04 +0100
Subject: [PATCH 42/64] tests: adds basic tests for core and params["fl"]
 validation

---
 impc_api_helper/tests/test_solr_request.py | 47 +++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/impc_api_helper/tests/test_solr_request.py b/impc_api_helper/tests/test_solr_request.py
index a0a33e0..68f5ed4 100644
--- a/impc_api_helper/tests/test_solr_request.py
+++ b/impc_api_helper/tests/test_solr_request.py
@@ -2,7 +2,7 @@
 from unittest.mock import patch
 from solr_request import solr_request, _process_faceting
 from .test_helpers import check_url_status_code_and_params
-
+from impc_api_helper.utils.warnings import InvalidCoreWarning, InvalidFieldWarning

 class TestSolrRequest:
     """Test class for the Solr Request function
@@ -286,3 +286,48 @@ def test_process_faceting(self, params, data):
         assert df.iloc[1, 1] == 9
         assert df.iloc[2, 0] == "banana"
         assert df.iloc[2, 1] == 24
+
+
+    @pytest.mark.parametrize(
+        "mock_response",
+        [
+            {
+                "status_code": 200,
+                "json": {
+                    "response": {
+                        "numFound": 101,
+                        "docs": [
+                        ],
+                    }
+                },
+            },
+        ],
+        indirect=['mock_response']
+    )
+    def test_solr_request_core_validation(self, common_params, mock_response):
+        with pytest.warns(InvalidCoreWarning):
+            _ = solr_request(core='invalid_core', params=common_params, validate=True)
+
+    @pytest.mark.parametrize(
+        "mock_response",
+        [
+            {
+                "status_code": 200,
+                "json": {
+                    "response": {
+                        "numFound": 101,
+                        "docs": [
+                        ],
+                    }
+                },
+            },
+        ],
+        indirect=['mock_response']
+    )
+    def test_solr_request_fields_validation(self, mock_response):
+        with pytest.warns(InvalidFieldWarning):
+            _ = solr_request(core="experiment",
+                             params={'q':'*:*',
+                                     'fl': 'invalid_field,another_invalid_field'},
+                             validate=True)
+        
\ No newline at end of file

From bd1dfe02fbd26bb3077166a54ccffc559e1dc9e7 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Fri, 11 Oct 2024 16:26:54 +0100
Subject: [PATCH 43/64] refactor: concise params for the test_solr_validation
 tests

---
 impc_api_helper/tests/test_solr_request.py | 34 +++++-----------------
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/impc_api_helper/tests/test_solr_request.py b/impc_api_helper/tests/test_solr_request.py
index 68f5ed4..92d2024 100644
--- a/impc_api_helper/tests/test_solr_request.py
+++ b/impc_api_helper/tests/test_solr_request.py
@@ -287,43 +287,23 @@ def test_process_faceting(self, params, data):
         assert df.iloc[2, 0] == "banana"
         assert df.iloc[2, 1] == 24
 
-
-    @pytest.mark.parametrize(
-        "mock_response",
-        [
-            {
-                "status_code": 200,
-                "json": {
-                    "response": {
-                        "numFound": 101,
-                        "docs": [
-                        ],
-                    }
-                },
-            },
-        ],
-        indirect=['mock_response']
-    )
+    # Validation tests
+    def _validation_response():
+        return {
+                "status_code": 200,
+                "json": {
+                    "response": {
+                        "numFound": 101,
+                        "docs": [],
+                    }
+                },
+            }
+    @pytest.mark.parametrize("mock_response",[_validation_response()], indirect=['mock_response'])
     def test_solr_request_core_validation(self, common_params, mock_response):
        with pytest.warns(InvalidCoreWarning):
            _ = solr_request(core='invalid_core', params=common_params, validate=True)

    
@pytest.mark.parametrize("mock_response",[_validation_response()], indirect=['mock_response']) def test_solr_request_fields_validation(self, mock_response): with pytest.warns(InvalidFieldWarning): _ = solr_request(core="experiment", From 8a9f85c5e25d33300aec521cadb8f3da45bea061 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 11 Oct 2024 16:27:44 +0100 Subject: [PATCH 44/64] style: formatted test_solr_request module --- impc_api_helper/tests/test_solr_request.py | 36 +++++++++++++--------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/impc_api_helper/tests/test_solr_request.py b/impc_api_helper/tests/test_solr_request.py index 92d2024..b0df4ca 100644 --- a/impc_api_helper/tests/test_solr_request.py +++ b/impc_api_helper/tests/test_solr_request.py @@ -4,6 +4,7 @@ from .test_helpers import check_url_status_code_and_params from impc_api_helper.utils.warnings import InvalidCoreWarning, InvalidFieldWarning + class TestSolrRequest: """Test class for the Solr Request function @@ -290,24 +291,29 @@ def test_process_faceting(self, params, data): # Validation tests def _validation_response(): return { - "status_code": 200, - "json": { - "response": { - "numFound": 101, - "docs": [], - } - }, + "status_code": 200, + "json": { + "response": { + "numFound": 101, + "docs": [], } - @pytest.mark.parametrize("mock_response",[_validation_response()], indirect=['mock_response']) + }, + } + + @pytest.mark.parametrize( + "mock_response", [_validation_response()], indirect=["mock_response"] + ) def test_solr_request_core_validation(self, common_params, mock_response): with pytest.warns(InvalidCoreWarning): - _ = solr_request(core='invalid_core', params=common_params, validate=True) + _ = solr_request(core="invalid_core", params=common_params, validate=True) - @pytest.mark.parametrize("mock_response",[_validation_response()], indirect=['mock_response']) + @pytest.mark.parametrize( + "mock_response", [_validation_response()], indirect=["mock_response"] + ) def test_solr_request_fields_validation(self, mock_response): with pytest.warns(InvalidFieldWarning): - _ = solr_request(core="experiment", - params={'q':'*:*', - 'fl': 'invalid_field,another_invalid_field'}, - validate=True) - \ No newline at end of file + _ = solr_request( + core="experiment", + params={"q": "*:*", "fl": "invalid_field,another_invalid_field"}, + validate=True, + ) From 0a9b171e91d1b9a6ed4f4396064f070f65a51b62 Mon Sep 17 00:00:00 2001 From: dpavam Date: Fri, 11 Oct 2024 17:01:14 +0100 Subject: [PATCH 45/64] chore: update README.md with the validators use cases --- impc_api_helper/README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index f72e721..bbe89b3 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -27,6 +27,35 @@ num_found, df = solr_request( core='genotype-phenotype', params={ ) ``` +#### Solr request validation +A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included an `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. 
+
+##### Core validation
+```
+num_found, df = solr_request( core='invalid_core', params={
+        'q': '*:*',
+        'rows': 10
+    },
+    validate=True
+)
+
+> InvalidCoreWarning: Invalid core: "invalid_core", select from the available cores:
+> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result']))
+```
+
+##### Field list validation
+```
+num_found, df = solr_request( core='genotype-phenotype', params={
+        'q': '*:*',
+        'rows': 10,
+        'fl': 'invalid_field,marker_symbol,allele_symbol'
+    },
+    validate=True
+)
+> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields.
+> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/
+```
+

#### Facet request
`solr_request` allows facet requests

From 5d9c7dbd354aa3e0a4e3df562a0dcbd19fee2e02 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Mon, 14 Oct 2024 15:18:42 +0100
Subject: [PATCH 46/64] feat: Reverts to showing full error message to users

---
 impc_api_helper/impc_api_helper/solr_request.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py
index cc4664a..888bf14 100644
--- a/impc_api_helper/impc_api_helper/solr_request.py
+++ b/impc_api_helper/impc_api_helper/solr_request.py
@@ -103,8 +103,7 @@ def solr_request(core, params, silent=False, validate=False):
         return num_found, df
 
     else:
-        # print("Error:", response.status_code, response.text)
-        print("Error:", response.status_code)
+        print("Error:", response.status_code, response.text)

From 1203ed162bae93a7ea173615cae9b0c9b1d757e9 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Mon, 14 Oct 2024 15:25:47 +0100
Subject: [PATCH 47/64] chore: update gitignore

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index c47ace4..835c29e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,7 @@ dist/
 *.pytest*
 *.pytest_cache
 __pycache__
+
+
+# Local notes
+notes.md
\ No newline at end of file

From 248a40ebbf4a9595a9f6d5c3cc8bf5718f34db5a Mon Sep 17 00:00:00 2001
From: dpavam <93325413+dpavam@users.noreply.github.com>
Date: Mon, 14 Oct 2024 15:27:31 +0100
Subject: [PATCH 48/64] Update impc_api_helper/impc_api_helper/utils/warnings.py

Co-authored-by: Marina Kan <113850522+marinak-ebi@users.noreply.github.com>
---
 impc_api_helper/impc_api_helper/utils/warnings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py
index d344092..6f154f9 100644
--- a/impc_api_helper/impc_api_helper/utils/warnings.py
+++ b/impc_api_helper/impc_api_helper/utils/warnings.py
@@ -21,4 +21,3 @@ def custom_warning(message, category, filename, lineno, line=None):
 
     warnings.formatwarning = custom_warning
     warnings.simplefilter("always", Warning)
-

From eacec4306c6e02e716b7b363d9acad542ccf60cd Mon Sep 17 00:00:00 2001
From: dpavam <93325413+dpavam@users.noreply.github.com>
Date: Mon, 14 Oct 2024 15:27:39 +0100
Subject: [PATCH 49/64] Update impc_api_helper/impc_api_helper/utils/validators.py

Co-authored-by: Marina Kan <113850522+marinak-ebi@users.noreply.github.com>
---
 impc_api_helper/impc_api_helper/utils/validators.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py
index ef2d751..1bc959a 100644
--- 
a/impc_api_helper/impc_api_helper/utils/validators.py
+++ b/impc_api_helper/impc_api_helper/utils/validators.py
@@ -76,7 +76,3 @@ def validate_core_and_fields(cls, values):
                                   category=InvalidFieldWarning)
         # Return validated values
         return values
-
-
-
-

From c81c46df71a9d666cbdafb4db40d1d7432f6eca4 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Wed, 2 Oct 2024 15:27:45 +0100
Subject: [PATCH 50/64] chore: changes module name iterator_solr_request to
 batch_solr_request and linted test module

---
 impc_api_helper/tests/test_batch_solr_request.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py
index d24ef8c..eaa5243 100644
--- a/impc_api_helper/tests/test_batch_solr_request.py
+++ b/impc_api_helper/tests/test_batch_solr_request.py
@@ -8,6 +8,14 @@
     _batch_to_df,
     _solr_downloader,
 )
+from unittest.mock import patch, call, Mock
+from impc_api_helper.batch_solr_request import (
+    batch_solr_request,
+    _batch_solr_generator,
+    solr_request,
+    _batch_to_df,
+    _solr_downloader,
+)
 import json
 import pandas as pd
 from pandas.testing import assert_frame_equal

From d7a3a3eaaa1e79515fac3deea6e902ef24f1fc55 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Wed, 2 Oct 2024 16:25:14 +0100
Subject: [PATCH 51/64] chore: add facet usecase from solr_request to
 README.md

---
 impc_api_helper/README.md | 58 ++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md
index bbe89b3..98ec80b 100644
--- a/impc_api_helper/README.md
+++ b/impc_api_helper/README.md
@@ -27,6 +27,24 @@ num_found, df = solr_request( core='genotype-phenotype', params={
 )
 ```
 
+
+#### Facet request
+`solr_request` allows facet requests
+
+```
+num_found, df = solr_request(
+    core="genotype-phenotype",
+    params={
+        "q": "*:*",
+        "rows": 0,
+        "facet": "on",
+        "facet.field": "zygosity",
+        "facet.limit": 15,
+        "facet.mincount": 1,
+    },
+    )
+```
+
 #### Solr request validation
 A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included a `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. 
+##### Core validation
```
num_found, df = solr_request( core='invalid_core', params={
    'q': '*:*',
    'rows': 10
    },
    validate=True
)

> InvalidCoreWarning: Invalid core: "invalid_core", select from the available cores:
> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result']))
```

+##### Field list validation
```
num_found, df = solr_request( core='genotype-phenotype', params={
    'q': '*:*',
    'rows': 10,
    'fl': 'invalid_field,marker_symbol,allele_symbol'
    },
    validate=True
)
> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields.
> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/
```
+
+
+
 ### Batch Solr Request
 `batch_solr_request` is available for large queries. This solves issues where a request is too large to fit into memory or where it puts a lot of strain on the API.

From 6c9f967d605035ac257df9f44195fe2a74f77038 Mon Sep 17 00:00:00 2001
From: dpavam
Date: Fri, 18 Oct 2024 16:55:53 +0100
Subject: [PATCH 52/64] feat: warn the user that params["rows"] passed to
 batch_solr_request is ignored

---
 .../impc_api_helper/batch_solr_request.py     | 14 +++++++++++
 .../impc_api_helper/solr_request.py           |  1 +
 .../impc_api_helper/utils/validators.py       |  1 +
 .../impc_api_helper/utils/warnings.py         |  4 +++
 .../tests/test_batch_solr_request.py          | 25 ++++++++++++-------
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py
index b3aa845..c264d88 100644
--- a/impc_api_helper/impc_api_helper/batch_solr_request.py
+++ b/impc_api_helper/impc_api_helper/batch_solr_request.py
@@ -5,6 +5,13 @@
 from tqdm import tqdm
 from .solr_request import solr_request
 from pathlib import Path
+import warnings
+from impc_api_helper.utils.warnings import warning_config, RowsParamIgnored
+# from .utils.warnings import warning_config, RowsParamIgnored
+
+
+# Initialise warning config
+warning_config()
 
 
 def batch_solr_request(
@@ -27,6 +34,13 @@ def batch_solr_request(
     pd.DataFrame: if download=False, displays a DataFrame with the results.
     None: if download=True, displays a statement on the console and returns None.
     """
+
+    # If params["rows"] is passed, the user is warned about no effect
+    if params.get("rows") is not None:
+        warnings.warn(
+            message='The "rows" param will be ignored in batch_solr_request. To set a batch size, specify a "batch_size" argument.',
+            category=RowsParamIgnored)
+
     # Set params for batch request
     params["start"] = 0  # Start at the first result
     # Override param rows in case there was input.
Defines batch size to 5000 by default diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py index 888bf14..a6a0f18 100644 --- a/impc_api_helper/impc_api_helper/solr_request.py +++ b/impc_api_helper/impc_api_helper/solr_request.py @@ -3,6 +3,7 @@ import pandas as pd import requests from impc_api_helper.utils.validators import CoreParamsValidator +# from .utils.validators import CoreParamsValidator # Display the whole dataframe <15 pd.set_option("display.max_rows", 15) diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index 1bc959a..c58f2df 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -5,6 +5,7 @@ import warnings from dataclasses import dataclass, field from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning +# from .warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning # Initialise warning config warning_config() diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py index 6f154f9..3ce10df 100644 --- a/impc_api_helper/impc_api_helper/utils/warnings.py +++ b/impc_api_helper/impc_api_helper/utils/warnings.py @@ -12,6 +12,10 @@ class InvalidFieldWarning(Warning): """Exception raised when the field name is not in the expected fields""" +class RowsParamIgnored(Warning): + """Warning raised when the row param is ignored""" + + # Custom warning function def warning_config(): """Customises formatting and filters for warnings""" diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index eaa5243..9649a77 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -8,19 +8,16 @@ _batch_to_df, _solr_downloader, ) -from unittest.mock import patch, call, Mock -from impc_api_helper.batch_solr_request import ( - batch_solr_request, - _batch_solr_generator, - solr_request, - _batch_to_df, - _solr_downloader, -) +from impc_api_helper.utils.warnings import RowsParamIgnored import json import pandas as pd from pandas.testing import assert_frame_equal +# When rows is passed to batch solr request, a warning is raised. +# Let's ignore this warning in all tests except the one that asserts the warning +pytestmark = pytest.mark.filterwarnings("ignore::impc_api_helper.utils.warnings.RowsParamIgnored") + # Fixture containing the core @pytest.fixture def core(): @@ -32,7 +29,7 @@ class TestBatchSolrRequest: @pytest.fixture def common_params(self): return {"start": 0, "rows": 0, "wt": "json"} - + # Fixture mocking solr_request within the batch_solr_request module. 
# solr_request will be mocked with different values for numFound, therefore it is passed as param @pytest.fixture @@ -313,6 +310,16 @@ def test_batch_solr_request_multiple_fields( assert isinstance(result, pd.DataFrame) is True + # Test the warning when params["rows"] is passed + @pytest.mark.filterwarnings("default::impc_api_helper.utils.warnings.RowsParamIgnored") + @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True) + def test_param_rows_warning(core, common_params, mock_solr_request): + with pytest.warns(RowsParamIgnored): + batch_solr_request( + core, params=common_params + ) + + # Have helper functions in a different class to separate fixtures and parameters class TestHelpersSolrBatchRequest: # Define a generator to produce df's dynamically From d979fa79a71eb7955e28f8d18b0e2345c6f4badb Mon Sep 17 00:00:00 2001 From: dpavam Date: Mon, 21 Oct 2024 17:13:27 +0100 Subject: [PATCH 53/64] feat: user can now pass a custom filename to solr_batch_request --- impc_api_helper/README.md | 6 ++--- .../impc_api_helper/batch_solr_request.py | 10 +++---- .../tests/test_batch_solr_request.py | 26 +++++++++---------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index 7f3c5c2..f4cb322 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -166,7 +166,7 @@ print(df.head()) ``` ##### Large query - Download -When using the `download=True` option, no DataFrame will be returned, instead a file with the requested information will be saved to the path specified in `path_to_download`. +When using the `download=True` option, no DataFrame will be returned, instead a file with the requested information will be saved as `filename`. The format is selected based on the `wt` parameter. ``` batch_solr_request( @@ -176,7 +176,7 @@ batch_solr_request( 'wt':'csv' }, download=True, - path_to_download = '.', + filename='geno_pheno_query', batch_size=20000 ) ``` @@ -216,7 +216,7 @@ batch_solr_request( 'field_type': 'marker_symbol' }, download = True, - path_to_download='downloads' + filename='gene_list_query' ) ``` diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index c264d88..404259a 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -15,7 +15,7 @@ def batch_solr_request( - core, params, download=False, batch_size=5000, path_to_download="./" + core, params, download=False, batch_size=5000, filename="batch_request" ): """Function for large API requests (>1,000,000 results). Fetches the data in batches and produces a Pandas DataFrame or downloads a file in json or csv formats. @@ -27,7 +27,7 @@ def batch_solr_request( params (dict): dictionary containing the API call parameters. download (bool, optional): True for download a local file, False to display results as a DataFrame. Defaults to False. batch_size (int, optional): Size of batches to fetch the data. Defaults to 5000. - path_to_download (str, optional): When download=True, select the path to download the file. Defaults to './'. + filename (str, optional): When download=True, select the name of the file. Defaults to 'batch_request'. Returns: @@ -75,10 +75,10 @@ def batch_solr_request( # If user decides to download, a generator is used to fetch data in batches without storing results in memory. 
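    # In practice: _batch_solr_generator yields one batch of documents per API
    # request and _solr_downloader writes each batch to disk before the next is
    # fetched, so peak memory stays at roughly one batch regardless of num_results.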
if download: # Implement loop behaviour - filename = Path(path_to_download) / f"{core}.{params['wt']}" + filename_path = Path(f"{filename}.{params["wt"]}") gen = _batch_solr_generator(core, params, num_results) - _solr_downloader(params, filename, gen) - print(f"File saved as: {filename}") + _solr_downloader(params, filename_path, gen) + print(f"File saved as: {filename_path}") return None # If the number of results is small enough and download is off, it's okay to show as df diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index 9649a77..93cc85d 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -183,16 +183,13 @@ def test_batch_solr_request_download_true( format, file_content, ): - # Create temporary files for the test - temp_dir = tmp_path / "temp_dir" - filename = f"{core}.{format}" - temp_file = temp_dir / filename - temp_file.write_text(file_content) + # Set a filename for the test + filename = f"{core}" # First we call the function # We patch solr_request to get the number of docs result = batch_solr_request( - core, params=params_format, download=True, path_to_download=temp_dir, batch_size=2000000 + core, params=params_format, download=True, filename=filename, batch_size=2000000 ) num_found = mock_solr_request.return_value[0] @@ -205,14 +202,15 @@ def test_batch_solr_request_download_true( ) # Check _solr_downloader gets called once with correct args + # Checks the filename is a Path and has the corresponding format mock_solr_downloader.assert_called_once_with( - params_format, temp_file, mock_batch_solr_generator.return_value + params_format, Path(f"{filename}.{format}"), mock_batch_solr_generator.return_value ) # Check the print statements captured = capsys.readouterr() assert f"Number of found documents: {num_found}" in captured.out - assert f"File saved as: {temp_file}" in captured.out + assert f"File saved as: {filename}.{format}" in captured.out # Check the function returns None assert result is None @@ -260,14 +258,16 @@ def test_batch_solr_request_multiple_fields( # Call test function # If download==True, create a temporary file and call with the path_to_download if download_bool: - temp_dir = tmp_path / "temp_dir" - temp_file = temp_dir / f"{core}.json" - temp_file.write_text('{"id": "1", "city": "Cape Town"}\n') + # temp_dir = tmp_path / "temp_dir" + filename = f"{core}" + # temp_file = temp_dir / f"{core}.json" + # temp_file.write_text('{"id": "1", "city": "Cape Town"}\n') + result = batch_solr_request( core, params=multiple_field_params, download=download_bool, - path_to_download=temp_dir, + filename=filename, ) else: # Otherwise, call without the path_to_download @@ -289,7 +289,7 @@ def test_batch_solr_request_multiple_fields( # Check _solr_downloader gets called once with correct args mock_solr_downloader.assert_called_once_with( - multiple_field_params, temp_file, mock_batch_solr_generator.return_value + multiple_field_params, Path(f"{filename}.json"), mock_batch_solr_generator.return_value ) # Check the function returns None assert result is None From 73b0b9f5ea5fb6157a85234af43b59212c41822d Mon Sep 17 00:00:00 2001 From: dpavam Date: Mon, 21 Oct 2024 21:36:42 +0100 Subject: [PATCH 54/64] feat: adds a helper function to read the df from the recently downloaded file --- .../impc_api_helper/batch_solr_request.py | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git 
a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index 404259a..efef321 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -74,12 +74,19 @@ def batch_solr_request( # Download only logic # If user decides to download, a generator is used to fetch data in batches without storing results in memory. if download: - # Implement loop behaviour - filename_path = Path(f"{filename}.{params["wt"]}") - gen = _batch_solr_generator(core, params, num_results) - _solr_downloader(params, filename_path, gen) - print(f"File saved as: {filename_path}") - return None + try: + # Implement loop behaviour + print("Downloading file...") + filename_path = Path(f"{filename}.{params['wt']}") + gen = _batch_solr_generator(core, params, num_results) + _solr_downloader(params, filename_path, gen) + print(f"File saved as: {filename_path}") + + # Try to read the downloaded file + print("Reading downloaded file...") + return _read_downloaded_file(filename_path, params["wt"]) + except Exception as e: + print(f"An unexpected error occured:{e}") # If the number of results is small enough and download is off, it's okay to show as df if num_results < 1000000 and not download: @@ -220,3 +227,30 @@ def _solr_downloader(params, filename, solr_generator): else: # Skip the first line (header) in subsequent chunks f.write("\n" + "\n".join(lines[1:]) + "\n") + + +# File reader +def _read_downloaded_file(filename: Path, request_format): + """ Wrapper for reading files into Pandas DataFrames + + Args: + filename (Path): Name of the file to read + request_format (str): Format of the file to read. Only 'json' and 'csv' are supported. + + Raises: + ValueError: When an unsupported format is passed. + + Returns: + pd.DataFrame: Returns a pd.DataFrame with the data from the file. + """ + try: + match request_format: + case "json": + return pd.read_json(filename) + case "csv": + return pd.read_csv(filename) + case _: + raise ValueError("Unsupported format. Only 'json' and 'csv' are supported.") + except MemoryError as exc: + raise RuntimeError("MemoryError: Insuficient memory to read the file. Consider reading file in batches using Pandas or Polars.") from exc + From 8357839a0c9fdc32d09c962dfe2cbfe5f45c732d Mon Sep 17 00:00:00 2001 From: dpavam Date: Mon, 21 Oct 2024 22:31:40 +0100 Subject: [PATCH 55/64] feat: batch_solr_request raises a MemoryError when dataframe is too big --- impc_api_helper/impc_api_helper/batch_solr_request.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index efef321..abfb2bf 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -239,6 +239,7 @@ def _read_downloaded_file(filename: Path, request_format): Raises: ValueError: When an unsupported format is passed. + MemoryError: When there is not enough memory to read the file. Returns: pd.DataFrame: Returns a pd.DataFrame with the data from the file. @@ -252,5 +253,4 @@ def _read_downloaded_file(filename: Path, request_format): case _: raise ValueError("Unsupported format. Only 'json' and 'csv' are supported.") except MemoryError as exc: - raise RuntimeError("MemoryError: Insuficient memory to read the file. 
Consider reading file in batches using Pandas or Polars.") from exc - + raise MemoryError("MemoryError: Insuficient memory to read the file. Consider reading file in batches using Pandas or Polars.") from exc From 8ff578001e547fa81c0884b475283e1cdfcfaacf Mon Sep 17 00:00:00 2001 From: dpavam Date: Mon, 21 Oct 2024 22:32:09 +0100 Subject: [PATCH 56/64] test: adds tests for _read_downloaded_file --- .../tests/test_batch_solr_request.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index 93cc85d..597a01b 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -7,6 +7,7 @@ solr_request, _batch_to_df, _solr_downloader, + _read_downloaded_file ) from impc_api_helper.utils.warnings import RowsParamIgnored import json @@ -635,3 +636,59 @@ def test_solr_downloader( } ).reset_index(drop=True), ) + + # Fixture to create a temporary file for use in tests + @pytest.fixture + def temp_file_fixture(self, tmp_path): + temp_dir = tmp_path / "temp_dir" + temp_dir.mkdir() + filename = "test_file" + return temp_dir / filename + + @pytest.mark.parametrize( + "request_format,content", + [ + ( + "json", + '[{"id": "1", "city": "Cape Town"},{"id": "2", "city": "Prague"}]', + ), + ( + "csv", + 'id,city\n1,Cape Town\n2,Prague\n', + ), + ( + "tsv", + 'id\tcity1\tCape Town2\tPrague' + ) + ] + ) + def test_read_downloaded_file(self, tmp_path, request_format, content, temp_file_fixture): + + # Write the file with corresponding content + temp_file_fixture.write_text(content) + + if (request_format != "json") & (request_format != "csv"): + with pytest.raises(ValueError): + _read_downloaded_file(temp_file_fixture, request_format) + else: + test_df = _read_downloaded_file(temp_file_fixture, request_format) + + # Assert the structure of the final df + assert_frame_equal( + test_df, + pd.DataFrame( + { + "id": [1, 2], + "city": ["Cape Town", "Prague"], + } + ).reset_index(drop=True), + ) + + def test_read_downloaded_file_memory_error(self, temp_file_fixture): + content = "id,city\n1,Cape Town\n2,Prague\n" + temp_file_fixture.write_text(content) + + # Create a mock that raises a memory error when called + with patch("pandas.read_csv", side_effect=MemoryError("Mock MemoryError")): + with pytest.raises(MemoryError, match="Insuficient memory to read the file."): + _ = _read_downloaded_file(temp_file_fixture, "csv") From c902579d5e85ec6ce2d1d56c315fb68977cbf3c2 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 13:58:07 +0100 Subject: [PATCH 57/64] feat: new custom exception and validator to handle unsupported formats when download=True --- .../impc_api_helper/batch_solr_request.py | 22 +++++++++++++------ .../impc_api_helper/utils/validators.py | 18 ++++++++++++--- .../impc_api_helper/utils/warnings.py | 5 +++++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index abfb2bf..ee44577 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -6,8 +6,11 @@ from .solr_request import solr_request from pathlib import Path import warnings -from impc_api_helper.utils.warnings import warning_config, RowsParamIgnored -# from .utils.warnings import warning_config, RowsParamIgnored +from impc_api_helper.utils.warnings import warning_config, 
RowsParamIgnored, UnsupportedDownloadFormatError +from impc_api_helper.utils.validators import DownloadFormatValidator +# from .utils.warnings import warning_config, RowsParamIgnored, UnsupportedDownloadFormatError +# from .utils.validators import DownloadFormatValidator + # Initialise warning config @@ -71,22 +74,30 @@ def batch_solr_request( ) print(f"Number of found documents: {num_results}") - # Download only logic + # # Download only logic # If user decides to download, a generator is used to fetch data in batches without storing results in memory. if download: try: + # Check if the format is supported + DownloadFormatValidator(wt=params["wt"]) + # Implement loop behaviour print("Downloading file...") filename_path = Path(f"{filename}.{params['wt']}") gen = _batch_solr_generator(core, params, num_results) _solr_downloader(params, filename_path, gen) print(f"File saved as: {filename_path}") + except UnsupportedDownloadFormatError as e: + raise e + except Exception as e: + raise (f"An error ocurred while downloading the data:{e}") # Try to read the downloaded file + try: print("Reading downloaded file...") return _read_downloaded_file(filename_path, params["wt"]) except Exception as e: - print(f"An unexpected error occured:{e}") + raise Exception(f"An unexpected error occured:{e}") # If the number of results is small enough and download is off, it's okay to show as df if num_results < 1000000 and not download: @@ -238,7 +249,6 @@ def _read_downloaded_file(filename: Path, request_format): request_format (str): Format of the file to read. Only 'json' and 'csv' are supported. Raises: - ValueError: When an unsupported format is passed. MemoryError: When there is not enough memory to read the file. Returns: @@ -250,7 +260,5 @@ def _read_downloaded_file(filename: Path, request_format): return pd.read_json(filename) case "csv": return pd.read_csv(filename) - case _: - raise ValueError("Unsupported format. Only 'json' and 'csv' are supported.") except MemoryError as exc: raise MemoryError("MemoryError: Insuficient memory to read the file. 
Consider reading file in batches using Pandas or Polars.") from exc diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index c58f2df..38ec919 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -1,11 +1,11 @@ -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, model_validator, field_validator import json from typing import List, Dict from pathlib import Path import warnings from dataclasses import dataclass, field -from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning -# from .warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning +from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning, UnsupportedDownloadFormatError +# from .warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning, UnsupportedDownloadFormatError # Initialise warning config warning_config() @@ -77,3 +77,15 @@ def validate_core_and_fields(cls, values): category=InvalidFieldWarning) # Return validated values return values + + +class DownloadFormatValidator(BaseModel): + """Validates params["wt"] from a batch_request""" + wt: str + + @field_validator('wt') + def validate_wt(cls, value): + supported_formats = {'json', 'csv'} + if value not in supported_formats: + raise UnsupportedDownloadFormatError(f"Unsupported format '{value}'. Only {supported_formats} are supported for download.") + return value diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py index 3ce10df..c8b08fd 100644 --- a/impc_api_helper/impc_api_helper/utils/warnings.py +++ b/impc_api_helper/impc_api_helper/utils/warnings.py @@ -16,6 +16,11 @@ class RowsParamIgnored(Warning): """Warning raised when the row param is ignored""" +# custom exceptions +class UnsupportedDownloadFormatError(Exception): + """Exception raised when the format is not supported for download""" + + # Custom warning function def warning_config(): """Customises formatting and filters for warnings""" From b5dcca086041b32befd81fb6c5ddef216b12efeb Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 14:37:49 +0100 Subject: [PATCH 58/64] tests: test for param["wt"] validation when download=true --- .../tests/test_batch_solr_request.py | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index 597a01b..093a178 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -9,7 +9,7 @@ _solr_downloader, _read_downloaded_file ) -from impc_api_helper.utils.warnings import RowsParamIgnored +from impc_api_helper.utils.warnings import RowsParamIgnored, UnsupportedDownloadFormatError import json import pandas as pd from pandas.testing import assert_frame_equal @@ -184,6 +184,7 @@ def test_batch_solr_request_download_true( format, file_content, ): + # Set a filename for the test filename = f"{core}" @@ -216,7 +217,21 @@ def test_batch_solr_request_download_true( # Check the function returns None assert result is None - # Test download - multiple fields - large and small + + # Test the download validates parameters + # Mock response for test containing 2,000,000 docs + @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True) + def 
test_batch_solr_request_download_true_validate_params_wt(self, core, mock_solr_request): + # Set a filename for the test + filename = f"{core}" + params = {"start": 0, "rows": 0, "wt": "wrong_format"} + + # Assert exception when the format is unsupported + if format != "json" and format != "csv": + with pytest.raises(UnsupportedDownloadFormatError): + batch_solr_request(core, params=params, download=True, filename=filename, batch_size=2000000) + + # Test download - multiple fields - large and small # Mock params for a multiple field query @pytest.fixture def multiple_field_params(self): @@ -655,34 +670,26 @@ def temp_file_fixture(self, tmp_path): ( "csv", 'id,city\n1,Cape Town\n2,Prague\n', - ), - ( - "tsv", - 'id\tcity1\tCape Town2\tPrague' ) ] ) - def test_read_downloaded_file(self, tmp_path, request_format, content, temp_file_fixture): + def test_read_downloaded_file(self, request_format, content, temp_file_fixture): # Write the file with corresponding content temp_file_fixture.write_text(content) - if (request_format != "json") & (request_format != "csv"): - with pytest.raises(ValueError): - _read_downloaded_file(temp_file_fixture, request_format) - else: - test_df = _read_downloaded_file(temp_file_fixture, request_format) + test_df = _read_downloaded_file(temp_file_fixture, request_format) - # Assert the structure of the final df - assert_frame_equal( - test_df, - pd.DataFrame( - { - "id": [1, 2], - "city": ["Cape Town", "Prague"], - } - ).reset_index(drop=True), - ) + # Assert the structure of the final df + assert_frame_equal( + test_df, + pd.DataFrame( + { + "id": [1, 2], + "city": ["Cape Town", "Prague"], + } + ).reset_index(drop=True), + ) def test_read_downloaded_file_memory_error(self, temp_file_fixture): content = "id,city\n1,Cape Town\n2,Prague\n" From 6d240dae007d5e6c14c8eb1352db0b4e9e43f148 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 15:33:13 +0100 Subject: [PATCH 59/64] test: fixed the tests to expect dataframes when download == True --- .../tests/test_batch_solr_request.py | 73 ++++++++++++------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index 093a178..cf53dbe 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -24,6 +24,13 @@ def core(): return "test_core" +# Fixture to create a temporary file for use in tests +@pytest.fixture(scope="function") +def temp_file_fixture(tmp_path,): + temp_dir = tmp_path / "temp_dir" + temp_dir.mkdir(exist_ok=True) + return temp_dir / "test_file" + class TestBatchSolrRequest: # Fixture containing the params of a normal batch_solr_request @@ -179,19 +186,20 @@ def test_batch_solr_request_download_true( mock_solr_request, mock_batch_solr_generator, mock_solr_downloader, - tmp_path, params_format, format, file_content, + temp_file_fixture ): - # Set a filename for the test - filename = f"{core}" + # Write the file with corresponding content + file_and_format = f"{temp_file_fixture}.{format}" + Path(file_and_format).write_text(file_content) # First we call the function # We patch solr_request to get the number of docs result = batch_solr_request( - core, params=params_format, download=True, filename=filename, batch_size=2000000 + core, params=params_format, download=True, filename=temp_file_fixture, batch_size=2000000 ) num_found = mock_solr_request.return_value[0] @@ -206,16 +214,25 @@ def test_batch_solr_request_download_true( # Check 
_solr_downloader gets called once with correct args # Checks the filename is a Path and has the corresponding format mock_solr_downloader.assert_called_once_with( - params_format, Path(f"{filename}.{format}"), mock_batch_solr_generator.return_value + params_format, Path(file_and_format), mock_batch_solr_generator.return_value ) # Check the print statements captured = capsys.readouterr() assert f"Number of found documents: {num_found}" in captured.out - assert f"File saved as: {filename}.{format}" in captured.out + assert f"File saved as: {file_and_format}" in captured.out - # Check the function returns None - assert result is None + # Check the function returns a df with expected content + # Assert the structure of the final df + assert_frame_equal( + result, + pd.DataFrame( + { + "id": [1, 2], + "city": ["Houston", "Prague"], + } + ).reset_index(drop=True), + ) # Test the download validates parameters @@ -231,7 +248,8 @@ def test_batch_solr_request_download_true_validate_params_wt(self, core, mock_so with pytest.raises(UnsupportedDownloadFormatError): batch_solr_request(core, params=params, download=True, filename=filename, batch_size=2000000) - # Test download - multiple fields - large and small + + # Test download - multiple fields - large and small # Mock params for a multiple field query @pytest.fixture def multiple_field_params(self): @@ -262,7 +280,7 @@ def test_batch_solr_request_multiple_fields( monkeypatch, mock_batch_to_df, mock_solr_downloader, - tmp_path, + temp_file_fixture, ): # This test should ensure the request is formatted properly. Regardless of going to downloads or to _batch_to_df # Retrieve num_found @@ -274,16 +292,17 @@ def test_batch_solr_request_multiple_fields( # Call test function # If download==True, create a temporary file and call with the path_to_download if download_bool: - # temp_dir = tmp_path / "temp_dir" - filename = f"{core}" - # temp_file = temp_dir / f"{core}.json" - # temp_file.write_text('{"id": "1", "city": "Cape Town"}\n') + + # Write the file with corresponding content + file_content = '[{"id": "1", "city": "Cape Town"}]\n' + file_and_format = f"{temp_file_fixture}.json" + Path(file_and_format).write_text(file_content) result = batch_solr_request( core, params=multiple_field_params, download=download_bool, - filename=filename, + filename=temp_file_fixture, ) else: # Otherwise, call without the path_to_download @@ -305,10 +324,20 @@ def test_batch_solr_request_multiple_fields( # Check _solr_downloader gets called once with correct args mock_solr_downloader.assert_called_once_with( - multiple_field_params, Path(f"{filename}.json"), mock_batch_solr_generator.return_value + multiple_field_params, Path(file_and_format), mock_batch_solr_generator.return_value + ) + + # Check the function returns a df with expected content + # Assert the structure of the final df + assert_frame_equal( + result, + pd.DataFrame( + { + "id": [1], + "city": ["Cape Town"], + } + ).reset_index(drop=True), ) - # Check the function returns None - assert result is None # Otherwise, use the 'y' input at the start of the test and make sure the required function is executed. 
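        # (the 'y' is supplied by the monkeypatched input in this test, standing
        # in for a user accepting the large-request prompt, so _batch_to_df runs)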
if not download_bool and num_found == 2000000: @@ -652,13 +681,7 @@ def test_solr_downloader( ).reset_index(drop=True), ) - # Fixture to create a temporary file for use in tests - @pytest.fixture - def temp_file_fixture(self, tmp_path): - temp_dir = tmp_path / "temp_dir" - temp_dir.mkdir() - filename = "test_file" - return temp_dir / filename + @pytest.mark.parametrize( "request_format,content", From 2057d2bc3e136a07e8cb1b686501792802259013 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 15:36:43 +0100 Subject: [PATCH 60/64] chore: formatting of all modules --- .../impc_api_helper/batch_solr_request.py | 24 ++-- .../impc_api_helper/solr_request.py | 57 +++++---- .../impc_api_helper/utils/validators.py | 36 ++++-- .../impc_api_helper/utils/warnings.py | 2 +- .../tests/test_batch_solr_request.py | 111 +++++++++++------- 5 files changed, 132 insertions(+), 98 deletions(-) diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py index ee44577..9876832 100644 --- a/impc_api_helper/impc_api_helper/batch_solr_request.py +++ b/impc_api_helper/impc_api_helper/batch_solr_request.py @@ -6,13 +6,16 @@ from .solr_request import solr_request from pathlib import Path import warnings -from impc_api_helper.utils.warnings import warning_config, RowsParamIgnored, UnsupportedDownloadFormatError +from impc_api_helper.utils.warnings import ( + warning_config, + RowsParamIgnored, + UnsupportedDownloadFormatError, +) from impc_api_helper.utils.validators import DownloadFormatValidator # from .utils.warnings import warning_config, RowsParamIgnored, UnsupportedDownloadFormatError # from .utils.validators import DownloadFormatValidator - # Initialise warning config warning_config() @@ -41,8 +44,9 @@ def batch_solr_request( # If params["rows"] is passed, the user is warned about no effect if params.get("rows") is not None: warnings.warn( - message='The "rows" param will be ignored in batch_solr_request. To set a batch size, specify a "batch_size" argument.', - category=RowsParamIgnored) + message='The "rows" param will be ignored in batch_solr_request. To set a batch size, specify a "batch_size" argument.', + category=RowsParamIgnored, + ) # Set params for batch request params["start"] = 0 # Start at the first result @@ -74,7 +78,7 @@ def batch_solr_request( ) print(f"Number of found documents: {num_results}") - # # Download only logic + # # Download only logic # If user decides to download, a generator is used to fetch data in batches without storing results in memory. if download: try: @@ -91,7 +95,7 @@ def batch_solr_request( raise e except Exception as e: raise (f"An error ocurred while downloading the data:{e}") - + # Try to read the downloaded file try: print("Reading downloaded file...") @@ -141,7 +145,6 @@ def _batch_to_df(core, params, num_results): # Request chunks until we have complete data. with tqdm(total=num_results) as pbar: while start < num_results: - # Request chunk. We don't need num_results anymore because it does not change. _, df_chunk = solr_request( core=core, @@ -214,7 +217,6 @@ def _solr_downloader(params, filename, solr_generator): solr_generator ([dict, str]): Generator object with the results. 
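
    Example (illustrative):
        gen = _batch_solr_generator(core, params, num_results)
        _solr_downloader(params, Path("batch_request.json"), gen)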
""" with open(filename, "w", encoding="UTF-8") as f: - if params.get("wt") == "json": f.write("[\n") first_chunk = True @@ -242,7 +244,7 @@ def _solr_downloader(params, filename, solr_generator): # File reader def _read_downloaded_file(filename: Path, request_format): - """ Wrapper for reading files into Pandas DataFrames + """Wrapper for reading files into Pandas DataFrames Args: filename (Path): Name of the file to read @@ -261,4 +263,6 @@ def _read_downloaded_file(filename: Path, request_format): case "csv": return pd.read_csv(filename) except MemoryError as exc: - raise MemoryError("MemoryError: Insuficient memory to read the file. Consider reading file in batches using Pandas or Polars.") from exc + raise MemoryError( + "MemoryError: Insuficient memory to read the file. Consider reading file in batches using Pandas or Polars." + ) from exc diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py index a6a0f18..3318c40 100644 --- a/impc_api_helper/impc_api_helper/solr_request.py +++ b/impc_api_helper/impc_api_helper/solr_request.py @@ -13,12 +13,12 @@ # Create helper function def solr_request(core, params, silent=False, validate=False): """Performs a single Solr request to the IMPC Solr API. - + Args: core (str): name of IMPC solr core. params (dict): dictionary containing the API call parameters. silent (bool, optional): default False - If True, displays: URL of API call, the number of found docs + If True, displays: URL of API call, the number of found docs and a portion of the DataFrame. validate (bool, optional): default False If True, validates the parameters against the core schema and raises warnings @@ -38,7 +38,7 @@ def solr_request(core, params, silent=False, validate=False): 'fl': 'marker_symbol,allele_symbol,parameter_stable_id', # Fields to retrieve. } ) - + Faceting query provides a summary of data distribution across the specified fields. Example faceting query: num_found, df = solr_request( @@ -54,7 +54,7 @@ def solr_request(core, params, silent=False, validate=False): ) When querying the phenodigm core, pass 'q': 'type:...' - Example phenodigm query: + Example phenodigm query: num_found, df = solr_request( core='phenodigm', params={ @@ -65,10 +65,7 @@ def solr_request(core, params, silent=False, validate=False): """ if validate: - CoreParamsValidator( - core=core, - params=params - ) + CoreParamsValidator(core=core, params=params) base_url = "https://www.ebi.ac.uk/mi/impc/solr/" solr_url = base_url + core + "/select" @@ -86,7 +83,7 @@ def solr_request(core, params, silent=False, validate=False): print(f"Number of found documents: {num_found}\n") # For faceting query. - if params.get('facet') == 'on': + if params.get("facet") == "on": df = _process_faceting(data, params) # For regular query. @@ -132,8 +129,8 @@ def _process_faceting(data, params): # Convert the list of dictionaries into a DataFrame and print the DataFrame. df = pd.DataFrame.from_dict( - faceting_dict, orient="index", columns=["counts"] - ).reset_index() + faceting_dict, orient="index", columns=["counts"] + ).reset_index() # Rename the columns. df.columns = [params["facet.field"], "count_per_category"] return df @@ -142,27 +139,27 @@ def _process_faceting(data, params): # Batch request based on solr_request. def batch_request(core, params, batch_size): """Calls `solr_request` multiple times with `params` - to retrieve results in chunk `batch_size` rows at a time. 
- - Passing parameter `rows` is ignored and replaced with `batch_size` - - Args: - core (str): name of IMPC solr core. - params (dict): dictionary containing the API call parameters. - batch_size (int): Size of batches (number of docs) per request. + to retrieve results in chunk `batch_size` rows at a time. - Returns: - pandas.DataFrame: Pandas.DataFrame object with the information requested. + Passing parameter `rows` is ignored and replaced with `batch_size` - Example query: - df = batch_request( - core="genotype-phenotype", - params={ - 'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"', - 'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size' - }, - batch_size=100 - ) + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + batch_size (int): Size of batches (number of docs) per request. + + Returns: + pandas.DataFrame: Pandas.DataFrame object with the information requested. + + Example query: + df = batch_request( + core="genotype-phenotype", + params={ + 'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"', + 'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size' + }, + batch_size=100 + ) """ if "rows" in "params": diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index 38ec919..f372c49 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -4,16 +4,22 @@ from pathlib import Path import warnings from dataclasses import dataclass, field -from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning, UnsupportedDownloadFormatError +from impc_api_helper.utils.warnings import ( + warning_config, + InvalidCoreWarning, + InvalidFieldWarning, + UnsupportedDownloadFormatError, +) # from .warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning, UnsupportedDownloadFormatError # Initialise warning config warning_config() + # Dataclass for the json validator @dataclass class ValidationJson: - CORE_FILE: Path = Path(__file__).resolve().parent / 'core_fields.json' + CORE_FILE: Path = Path(__file__).resolve().parent / "core_fields.json" _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False) # Eager initialisation @@ -21,8 +27,8 @@ def __post_init__(self): self._validation_json = self.load_core_fields(self.CORE_FILE) def load_core_fields(self, filename: Path) -> Dict[str, List[str]]: - with open(filename, "r") as f: - return json.load(f) + with open(filename, "r") as f: + return json.load(f) def valid_cores(self): return self._validation_json.keys() @@ -30,6 +36,7 @@ def valid_cores(self): def valid_fields(self, core: str) -> List[str]: return self._validation_json.get(core, []) + # Function to parse the fields (fl) params in params def get_fields(fields: str) -> List[str]: return fields.split(",") @@ -39,7 +46,7 @@ class CoreParamsValidator(BaseModel): core: str params: Dict - @model_validator(mode='before') + @model_validator(mode="before") @classmethod def validate_core_and_fields(cls, values): invalid_core: bool = False @@ -54,7 
+61,8 @@ def validate_core_and_fields(cls, values): invalid_core = True warnings.warn( message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n', - category=InvalidCoreWarning) + category=InvalidCoreWarning, + ) # Compare passed fl values vs the allowed fl values for a given core fields: str = params.get("fl") @@ -67,25 +75,29 @@ def validate_core_and_fields(cls, values): # Get the fields passed to params and the expected fields for the core field_list: List[str] = get_fields(fields) - # Validate each field in params # TODO: perhaps pass al invalid fields as a list, instead of many warning messages if invalid_core is not True: for fl in field_list: if fl not in jv.valid_fields(core): - warnings.warn(message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""", - category=InvalidFieldWarning) + warnings.warn( + message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""", + category=InvalidFieldWarning, + ) # Return validated values return values class DownloadFormatValidator(BaseModel): """Validates params["wt"] from a batch_request""" + wt: str - @field_validator('wt') + @field_validator("wt") def validate_wt(cls, value): - supported_formats = {'json', 'csv'} + supported_formats = {"json", "csv"} if value not in supported_formats: - raise UnsupportedDownloadFormatError(f"Unsupported format '{value}'. Only {supported_formats} are supported for download.") + raise UnsupportedDownloadFormatError( + f"Unsupported format '{value}'. Only {supported_formats} are supported for download." + ) return value diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py index c8b08fd..7773c1c 100644 --- a/impc_api_helper/impc_api_helper/utils/warnings.py +++ b/impc_api_helper/impc_api_helper/utils/warnings.py @@ -26,7 +26,7 @@ def warning_config(): """Customises formatting and filters for warnings""" def custom_warning(message, category, filename, lineno, line=None): - return f'{category.__name__}: {message}\n' + return f"{category.__name__}: {message}\n" warnings.formatwarning = custom_warning warnings.simplefilter("always", Warning) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py index cf53dbe..a595efe 100644 --- a/impc_api_helper/tests/test_batch_solr_request.py +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -7,9 +7,12 @@ solr_request, _batch_to_df, _solr_downloader, - _read_downloaded_file + _read_downloaded_file, +) +from impc_api_helper.utils.warnings import ( + RowsParamIgnored, + UnsupportedDownloadFormatError, ) -from impc_api_helper.utils.warnings import RowsParamIgnored, UnsupportedDownloadFormatError import json import pandas as pd from pandas.testing import assert_frame_equal @@ -17,16 +20,22 @@ # When rows is passed to batch solr request, a warning is raised. 
# Let's ignore this warning in all tests except the one that asserts the warning -pytestmark = pytest.mark.filterwarnings("ignore::impc_api_helper.utils.warnings.RowsParamIgnored") +pytestmark = pytest.mark.filterwarnings( + "ignore::impc_api_helper.utils.warnings.RowsParamIgnored" +) + # Fixture containing the core @pytest.fixture def core(): return "test_core" + # Fixture to create a temporary file for use in tests @pytest.fixture(scope="function") -def temp_file_fixture(tmp_path,): +def temp_file_fixture( + tmp_path, +): temp_dir = tmp_path / "temp_dir" temp_dir.mkdir(exist_ok=True) return temp_dir / "test_file" @@ -37,7 +46,7 @@ class TestBatchSolrRequest: @pytest.fixture def common_params(self): return {"start": 0, "rows": 0, "wt": "json"} - + # Fixture mocking solr_request within the batch_solr_request module. # solr_request will be mocked with different values for numFound, therefore it is passed as param @pytest.fixture @@ -62,7 +71,9 @@ def test_batch_solr_request_no_download_small_request( self, mock_solr_request, core, common_params, capsys, mock_batch_to_df ): # Call tested function - result = batch_solr_request(core, params=common_params, download=False, batch_size=100) + result = batch_solr_request( + core, params=common_params, download=False, batch_size=100 + ) # Assert the value of params was changed to batch_size assert common_params["rows"] == 100 @@ -146,9 +157,7 @@ def test_batch_solr_request_download_false_large_request( # Fixture mocking _batch_solr_generator @pytest.fixture def mock_batch_solr_generator(self): - with patch( - "impc_api_helper.batch_solr_request._batch_solr_generator" - ) as mock: + with patch("impc_api_helper.batch_solr_request._batch_solr_generator") as mock: yield mock # Fixture mocking _solr_downloader. Yields a tmp_path to write a file for the duration of the test. @@ -178,7 +187,7 @@ def mock_solr_downloader(self, tmp_path): ], ) # This test should check the correct helper functions and print statements are called. - # Calling the API and writing the file are tested within the helpers. + # Calling the API and writing the file are tested within the helpers. 
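    # (download=True now reads the written file back, so this test asserts that
    # a DataFrame with the file's content is returned instead of None, per PATCH 59 above)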
def test_batch_solr_request_download_true( self, core, @@ -189,9 +198,8 @@ def test_batch_solr_request_download_true( params_format, format, file_content, - temp_file_fixture + temp_file_fixture, ): - # Write the file with corresponding content file_and_format = f"{temp_file_fixture}.{format}" Path(file_and_format).write_text(file_content) @@ -199,7 +207,11 @@ def test_batch_solr_request_download_true( # First we call the function # We patch solr_request to get the number of docs result = batch_solr_request( - core, params=params_format, download=True, filename=temp_file_fixture, batch_size=2000000 + core, + params=params_format, + download=True, + filename=temp_file_fixture, + batch_size=2000000, ) num_found = mock_solr_request.return_value[0] @@ -234,11 +246,12 @@ def test_batch_solr_request_download_true( ).reset_index(drop=True), ) - # Test the download validates parameters # Mock response for test containing 2,000,000 docs @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True) - def test_batch_solr_request_download_true_validate_params_wt(self, core, mock_solr_request): + def test_batch_solr_request_download_true_validate_params_wt( + self, core, mock_solr_request + ): # Set a filename for the test filename = f"{core}" params = {"start": 0, "rows": 0, "wt": "wrong_format"} @@ -246,8 +259,13 @@ def test_batch_solr_request_download_true_validate_params_wt(self, core, mock_so # Assert exception when the format is unsupported if format != "json" and format != "csv": with pytest.raises(UnsupportedDownloadFormatError): - batch_solr_request(core, params=params, download=True, filename=filename, batch_size=2000000) - + batch_solr_request( + core, + params=params, + download=True, + filename=filename, + batch_size=2000000, + ) # Test download - multiple fields - large and small # Mock params for a multiple field query @@ -268,7 +286,6 @@ def multiple_field_params(self): "download_bool", [(True), (False)], ) - def test_batch_solr_request_multiple_fields( self, core, @@ -292,7 +309,6 @@ def test_batch_solr_request_multiple_fields( # Call test function # If download==True, create a temporary file and call with the path_to_download if download_bool: - # Write the file with corresponding content file_content = '[{"id": "1", "city": "Cape Town"}]\n' file_and_format = f"{temp_file_fixture}.json" @@ -307,7 +323,10 @@ def test_batch_solr_request_multiple_fields( else: # Otherwise, call without the path_to_download result = batch_solr_request( - core, params=multiple_field_params, download=download_bool, batch_size=5000 + core, + params=multiple_field_params, + download=download_bool, + batch_size=5000, ) # Check output which should be equal for both. 
@@ -324,7 +343,9 @@ def test_batch_solr_request_multiple_fields( # Check _solr_downloader gets called once with correct args mock_solr_downloader.assert_called_once_with( - multiple_field_params, Path(file_and_format), mock_batch_solr_generator.return_value + multiple_field_params, + Path(file_and_format), + mock_batch_solr_generator.return_value, ) # Check the function returns a df with expected content @@ -354,15 +375,14 @@ def test_batch_solr_request_multiple_fields( assert result is not None assert isinstance(result, pd.DataFrame) is True - # Test the warning when params["rows"] is passed - @pytest.mark.filterwarnings("default::impc_api_helper.utils.warnings.RowsParamIgnored") + @pytest.mark.filterwarnings( + "default::impc_api_helper.utils.warnings.RowsParamIgnored" + ) @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True) def test_param_rows_warning(core, common_params, mock_solr_request): with pytest.warns(RowsParamIgnored): - batch_solr_request( - core, params=common_params - ) + batch_solr_request(core, params=common_params) # Have helper functions in a different class to separate fixtures and parameters @@ -424,7 +444,6 @@ def num_found(self, request): def test_batch_to_df( self, core, batch_params, num_found, mock_solr_request_generator, batch_size ): - # Call the tested function df = _batch_to_df(core, batch_params, num_found) @@ -491,7 +510,6 @@ def side_effect(*args, **kwargs): yield mock_get - # Fixture containing the params for batch_solr_generator @pytest.fixture def batch_solr_generator_params(self): @@ -549,7 +567,11 @@ def test_batch_solr_generator( # Since the function mock_requests_get.assert_called_with( "https://www.ebi.ac.uk/mi/impc/solr/test_core/select", - params={**batch_solr_generator_params, "start": idx, "rows": batch_size}, + params={ + **batch_solr_generator_params, + "start": idx, + "rows": batch_size, + }, timeout=10, ) @@ -681,23 +703,20 @@ def test_solr_downloader( ).reset_index(drop=True), ) - - @pytest.mark.parametrize( - "request_format,content", - [ - ( - "json", - '[{"id": "1", "city": "Cape Town"},{"id": "2", "city": "Prague"}]', - ), - ( - "csv", - 'id,city\n1,Cape Town\n2,Prague\n', - ) - ] + "request_format,content", + [ + ( + "json", + '[{"id": "1", "city": "Cape Town"},{"id": "2", "city": "Prague"}]', + ), + ( + "csv", + "id,city\n1,Cape Town\n2,Prague\n", + ), + ], ) def test_read_downloaded_file(self, request_format, content, temp_file_fixture): - # Write the file with corresponding content temp_file_fixture.write_text(content) @@ -717,8 +736,10 @@ def test_read_downloaded_file(self, request_format, content, temp_file_fixture): def test_read_downloaded_file_memory_error(self, temp_file_fixture): content = "id,city\n1,Cape Town\n2,Prague\n" temp_file_fixture.write_text(content) - + # Create a mock that raises a memory error when called with patch("pandas.read_csv", side_effect=MemoryError("Mock MemoryError")): - with pytest.raises(MemoryError, match="Insuficient memory to read the file."): + with pytest.raises( + MemoryError, match="Insuficient memory to read the file." 
+ ): _ = _read_downloaded_file(temp_file_fixture, "csv") From 853bb2f9048024d6b6d08edacdd838c1ca1a3244 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 15:40:02 +0100 Subject: [PATCH 61/64] chore: fixes description of warnings --- impc_api_helper/impc_api_helper/utils/warnings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py index 7773c1c..9c73b4c 100644 --- a/impc_api_helper/impc_api_helper/utils/warnings.py +++ b/impc_api_helper/impc_api_helper/utils/warnings.py @@ -5,11 +5,11 @@ # Custom warnings class InvalidCoreWarning(Warning): - """Exception raised when the core is not in the expected core names""" + """Warning raised when the core is not in the expected core names""" class InvalidFieldWarning(Warning): - """Exception raised when the field name is not in the expected fields""" + """Warning raised when the field name is not in the expected fields""" class RowsParamIgnored(Warning): From 3287e04f50ebc793bbbe4119b86f1fa38e60fa58 Mon Sep 17 00:00:00 2001 From: dpavam Date: Tue, 22 Oct 2024 15:56:54 +0100 Subject: [PATCH 62/64] chore: updates README with instructions on how to use batch_solr_request --- impc_api_helper/README.md | 98 ++++++++------------------------------- 1 file changed, 19 insertions(+), 79 deletions(-) diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md index f4cb322..3c32f0e 100644 --- a/impc_api_helper/README.md +++ b/impc_api_helper/README.md @@ -16,7 +16,7 @@ The available functions can be imported as: from impc_api_helper import solr_request, batch_solr_request ``` -### Solr request +## 1. Solr request The most basic request to the IMPC solr API ``` num_found, df = solr_request( core='genotype-phenotype', params={ @@ -27,7 +27,7 @@ num_found, df = solr_request( core='genotype-phenotype', params={ ) ``` -#### Facet request +### a. Facet request `solr_request` allows facet requests ``` @@ -44,15 +44,11 @@ num_found, df = solr_request( ) ``` -#### Solr request validation -A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included an `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. +### b. Solr request validation +A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included a `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. -##### Core validation -======= -#### Solr request validation -A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included an `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. 
-##### Core validation +#### Core validation ``` num_found, df = solr_request( core='invalid_core', params={ 'q': '*:*', @@ -65,7 +61,7 @@ num_found, df = solr_request( core='invalid_core', params={ > dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result'])) ``` -##### Field list validation +#### Field list validation ``` num_found, df = solr_request( core='genotype-phenotype', params={ 'q': '*:*', @@ -78,66 +74,7 @@ num_found, df = solr_request( core='genotype-phenotype', params={ > To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/ ``` -### Batch request -For larger requests, use the batch request function to query the API responsibly. - -``` -num_found, df = solr_request( core='invalid_core', params={ - 'q': '*:*', - 'rows': 10 - }, - validate=True -) - -> InvalidCoreWarning: Invalid core: "genotype-phenotyp", select from the available cores: -> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result'])) -``` - -##### Field list validation -``` -num_found, df = solr_request( core='genotype-phenotype', params={ - 'q': '*:*', - 'rows': 10, - 'fl': 'invalid_field,marker_symbol,allele_symbol' - }, - validate=True -) -> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields. -> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/ -``` - -#### Solr request validation -A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included an `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue. - -##### Core validation -``` -num_found, df = solr_request( core='invalid_core', params={ - 'q': '*:*', - 'rows': 10 - }, - validate=True -) - -> InvalidCoreWarning: Invalid core: "genotype-phenotyp", select from the available cores: -> dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result'])) -``` - -##### Field list validation -``` -num_found, df = solr_request( core='genotype-phenotype', params={ - 'q': '*:*', - 'rows': 10, - 'fl': 'invalid_field,marker_symbol,allele_symbol' - }, - validate=True -) -> InvalidFieldWarning: Unexpected field name: "invalid_field". Check the spelling of fields. -> To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/ -``` - - - -### Batch Solr Request +## 2. Batch Solr Request `batch_solr_request` is available for large queries. This solves issues where a request is too large to fit into memory or where it puts a lot of strain on the API. Use `batch_solr_request` for: @@ -145,10 +82,10 @@ Use `batch_solr_request` for: - Querying multiple items in a list - Downloading data in `json` or `csv` format. -#### Large queries +### Large queries For large queries you can choose between seeing them in a DataFrame or downloading them in `json` or `csv` format. -##### Large query - see in DataFrame +### a. Large query - see in DataFrame This will fetch your data using the API responsibly and return a Pandas DataFrame When your request is larger than recommended and you have not opted for downloading the data, a warning will be presented and you should follow the instructions to proceed. 
@@ -165,11 +102,12 @@ df = batch_solr_request(
 print(df.head())
 ```
 
-##### Large query - Download
-When using the `download=True` option, no DataFrame will be returned, instead a file with the requested information will be saved as `filename`. The format is selected based on the `wt` parameter.
+### b. Large query - Download
+When using the `download=True` option, a file with the requested information will be saved as `filename`. The format is selected based on the `wt` parameter.
+A DataFrame may be returned, provided it does not exceed the memory available on your machine. If the DataFrame is too large, an error will be raised. For these cases, we recommend you read the downloaded file in batches/chunks.
 
 ```
-batch_solr_request(
+df = batch_solr_request(
 core='genotype-phenotype',
 params={
 'q':'*:*',
 },
 download=True,
 filename='geno_pheno_query',
-batch_size=20000
+batch_size=100000
 )
+print(df.head())
 ```
 
-#### Query by multiple values
+### c. Query by multiple values
 `batch_solr_request` also allows searching multiple items in a list, provided they belong to the same field.
 Pass the list to the `field_list` param and specify the type of `fl` in `field_type`.
 
 ```
 # List of gene symbols
 genes = ["Zfp580","Firrm","Gpld1","Mbip"]
 
 df = batch_solr_request(
 core='genotype-phenotype',
 params={
 'q':'*:*',
 'fl':'marker_symbol,mp_term_name,p_value',
 'field_list':genes,
 'field_type': 'marker_symbol'
 },
 download = False
-print(df.head())
 )
+print(df.head())
 ```
 
 This too can be downloaded
 ```
 # List of gene symbols
 genes = ["Zfp580","Firrm","Gpld1","Mbip"]
 
-batch_solr_request(
+df = batch_solr_request(
 core='genotype-phenotype',
 params={
 'q':'*:*',
 'fl':'marker_symbol,mp_term_name,p_value',
 'field_list':genes,
 'field_type': 'marker_symbol'
 },
 download = True,
 filename='gene_list_query'
 )
+print(df.head())
 ```
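The README text added above recommends reading a large download back in batches/chunks rather than loading the whole file at once. Below is a minimal sketch of what that can look like with pandas, assuming the `gene_list_query` download from the example above was written with the default `wt='json'` (newline-delimited JSON); the filename and the `p_value` field come from that example, and the chunk size and filter threshold are illustrative only.

```
import pandas as pd

# Stream the downloaded file in chunks so the full dataset never has to
# fit in memory. With newline-delimited JSON, lines=True plus chunksize
# makes read_json yield one DataFrame per chunk.
reader = pd.read_json("gene_list_query.json", lines=True, chunksize=10_000)

# Keep only the rows of interest from each chunk, e.g. significant hits.
significant = pd.concat(chunk[chunk["p_value"] < 1e-4] for chunk in reader)
print(significant.shape)

# The csv equivalent streams with the chunksize argument of read_csv:
# reader = pd.read_csv("gene_list_query.csv", chunksize=10_000)
```

Only one chunk is held in memory at a time, which is exactly the constraint the memory warning in `batch_solr_request` guards against.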
From 39be512d0c5d8fa8e977d8b0ab3ebe5ad1ba81dc Mon Sep 17 00:00:00 2001
From: dpavam <dpavam@ebi.ac.uk>
Date: Tue, 22 Oct 2024 15:57:21 +0100
Subject: [PATCH 63/64] chore: removing dev imports

---
 impc_api_helper/impc_api_helper/batch_solr_request.py | 2 --
 impc_api_helper/impc_api_helper/solr_request.py | 1 -
 impc_api_helper/impc_api_helper/utils/validators.py | 1 -
 3 files changed, 4 deletions(-)

diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py
index 9876832..bdc4bec 100644
--- a/impc_api_helper/impc_api_helper/batch_solr_request.py
+++ b/impc_api_helper/impc_api_helper/batch_solr_request.py
@@ -12,8 +12,6 @@
     UnsupportedDownloadFormatError,
 )
 from impc_api_helper.utils.validators import DownloadFormatValidator
-# from .utils.warnings import warning_config, RowsParamIgnored, UnsupportedDownloadFormatError
-# from .utils.validators import DownloadFormatValidator
 
 
 # Initialise warning config
diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py
index 3318c40..01b7816 100644
--- a/impc_api_helper/impc_api_helper/solr_request.py
+++ b/impc_api_helper/impc_api_helper/solr_request.py
@@ -3,7 +3,6 @@
 import pandas as pd
 import requests
 from impc_api_helper.utils.validators import CoreParamsValidator
-# from .utils.validators import CoreParamsValidator
 
 # Display the whole dataframe <15
 pd.set_option("display.max_rows", 15)
diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py
index f372c49..77e78f9 100644
--- a/impc_api_helper/impc_api_helper/utils/validators.py
+++ b/impc_api_helper/impc_api_helper/utils/validators.py
@@ -10,7 +10,6 @@
     InvalidFieldWarning,
     UnsupportedDownloadFormatError,
 )
-# from .warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning, UnsupportedDownloadFormatError
 
 # Initialise warning config
 warning_config()

From 25264fe541588e8ce2bc2b3b7592f0d02422209c Mon Sep 17 00:00:00 2001
From: dpavam <dpavam@ebi.ac.uk>
Date: Tue, 22 Oct 2024 15:57:44 +0100
Subject: [PATCH 64/64] chore: remove TODO comment

---
 impc_api_helper/impc_api_helper/utils/validators.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py
index 77e78f9..665fbf2 100644
--- a/impc_api_helper/impc_api_helper/utils/validators.py
+++ b/impc_api_helper/impc_api_helper/utils/validators.py
@@ -75,7 +75,6 @@ def validate_core_and_fields(cls, values):
         field_list: List[str] = get_fields(fields)
 
         # Validate each field in params
-        # TODO: perhaps pass al invalid fields as a list, instead of many warning messages
         if invalid_core is not True:
             for fl in field_list:
                 if fl not in jv.valid_fields(core):
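One closing usage note: because `InvalidCoreWarning` and `InvalidFieldWarning` (retouched in PATCH 61 above) are ordinary `Warning` subclasses, callers can escalate them with the standard `warnings` filters. A sketch under that assumption follows; the import paths mirror the ones used in these patches, and `warning_config()` installs its own filter defaults, so exact behaviour may vary.

```
import warnings

from impc_api_helper import solr_request
from impc_api_helper.utils.warnings import InvalidCoreWarning

# Escalate a misspelled core from a warning to a hard error, e.g. in CI.
with warnings.catch_warnings():
    warnings.simplefilter("error", category=InvalidCoreWarning)
    try:
        num_found, df = solr_request(
            core="genotype-phenotyp",  # deliberately misspelled
            params={"q": "*:*", "rows": 10},
            validate=True,
        )
    except InvalidCoreWarning as exc:
        print(f"Core name rejected before querying: {exc}")
```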