diff --git a/impc_api_helper/README.md b/impc_api_helper/README.md
index 022e82b..3c32f0e 100644
--- a/impc_api_helper/README.md
+++ b/impc_api_helper/README.md
@@ -12,23 +12,43 @@ The functions in this package are intended for use on a Jupyter Notebook.
 ### Available functions
 The available functions can be imported as:
-`from impc_api_helper import solr_request, batch_request, iterator_solr_request`
+```
+from impc_api_helper import solr_request, batch_solr_request
+```
-### Solr request
+## 1. Solr request
 The most basic request to the IMPC solr API
 ```
 num_found, df = solr_request( core='genotype-phenotype', params={
-        'q': '*:*'
-        'rows': 10
+        'q': '*:*',
+        'rows': 10,
         'fl': 'marker_symbol,allele_symbol,parameter_stable_id'
     }
 )
 ```
-#### Solr request validation
-A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included an `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue.
+### a. Facet request
+`solr_request` allows facet requests:
+
+```
+num_found, df = solr_request(
+    core="genotype-phenotype",
+    params={
+        "q": "*:*",
+        "rows": 0,
+        "facet": "on",
+        "facet.field": "zygosity",
+        "facet.limit": 15,
+        "facet.mincount": 1,
+    },
+)
+```
+
+### b. Solr request validation
+A common pitfall when writing a query is the misspelling of `core` and `fields` arguments. For this, we have included a `validate` argument that raises a warning when these values are not as expected. Note this does not prevent you from executing a query; it just alerts you to a potential issue.
+
-##### Core validation
+#### Core validation
 ```
 num_found, df = solr_request( core='invalid_core', params={
         'q': '*:*',
@@ -41,7 +61,7 @@ num_found, df = solr_request( core='invalid_core', params={
 > dict_keys(['experiment', 'genotype-phenotype', 'impc_images', 'phenodigm', 'statistical-result']))
 ```
-##### Field list validation
+#### Field list validation
 ```
 num_found, df = solr_request( core='genotype-phenotype', params={
         'q': '*:*',
@@ -54,31 +74,91 @@ num_found, df = solr_request( core='genotype-phenotype', params={
 > To see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/
 ```
-### Batch request
-For larger requests, use the batch request function to query the API responsibly.
+## 2. Batch Solr Request
+`batch_solr_request` is available for large queries. It solves issues where a request is too large to fit into memory or would put a lot of strain on the API.
+
+Use `batch_solr_request` for:
+- Large queries (>1,000,000 results)
+- Querying multiple items in a list
+- Downloading data in `json` or `csv` format.
+
+### Large queries
+For large queries you can choose between seeing them in a DataFrame or downloading them in `json` or `csv` format.
+
+### a. Large query - see in DataFrame
+This will fetch your data using the API responsibly and return a Pandas DataFrame.
+
+When your request is larger than recommended and you have not opted for downloading the data, a warning will be presented and you should follow the instructions to proceed.
+
+```
+df = batch_solr_request(
+    core='genotype-phenotype',
+    params={
+        'q':'*:*'
+    },
+    download=False,
+    batch_size=30000
+)
+print(df.head())
+```
+
+### b. Large query - Download
+When using the `download=True` option, a file with the requested information will be saved as `filename`. The format is selected based on the `wt` parameter.
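+Only `json` and `csv` are supported as download formats; an unsupported `wt` value raises an error before the download starts.
+
+If the downloaded file is too large to load in one go, it can be read back in chunks. A minimal sketch using pandas (assuming a CSV download named `geno_pheno_query.csv`, as produced by the example below):
+
+```
+import pandas as pd
+
+# Read the downloaded CSV in chunks of 100,000 rows to keep memory usage low
+for chunk in pd.read_csv('geno_pheno_query.csv', chunksize=100000):
+    print(chunk.shape)
+```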
+A DataFrame may be returned, provided it does not exceed the memory available on your machine. If the DataFrame is too large, an error will be raised. For these cases, we recommend reading the downloaded file in batches/chunks, as in the sketch above.
+
+```
+df = batch_solr_request(
+    core='genotype-phenotype',
+    params={
+        'q':'*:*',
+        'wt':'csv'
+    },
+    download=True,
+    filename='geno_pheno_query',
+    batch_size=100000
+)
+print(df.head())
+```
+
+### c. Query by multiple values
+`batch_solr_request` also allows searching for multiple items in a list, provided they belong to the same field.
+Pass the list to the `field_list` param and specify the field to filter on in `field_type`.
+
 ```
-df = batch_request(
-    core="genotype-phenotype",
+# List of gene symbols
+genes = ["Zfp580","Firrm","Gpld1","Mbip"]
+
+df = batch_solr_request(
+    core='genotype-phenotype',
     params={
-        'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"',
-        'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size'
+        'q':'*:*',
+        'fl': 'marker_symbol,mp_term_name,p_value',
+        'field_list': genes,
+        'field_type': 'marker_symbol'
     },
-    batch_size=100
+    download = False
 )
+print(df.head())
 ```
+This type of query can also be downloaded:
-### Iterator solr request
-To pass a list of different fields and download a file with the information
 ```
-# Genes example
+# List of gene symbols
 genes = ["Zfp580","Firrm","Gpld1","Mbip"]
-# Initial query parameters
-params = {
-    'q': "*:*",
-    'fl': 'marker_symbol,allele_symbol,parameter_stable_id',
-    'field_list': genes,
-    'field_type': "marker_symbol"
-}
-iterator_solr_request(core='genotype-phenotype', params=params, filename='marker_symbol', format ='csv')
+df = batch_solr_request(
+    core='genotype-phenotype',
+    params={
+        'q':'*:*',
+        'fl': 'marker_symbol,mp_term_name,p_value',
+        'field_list': genes,
+        'field_type': 'marker_symbol'
+    },
+    download = True,
+    filename='gene_list_query'
+)
+print(df.head())
 ```
+
+
+
diff --git a/impc_api_helper/impc_api_helper/__init__.py b/impc_api_helper/impc_api_helper/__init__.py
index c819a69..7216b9b 100644
--- a/impc_api_helper/impc_api_helper/__init__.py
+++ b/impc_api_helper/impc_api_helper/__init__.py
@@ -1,6 +1,6 @@
-from .solr_request import solr_request, batch_request
-from .iterator_solr_request import iterator_solr_request
+from .solr_request import solr_request
+from .batch_solr_request import batch_solr_request
 from .utils import validators, warnings
 
 # Control what gets imported by client
-__all__ = ["solr_request", "batch_request", "iterator_solr_request"]
+__all__ = ["solr_request", "batch_solr_request"]
diff --git a/impc_api_helper/impc_api_helper/batch_solr_request.py b/impc_api_helper/impc_api_helper/batch_solr_request.py
new file mode 100644
index 0000000..bdc4bec
--- /dev/null
+++ b/impc_api_helper/impc_api_helper/batch_solr_request.py
@@ -0,0 +1,266 @@
+from IPython.display import display
+import json
+import pandas as pd
+import requests
+from tqdm import tqdm
+from .solr_request import solr_request
+from pathlib import Path
+import warnings
+from impc_api_helper.utils.warnings import (
+    warning_config,
+    RowsParamIgnored,
+    UnsupportedDownloadFormatError,
+)
+from impc_api_helper.utils.validators import DownloadFormatValidator
+
+
+# Initialise warning config
+warning_config()
+
+
+def batch_solr_request(
+    core, params, download=False, batch_size=5000, filename="batch_request"
+):
+ """Function for large API requests (>1,000,000 results). Fetches the data in batches and + produces a Pandas DataFrame or downloads a file in json or csv formats. + + Additionally, allows to search multiple items in a list provided they belong to them same field. + + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + download (bool, optional): True for download a local file, False to display results as a DataFrame. Defaults to False. + batch_size (int, optional): Size of batches to fetch the data. Defaults to 5000. + filename (str, optional): When download=True, select the name of the file. Defaults to 'batch_request'. + + + Returns: + pd.DataFrame: if download=False, displays a DataFrame with the results. + None: if download=True, displays a statement on the console and returns None. + """ + + # If params["rows"] is passed, the user is warned about no effect + if params.get("rows") is not None: + warnings.warn( + message='The "rows" param will be ignored in batch_solr_request. To set a batch size, specify a "batch_size" argument.', + category=RowsParamIgnored, + ) + + # Set params for batch request + params["start"] = 0 # Start at the first result + # Override param rows in case there was input. Defines batch size to 5000 by default + params["rows"] = batch_size + + # If user did not specify format, defaults to json. + if params.get("wt") is None: + params["wt"] = "json" + + # Check if it's multiple request + if params.get("field_list") is not None: + # Extract entities_list and entity_type from params + field_list = params.pop("field_list") + field_type = params.pop("field_type") + + # Construct the filter query with grouped model IDs + fq = "{}:({})".format( + field_type, " OR ".join(["{}".format(id) for id in field_list]) + ) + # Show users the field and field values they passed to the function + print("Queried field:", fq) + # Set internal params the users should not change + params["fq"] = fq + + # Determine the total number of rows. Note that we do not request any data (rows = 0). + num_results, _ = solr_request( + core=core, params={**params, "start": 0, "rows": 0, "wt": "json"}, silent=True + ) + print(f"Number of found documents: {num_results}") + + # # Download only logic + # If user decides to download, a generator is used to fetch data in batches without storing results in memory. + if download: + try: + # Check if the format is supported + DownloadFormatValidator(wt=params["wt"]) + + # Implement loop behaviour + print("Downloading file...") + filename_path = Path(f"{filename}.{params['wt']}") + gen = _batch_solr_generator(core, params, num_results) + _solr_downloader(params, filename_path, gen) + print(f"File saved as: {filename_path}") + except UnsupportedDownloadFormatError as e: + raise e + except Exception as e: + raise (f"An error ocurred while downloading the data:{e}") + + # Try to read the downloaded file + try: + print("Reading downloaded file...") + return _read_downloaded_file(filename_path, params["wt"]) + except Exception as e: + raise Exception(f"An unexpected error occured:{e}") + + # If the number of results is small enough and download is off, it's okay to show as df + if num_results < 1000000 and not download: + return _batch_to_df(core, params, num_results) + + # If it's too big, warn the user and ask if they want to proceed. + else: + print( + "Your request might exceed the available memory. 
We suggest setting 'download=True' and reading the file in batches" + ) + prompt = input( + "Do you wish to proceed anyway? press ('y' or enter to proceed) / type('n' or 'exit' to cancel)" + ) + match prompt: + case "n" | "exit": + print("Exiting gracefully") + exit() + case "y" | "": + print("Fetching data...") + return _batch_to_df(core, params, num_results) + + +# Helper batch_to_df +def _batch_to_df(core, params, num_results): + """Helper function to fetch data in batches and display them in a DataFrame + + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + num_results (int): Number of docs available + + Returns: + pd.DataFrame: DataFrame with the results. + """ + start = params["start"] + batch_size = params["rows"] + chunks = [] + # If the 'wt' param was changed by error, we set it to 'json' + params["wt"] = "json" + + # Request chunks until we have complete data. + with tqdm(total=num_results) as pbar: + while start < num_results: + # Request chunk. We don't need num_results anymore because it does not change. + _, df_chunk = solr_request( + core=core, + params={**params, "start": start, "rows": batch_size}, + silent=True, + ) + + # Update progress bar with the number of rows requested. + pbar.update(batch_size) + pbar.refresh() + # Record chunk. + chunks.append(df_chunk) + # Increment start. + start += batch_size + # Prepare final dataframe. + return pd.concat(chunks, ignore_index=True) + + +def _batch_solr_generator(core, params, num_results): + """Generator function to fetch results from the SOLR API in batches using pagination. + + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + num_results (int): Number of docs available + + Raises: + Exception: If a problem occurs during the download, an exception is raised. + + Yields: + ([dict, str]): A JSON object or plain text with the results. + """ + base_url = "https://www.ebi.ac.uk/mi/impc/solr/" + solr_url = base_url + core + "/select" + start = params["start"] + batch_size = params["rows"] + + with tqdm(total=num_results) as pbar: + while start <= num_results: + params["start"] = start + response = requests.get(solr_url, params=params, timeout=10) + + if response.status_code == 200: + if params.get("wt") == "json": + data = response.json()["response"]["docs"] + else: + data = response.text + + # Update and refresh the progress bar after getting the data + pbar.update(batch_size) + pbar.refresh() + yield data + + else: + raise Exception(f"Request failed. Status code: {response.status_code}") + + # pbar.update(batch_size) + start += batch_size + print(f"Your request URL after the last call:{response.url}") + + +# File writer +def _solr_downloader(params, filename, solr_generator): + """Function to write the data from the generator into the specified format. + Supports json and csv only. + + Args: + params (dict): dictionary containing the API call parameters. + filename (Path): name for the file to be downloaded. Defaults to "core.format" as passed by parent function. + solr_generator ([dict, str]): Generator object with the results. 
+ """ + with open(filename, "w", encoding="UTF-8") as f: + if params.get("wt") == "json": + f.write("[\n") + first_chunk = True + + for chunk in solr_generator: + for item in chunk: + if not first_chunk: + f.write(",\n") + json.dump(item, f, ensure_ascii=False) + first_chunk = False + f.write("\n]\n") + + elif params.get("wt") == "csv": + first_chunk = True + for chunk in solr_generator: + lines = chunk.splitlines() + if first_chunk: + # Write all lines in the first chunk + f.write(chunk) + first_chunk = False + else: + # Skip the first line (header) in subsequent chunks + f.write("\n" + "\n".join(lines[1:]) + "\n") + + +# File reader +def _read_downloaded_file(filename: Path, request_format): + """Wrapper for reading files into Pandas DataFrames + + Args: + filename (Path): Name of the file to read + request_format (str): Format of the file to read. Only 'json' and 'csv' are supported. + + Raises: + MemoryError: When there is not enough memory to read the file. + + Returns: + pd.DataFrame: Returns a pd.DataFrame with the data from the file. + """ + try: + match request_format: + case "json": + return pd.read_json(filename) + case "csv": + return pd.read_csv(filename) + except MemoryError as exc: + raise MemoryError( + "MemoryError: Insuficient memory to read the file. Consider reading file in batches using Pandas or Polars." + ) from exc diff --git a/impc_api_helper/impc_api_helper/iterator_solr_request.py b/impc_api_helper/impc_api_helper/iterator_solr_request.py deleted file mode 100644 index 6967384..0000000 --- a/impc_api_helper/impc_api_helper/iterator_solr_request.py +++ /dev/null @@ -1,145 +0,0 @@ -import csv -import json - -import requests - - -# Helper function to fetch results. This function is used by the 'iterator_solr_request' function. -def entity_iterator(base_url, params): - """Generator function to fetch results from the SOLR server in chunks using pagination - - Args: - base_url (str): The base URL of the Solr server to fetch documents from. - params (dict): A dictionary of parameters to include in the GET request. Must include - 'start' and 'rows' keys, which represent the index of the first document - to fetch and the number of documents to fetch per request, respectively. - - Yields: - dict: The next document in the response from the Solr server. - """ - # Initialise variable to check the first request - first_request = True - - # Call the API in chunks and yield the documents in each chunk - while True: - response = requests.get(base_url, params=params) - data = response.json() - docs = data["response"]["docs"] - - # Print the first request only - if first_request: - print(f"Your first request: {response.url}") - first_request = False - - # Yield the documents in the current chunk - for doc in docs: - yield doc - - # Check if there are more results to fetch - start = params["start"] + params["rows"] - num_found = data["response"]["numFound"] - if start >= num_found: - break - - # Update the start parameter for the next request - params["start"] = start - - # Print last request and total number of documents retrieved - print(f"Your last request: {response.url}") - print(f'Number of found documents: {data["response"]["numFound"]}\n') - - -# Function to iterate over field list and write results to a file. -def iterator_solr_request( - core, params, filename="iteration_solr_request", format="json" -): - """Function to fetch results in batches from the Solr API and write them to a file. - Defaults to fetching 5000 rows at a time. 
- Avoids cluttering local memory, ideal for large requests. - - Args: - core (str): The name of the Solr core to fetch results from. - params (dict): A dictionary of parameters to use in the filter query. Must include - 'field_list' and 'field_type' keys, which represent the list of field items (i.e., list of MGI model identifiers) - to fetch and the type of the field (i.e., model_id) to filter on, respectively. - filename (str): The name of the file/path to write the results to. Defaults to 'iteration_solr_request'. - format (str): The format of the output file. Can be 'csv' or 'json'. Defaults to 'json'. - - Returns: None - - Example use case: - # List of model IDs. - models = ['MGI:3587188', 'MGI:3587185', 'MGI:3605874', 'MGI:2668213'] - - # Call iterator function - iterator_solr_request( - core='phenodigm', - params = { - 'q': 'type:disease_model_summary', - 'fl': 'model_id,marker_id,disease_id', - 'field_list': models, - 'field_type': 'model_id' - }, - filename='model_ids', - format='csv' - ) - """ - - # Validate format - if format not in ["json", "csv"]: - raise ValueError("Invalid format. Please use 'json' or 'csv'") - - # Base URL - base_url = "https://www.ebi.ac.uk/mi/impc/solr/" - solr_url = base_url + core + "/select" - - # Extract entities_list and entity_type from params - field_list = params.pop("field_list") - field_type = params.pop("field_type") - - # Construct the filter query with grouped model IDs - fq = "{}:({})".format( - field_type, " OR ".join(['"{}"'.format(id) for id in field_list]) - ) - - # Show users the field and field values they passed to the function - print("Queried field:", fq) - # Set internal params the users should not change - params["fq"] = fq - params["wt"] = "json" - params["start"] = 0 # Start at the first result - params["rows"] = 5000 # Fetch results in chunks of 5000 - - try: - # Fetch results using a generator function - results_generator = entity_iterator(solr_url, params) - except Exception as e: - raise Exception("An error occurred while downloading the data: " + str(e)) - - # Append extension to the filename - filename = f"{filename}.{format}" - - try: - # Open the file in write mode - with open(filename, "w", newline="") as f: - if format == "csv": - writer = None - for item in results_generator: - # Initialize the CSV writer with the keys of the first item as the field names - if writer is None: - writer = csv.DictWriter(f, fieldnames=item.keys()) - writer.writeheader() - # Write the item to the CSV file - writer.writerow(item) - # Write to json without loading to memory. - elif format == "json": - f.write("[") - for i, item in enumerate(results_generator): - if i != 0: - f.write(",") - json.dump(item, f) - f.write("]") - except Exception as e: - raise Exception("An error occurred while writing the file: " + str(e)) - - print(f"File {filename} was created.") diff --git a/impc_api_helper/impc_api_helper/solr_request.py b/impc_api_helper/impc_api_helper/solr_request.py index 888bf14..01b7816 100644 --- a/impc_api_helper/impc_api_helper/solr_request.py +++ b/impc_api_helper/impc_api_helper/solr_request.py @@ -12,12 +12,12 @@ # Create helper function def solr_request(core, params, silent=False, validate=False): """Performs a single Solr request to the IMPC Solr API. - + Args: core (str): name of IMPC solr core. params (dict): dictionary containing the API call parameters. 
silent (bool, optional): default False - If True, displays: URL of API call, the number of found docs + If True, displays: URL of API call, the number of found docs and a portion of the DataFrame. validate (bool, optional): default False If True, validates the parameters against the core schema and raises warnings @@ -37,7 +37,7 @@ def solr_request(core, params, silent=False, validate=False): 'fl': 'marker_symbol,allele_symbol,parameter_stable_id', # Fields to retrieve. } ) - + Faceting query provides a summary of data distribution across the specified fields. Example faceting query: num_found, df = solr_request( @@ -53,7 +53,7 @@ def solr_request(core, params, silent=False, validate=False): ) When querying the phenodigm core, pass 'q': 'type:...' - Example phenodigm query: + Example phenodigm query: num_found, df = solr_request( core='phenodigm', params={ @@ -64,10 +64,7 @@ def solr_request(core, params, silent=False, validate=False): """ if validate: - CoreParamsValidator( - core=core, - params=params - ) + CoreParamsValidator(core=core, params=params) base_url = "https://www.ebi.ac.uk/mi/impc/solr/" solr_url = base_url + core + "/select" @@ -85,7 +82,7 @@ def solr_request(core, params, silent=False, validate=False): print(f"Number of found documents: {num_found}\n") # For faceting query. - if params.get('facet') == 'on': + if params.get("facet") == "on": df = _process_faceting(data, params) # For regular query. @@ -131,8 +128,8 @@ def _process_faceting(data, params): # Convert the list of dictionaries into a DataFrame and print the DataFrame. df = pd.DataFrame.from_dict( - faceting_dict, orient="index", columns=["counts"] - ).reset_index() + faceting_dict, orient="index", columns=["counts"] + ).reset_index() # Rename the columns. df.columns = [params["facet.field"], "count_per_category"] return df @@ -141,27 +138,27 @@ def _process_faceting(data, params): # Batch request based on solr_request. def batch_request(core, params, batch_size): """Calls `solr_request` multiple times with `params` - to retrieve results in chunk `batch_size` rows at a time. - - Passing parameter `rows` is ignored and replaced with `batch_size` - - Args: - core (str): name of IMPC solr core. - params (dict): dictionary containing the API call parameters. - batch_size (int): Size of batches (number of docs) per request. + to retrieve results in chunk `batch_size` rows at a time. - Returns: - pandas.DataFrame: Pandas.DataFrame object with the information requested. + Passing parameter `rows` is ignored and replaced with `batch_size` - Example query: - df = batch_request( - core="genotype-phenotype", - params={ - 'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"', - 'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size' - }, - batch_size=100 - ) + Args: + core (str): name of IMPC solr core. + params (dict): dictionary containing the API call parameters. + batch_size (int): Size of batches (number of docs) per request. + + Returns: + pandas.DataFrame: Pandas.DataFrame object with the information requested. 
+ + Example query: + df = batch_request( + core="genotype-phenotype", + params={ + 'q': 'top_level_mp_term_name:"cardiovascular system phenotype" AND effect_size:[* TO *] AND life_stage_name:"Late adult"', + 'fl': 'allele_accession_id,life_stage_name,marker_symbol,mp_term_name,p_value,parameter_name,parameter_stable_id,phenotyping_center,statistical_method,top_level_mp_term_name,effect_size' + }, + batch_size=100 + ) """ if "rows" in "params": diff --git a/impc_api_helper/impc_api_helper/temp.py b/impc_api_helper/impc_api_helper/temp.py new file mode 100644 index 0000000..1a7fe66 --- /dev/null +++ b/impc_api_helper/impc_api_helper/temp.py @@ -0,0 +1,19 @@ +from iterator_solr_request_2 import batch_solr_request +import pandas as pd + +markers = ['"Cthrc1"', '*11'] +df = batch_solr_request( + core="genotype-phenotype", + params={ + "q": "*:*", + "fl": "marker_symbol,mp_term_name,p_value", + 'field_list': markers, + 'field_type': 'marker_symbol' + }, + download=True, +) + +df = pd.read_json('genotype-phenotype.json', nrows=80000, lines=True) +# df = pd.read_csv('genotype-phenotype.csv', nrows=80000) +# df = pd.read_xml('genotype-phenotype.xml', parser='etree') +print(df.shape) diff --git a/impc_api_helper/impc_api_helper/utils/validators.py b/impc_api_helper/impc_api_helper/utils/validators.py index 1bc959a..665fbf2 100644 --- a/impc_api_helper/impc_api_helper/utils/validators.py +++ b/impc_api_helper/impc_api_helper/utils/validators.py @@ -1,18 +1,24 @@ -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, model_validator, field_validator import json from typing import List, Dict from pathlib import Path import warnings from dataclasses import dataclass, field -from impc_api_helper.utils.warnings import warning_config, InvalidCoreWarning, InvalidFieldWarning +from impc_api_helper.utils.warnings import ( + warning_config, + InvalidCoreWarning, + InvalidFieldWarning, + UnsupportedDownloadFormatError, +) # Initialise warning config warning_config() + # Dataclass for the json validator @dataclass class ValidationJson: - CORE_FILE: Path = Path(__file__).resolve().parent / 'core_fields.json' + CORE_FILE: Path = Path(__file__).resolve().parent / "core_fields.json" _validation_json: Dict[str, List[str]] = field(default_factory=dict, init=False) # Eager initialisation @@ -20,8 +26,8 @@ def __post_init__(self): self._validation_json = self.load_core_fields(self.CORE_FILE) def load_core_fields(self, filename: Path) -> Dict[str, List[str]]: - with open(filename, "r") as f: - return json.load(f) + with open(filename, "r") as f: + return json.load(f) def valid_cores(self): return self._validation_json.keys() @@ -29,6 +35,7 @@ def valid_cores(self): def valid_fields(self, core: str) -> List[str]: return self._validation_json.get(core, []) + # Function to parse the fields (fl) params in params def get_fields(fields: str) -> List[str]: return fields.split(",") @@ -38,7 +45,7 @@ class CoreParamsValidator(BaseModel): core: str params: Dict - @model_validator(mode='before') + @model_validator(mode="before") @classmethod def validate_core_and_fields(cls, values): invalid_core: bool = False @@ -53,7 +60,8 @@ def validate_core_and_fields(cls, values): invalid_core = True warnings.warn( message=f'Invalid core: "{core}", select from the available cores:\n{jv.valid_cores()})\n', - category=InvalidCoreWarning) + category=InvalidCoreWarning, + ) # Compare passed fl values vs the allowed fl values for a given core fields: str = params.get("fl") @@ -66,13 +74,28 @@ def 
validate_core_and_fields(cls, values): # Get the fields passed to params and the expected fields for the core field_list: List[str] = get_fields(fields) - # Validate each field in params - # TODO: perhaps pass al invalid fields as a list, instead of many warning messages if invalid_core is not True: for fl in field_list: if fl not in jv.valid_fields(core): - warnings.warn(message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""", - category=InvalidFieldWarning) + warnings.warn( + message=f"""Unexpected field name: "{fl}". Check the spelling of fields.\nTo see expected fields check the documentation at: https://www.ebi.ac.uk/mi/impc/solrdoc/""", + category=InvalidFieldWarning, + ) # Return validated values return values + + +class DownloadFormatValidator(BaseModel): + """Validates params["wt"] from a batch_request""" + + wt: str + + @field_validator("wt") + def validate_wt(cls, value): + supported_formats = {"json", "csv"} + if value not in supported_formats: + raise UnsupportedDownloadFormatError( + f"Unsupported format '{value}'. Only {supported_formats} are supported for download." + ) + return value diff --git a/impc_api_helper/impc_api_helper/utils/warnings.py b/impc_api_helper/impc_api_helper/utils/warnings.py index 6f154f9..9c73b4c 100644 --- a/impc_api_helper/impc_api_helper/utils/warnings.py +++ b/impc_api_helper/impc_api_helper/utils/warnings.py @@ -5,11 +5,20 @@ # Custom warnings class InvalidCoreWarning(Warning): - """Exception raised when the core is not in the expected core names""" + """Warning raised when the core is not in the expected core names""" class InvalidFieldWarning(Warning): - """Exception raised when the field name is not in the expected fields""" + """Warning raised when the field name is not in the expected fields""" + + +class RowsParamIgnored(Warning): + """Warning raised when the row param is ignored""" + + +# custom exceptions +class UnsupportedDownloadFormatError(Exception): + """Exception raised when the format is not supported for download""" # Custom warning function @@ -17,7 +26,7 @@ def warning_config(): """Customises formatting and filters for warnings""" def custom_warning(message, category, filename, lineno, line=None): - return f'{category.__name__}: {message}\n' + return f"{category.__name__}: {message}\n" warnings.formatwarning = custom_warning warnings.simplefilter("always", Warning) diff --git a/impc_api_helper/tests/test_batch_solr_request.py b/impc_api_helper/tests/test_batch_solr_request.py new file mode 100644 index 0000000..a595efe --- /dev/null +++ b/impc_api_helper/tests/test_batch_solr_request.py @@ -0,0 +1,745 @@ +import pytest +from pathlib import Path +from unittest.mock import patch, call, Mock +from impc_api_helper.batch_solr_request import ( + batch_solr_request, + _batch_solr_generator, + solr_request, + _batch_to_df, + _solr_downloader, + _read_downloaded_file, +) +from impc_api_helper.utils.warnings import ( + RowsParamIgnored, + UnsupportedDownloadFormatError, +) +import json +import pandas as pd +from pandas.testing import assert_frame_equal + + +# When rows is passed to batch solr request, a warning is raised. 
+# Let's ignore this warning in all tests except the one that asserts the warning +pytestmark = pytest.mark.filterwarnings( + "ignore::impc_api_helper.utils.warnings.RowsParamIgnored" +) + + +# Fixture containing the core +@pytest.fixture +def core(): + return "test_core" + + +# Fixture to create a temporary file for use in tests +@pytest.fixture(scope="function") +def temp_file_fixture( + tmp_path, +): + temp_dir = tmp_path / "temp_dir" + temp_dir.mkdir(exist_ok=True) + return temp_dir / "test_file" + + +class TestBatchSolrRequest: + # Fixture containing the params of a normal batch_solr_request + @pytest.fixture + def common_params(self): + return {"start": 0, "rows": 0, "wt": "json"} + + # Fixture mocking solr_request within the batch_solr_request module. + # solr_request will be mocked with different values for numFound, therefore it is passed as param + @pytest.fixture + def mock_solr_request(self, request): + with patch("impc_api_helper.batch_solr_request.solr_request") as mock: + # Mock expected return content of the solr_request (numFound and df) + mock.return_value = (request.param, pd.DataFrame()) + yield mock + + # Fixture mocking _batch_to_df + @pytest.fixture + def mock_batch_to_df(self): + with patch("impc_api_helper.batch_solr_request._batch_to_df") as mock: + # Mock expected return content of the _batch_to_df (pd.DataFrame) + mock.return_value = pd.DataFrame() + yield mock + + # Test no download - small request + # Parameters to determine the numFound of mock_solr_request + @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True) + def test_batch_solr_request_no_download_small_request( + self, mock_solr_request, core, common_params, capsys, mock_batch_to_df + ): + # Call tested function + result = batch_solr_request( + core, params=common_params, download=False, batch_size=100 + ) + + # Assert the value of params was changed to batch_size + assert common_params["rows"] == 100 + + # Assert the mock was called with the expected parameters (start = 0, rows = 0) despite calling other values. + mock_solr_request.assert_called_with( + core=core, + params={**common_params, "start": 0, "rows": 0, "wt": "json"}, + silent=True, + ) + + # Retrieve the numFound + num_found = mock_solr_request.return_value[0] + # Capture stoud + captured = capsys.readouterr() + assert captured.out == f"Number of found documents: {num_found}\n" + + # Check _batch_to_df was called + mock_batch_to_df.assert_called_once() + + # Test no download - large request + # Set mock_solr_request to return a large numFound + @pytest.mark.parametrize("mock_solr_request", [1000001], indirect=True) + # Parameter to test 4 cases: when user selects 'y','' or 'n','exit' upon large download warning. + @pytest.mark.parametrize( + "user_input,expected_outcome", + [("y", "continue"), ("", "continue"), ("n", "exit"), ("exit", "exit")], + ) + def test_batch_solr_request_download_false_large_request( + self, + core, + common_params, + capsys, + monkeypatch, + mock_batch_to_df, + mock_solr_request, + user_input, + expected_outcome, + ): + # Monkeypatch the input() function with parametrized user input + monkeypatch.setattr("builtins.input", lambda _: user_input) + + # Set a batch_size for clarity + batch_size = 500000 + + # When user types 'n' or 'exit', exit should be triggered. 
+ if expected_outcome == "exit": + with pytest.raises(SystemExit): + batch_solr_request( + core, params=common_params, download=False, batch_size=batch_size + ) + else: + result = batch_solr_request( + core, params=common_params, download=False, batch_size=batch_size + ) + + # Capture the exit messages + captured = capsys.readouterr() + + # Retrieve numFound + num_found = mock_solr_request.return_value[0] + + # Assertions for continue case + assert f"Number of found documents: {num_found}" in captured.out + + if expected_outcome == "continue": + assert ( + "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" + in captured.out + ) + mock_batch_to_df.assert_called_with( + "test_core", {"start": 0, "rows": batch_size, "wt": "json"}, num_found + ) + + # Assertion for exit case + elif expected_outcome == "exit": + assert "Exiting gracefully" in captured.out + mock_batch_to_df.assert_not_called() + + # Test download - large request + # Fixture mocking _batch_solr_generator + @pytest.fixture + def mock_batch_solr_generator(self): + with patch("impc_api_helper.batch_solr_request._batch_solr_generator") as mock: + yield mock + + # Fixture mocking _solr_downloader. Yields a tmp_path to write a file for the duration of the test. + @pytest.fixture + def mock_solr_downloader(self, tmp_path): + with patch("impc_api_helper.batch_solr_request._solr_downloader") as mock: + temp_dir = Path(tmp_path) / "temp_dir" + temp_dir.mkdir() + yield mock + + # Mock response for test containing 2,000,000 docs + @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True) + # Parametrized decorator to simulate reading a json and csv files + @pytest.mark.parametrize( + "params_format, format, file_content", + [ + ( + {"start": 0, "rows": 0, "wt": "json"}, + "json", + '[{"id": "1", "city": "Houston"},{"id": "2", "city": "Prague"}]', + ), + ( + {"start": 0, "rows": 0, "wt": "csv"}, + "csv", + "id,city\n1,Houston\n2,Prague\n", + ), + ], + ) + # This test should check the correct helper functions and print statements are called. + # Calling the API and writing the file are tested within the helpers. 
+ def test_batch_solr_request_download_true( + self, + core, + capsys, + mock_solr_request, + mock_batch_solr_generator, + mock_solr_downloader, + params_format, + format, + file_content, + temp_file_fixture, + ): + # Write the file with corresponding content + file_and_format = f"{temp_file_fixture}.{format}" + Path(file_and_format).write_text(file_content) + + # First we call the function + # We patch solr_request to get the number of docs + result = batch_solr_request( + core, + params=params_format, + download=True, + filename=temp_file_fixture, + batch_size=2000000, + ) + num_found = mock_solr_request.return_value[0] + + # Assert params["rows"] == batch size and not the original value (0) + assert params_format["rows"] == 2000000 + + # Check _batch_solr_generator gets called once with correct args + mock_batch_solr_generator.assert_called_once_with( + core, params_format, num_found + ) + + # Check _solr_downloader gets called once with correct args + # Checks the filename is a Path and has the corresponding format + mock_solr_downloader.assert_called_once_with( + params_format, Path(file_and_format), mock_batch_solr_generator.return_value + ) + + # Check the print statements + captured = capsys.readouterr() + assert f"Number of found documents: {num_found}" in captured.out + assert f"File saved as: {file_and_format}" in captured.out + + # Check the function returns a df with expected content + # Assert the structure of the final df + assert_frame_equal( + result, + pd.DataFrame( + { + "id": [1, 2], + "city": ["Houston", "Prague"], + } + ).reset_index(drop=True), + ) + + # Test the download validates parameters + # Mock response for test containing 2,000,000 docs + @pytest.mark.parametrize("mock_solr_request", [2000000], indirect=True) + def test_batch_solr_request_download_true_validate_params_wt( + self, core, mock_solr_request + ): + # Set a filename for the test + filename = f"{core}" + params = {"start": 0, "rows": 0, "wt": "wrong_format"} + + # Assert exception when the format is unsupported + if format != "json" and format != "csv": + with pytest.raises(UnsupportedDownloadFormatError): + batch_solr_request( + core, + params=params, + download=True, + filename=filename, + batch_size=2000000, + ) + + # Test download - multiple fields - large and small + # Mock params for a multiple field query + @pytest.fixture + def multiple_field_params(self): + return { + "q": "*:*", + "rows": 0, + "start": 0, + "field_list": ['"orange"', "apple", "*berry"], + "field_type": "fruits", + "wt": "json", + } + + # Mock response for test containing a large request and a small request + @pytest.mark.parametrize("mock_solr_request", [(2000000), (10000)], indirect=True) + @pytest.mark.parametrize( + "download_bool", + [(True), (False)], + ) + def test_batch_solr_request_multiple_fields( + self, + core, + multiple_field_params, + capsys, + mock_solr_request, + mock_batch_solr_generator, + download_bool, + monkeypatch, + mock_batch_to_df, + mock_solr_downloader, + temp_file_fixture, + ): + # This test should ensure the request is formatted properly. Regardless of going to downloads or to _batch_to_df + # Retrieve num_found + num_found = mock_solr_request.return_value[0] + # When download=False and numFound is > 1,000,001 we pass 'y' in this test case. 
+ if not download_bool and num_found == 2000000: + monkeypatch.setattr("builtins.input", lambda _: "y") + + # Call test function + # If download==True, create a temporary file and call with the path_to_download + if download_bool: + # Write the file with corresponding content + file_content = '[{"id": "1", "city": "Cape Town"}]\n' + file_and_format = f"{temp_file_fixture}.json" + Path(file_and_format).write_text(file_content) + + result = batch_solr_request( + core, + params=multiple_field_params, + download=download_bool, + filename=temp_file_fixture, + ) + else: + # Otherwise, call without the path_to_download + result = batch_solr_request( + core, + params=multiple_field_params, + download=download_bool, + batch_size=5000, + ) + + # Check output which should be equal for both. + captured = capsys.readouterr() + assert f"Number of found documents: {num_found}" in captured.out + assert 'Queried field: fruits:("orange" OR apple OR *berry)' in captured.out + + # If download==True, check subsequent functions were executed + if download_bool: + # Check _batch_solr_generator gets called with correct args + mock_batch_solr_generator.assert_called_with( + core, multiple_field_params, num_found + ) + + # Check _solr_downloader gets called once with correct args + mock_solr_downloader.assert_called_once_with( + multiple_field_params, + Path(file_and_format), + mock_batch_solr_generator.return_value, + ) + + # Check the function returns a df with expected content + # Assert the structure of the final df + assert_frame_equal( + result, + pd.DataFrame( + { + "id": [1], + "city": ["Cape Town"], + } + ).reset_index(drop=True), + ) + + # Otherwise, use the 'y' input at the start of the test and make sure the required function is executed. + if not download_bool and num_found == 2000000: + assert ( + "Your request might exceed the available memory. We suggest setting 'download=True' and reading the file in batches" + in captured.out + ) + # Check _batch_to_df was called with correct params + mock_batch_to_df.assert_called_once_with( + core, multiple_field_params, num_found + ) + + # Check the function returns a dataframe + assert result is not None + assert isinstance(result, pd.DataFrame) is True + + # Test the warning when params["rows"] is passed + @pytest.mark.filterwarnings( + "default::impc_api_helper.utils.warnings.RowsParamIgnored" + ) + @pytest.mark.parametrize("mock_solr_request", [10000], indirect=True) + def test_param_rows_warning(core, common_params, mock_solr_request): + with pytest.warns(RowsParamIgnored): + batch_solr_request(core, params=common_params) + + +# Have helper functions in a different class to separate fixtures and parameters +class TestHelpersSolrBatchRequest: + # Define a generator to produce df's dynamically + def data_generator(self): + """Generator to produce data dynamically (row by row or doc by doc)/ + + Yields: + Tuple: tuple containing an id number and a value + """ + # Values for the dataframes + animals = ["Bull", "Elephant", "Rhino", "Monkey", "Snake"] + # Yield a tuple containing an id number and an animal string + for i, a in enumerate(animals): + yield (i, a) + + # Fixture mocking solr_request in the batch_solr_request module + # Num_found is passed dynamically as params during the test + # Generates df's dynamically using the data generator + @pytest.fixture + def mock_solr_request_generator(self, request): + """Patches solr_request for _batch_to_df _batch_solr_generator producing a df dynamically. 
+ Creates a df in chunks (row by row) mocking incoming batches of responses. + """ + with patch("impc_api_helper.batch_solr_request.solr_request") as mock: + # Call the generator + data_generator = self.data_generator() + + # Use the side_effects to return num_found and the dfs + def side_effect(*args, **kwargs): + # Get the tuple from the data generator + idx, animal = next(data_generator) + # Create a df + df = pd.DataFrame({"id": [idx], "animal": [animal]}) + return request.param, df + + mock.side_effect = side_effect + yield mock + + # Fixture containing the params of a normal batch_solr_request with flexible number of rows (batch_size). + @pytest.fixture + def batch_params(self, batch_size): + return {"start": 0, "rows": batch_size, "wt": "json"} + + # Fixture to pass different num_found values per test + @pytest.fixture + def num_found(self, request): + return request.param + + # Parameters to be passsed to the test: a num_found value for mock_solr_request_generator, a num_found separately, and rows (batch_size). + # Note num_found is returned by solr_request, when we access it using the generator function, it causes issues. + # Hence, we pass num_found separately as a fixture. + @pytest.mark.parametrize( + "mock_solr_request_generator,num_found,batch_size", + [(50000, 50000, 10000), (5, 5, 1), (25000, 25000, 5000)], + indirect=["mock_solr_request_generator"], + ) + def test_batch_to_df( + self, core, batch_params, num_found, mock_solr_request_generator, batch_size + ): + # Call the tested function + df = _batch_to_df(core, batch_params, num_found) + + # Assert solr_request was called with the expected params and increasing start + expected_calls = [ + call( + core=core, + params={**batch_params, "start": i * batch_size, "rows": batch_size}, + silent=True, + ) + for i in range(5) + ] + mock_solr_request_generator.assert_has_calls(expected_calls) + + # Assert the structure of the final df + assert_frame_equal( + df, + pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "animal": ["Bull", "Elephant", "Rhino", "Monkey", "Snake"], + } + ).reset_index(drop=True), + ) + + # Test _batch_solr_generator + # Fixture to mock the requests module + @pytest.fixture + def mock_requests_get(self, request): + with patch("impc_api_helper.batch_solr_request.requests.get") as mock_get: + # Capture the format of the response + wt = request.param["wt"] + mock_get.return_value.format = wt + + # Get the status code and + mock_get.return_value.status_code = request.param["status_code"] + + # Call the generator + data_generator = self.data_generator() + + # Use the side_effects to return num_found and the response data + def side_effect(*args, **kwargs): + # Create a mock response object + mock_response = Mock() + mock_response.status_code = 200 + + # Get the tuple from the data generator + _, animal = next(data_generator) + + # Create type of response + # if json + if wt == "json": + mock_response.json.return_value = { + "response": {"docs": [{"id": animal}]} + } + # if csv + if wt == "csv": + mock_response.text = f"id,\n{animal}" + + return mock_response + + # Assign the side effect + mock_get.side_effect = side_effect + + yield mock_get + + # Fixture containing the params for batch_solr_generator + @pytest.fixture + def batch_solr_generator_params(self): + return {"q": "*:*", "start": 0, "rows": 1} + + # Parameters with the params for fixtures and the expected results + @pytest.mark.parametrize( + "mock_requests_get,expected_results", + [ + ( + {"wt": "json", "status_code": 200}, + [ + [{"id": "Bull"}], + [{"id": 
"Elephant"}], + [{"id": "Rhino"}], + [{"id": "Monkey"}], + [{"id": "Snake"}], + ], + ), + ( + {"wt": "csv", "status_code": 200}, + [ + "id,\nBull", + "id,\nElephant", + "id,\nRhino", + "id,\nMonkey", + "id,\nSnake", + ], + ), + ], + indirect=["mock_requests_get"], + ) + def test_batch_solr_generator( + self, core, batch_solr_generator_params, mock_requests_get, expected_results + ): + # Define num_found + num_results = 5 + # Define the wt and batch_size param for the test + batch_solr_generator_params["wt"] = mock_requests_get.return_value.format + batch_size = 1 + + # Override rows as the parent function would + batch_solr_generator_params["rows"] = batch_size + + # Call the generator + result = _batch_solr_generator(core, batch_solr_generator_params, num_results) + + # Loop over the expected results and check corresponding calls + for idx, exp_result in enumerate(expected_results, start=0): + # Call the next iteration + assert next(result) == exp_result + + # Check requests.get was called with the correct url, params [especially, the 'start' param], and timeout. + # The first call will always be with the params["rows"] value, 1 in this case. + # Since the function + mock_requests_get.assert_called_with( + "https://www.ebi.ac.uk/mi/impc/solr/test_core/select", + params={ + **batch_solr_generator_params, + "start": idx, + "rows": batch_size, + }, + timeout=10, + ) + + # Simpler approach to test when status code is not 200 + # Fixture to mock requests.get returning a status code. + @pytest.fixture + def mock_requests_get_error(self, request): + with patch("impc_api_helper.batch_solr_request.requests.get") as mock_get: + mock_get.return_value.status_code = request.param + yield mock_get + + # Params for _batch_solr_generator when status code is not 200 + @pytest.mark.parametrize( + "mock_requests_get_error", [404, 500], indirect=["mock_requests_get_error"] + ) + def test_batch_solr_generator_error( + self, core, batch_solr_generator_params, mock_requests_get_error + ): + # Get status code: + status_code = mock_requests_get_error.return_value.status_code + # Call the generator and expect an exception to be raised + # Note the num_found is passed but the number itself does not matter + # Note list() is needed so that the generator is iterated otherwise exception is never reached. + with pytest.raises( + Exception, match=f"Request failed. Status code: {status_code}" + ): + _ = list( + _batch_solr_generator( + core=core, params=batch_solr_generator_params, num_results=4 + ) + ) + + # Fixture to mock _solr_generator. 
+ @pytest.fixture + def mock_solr_generator(self, request): + """ + Mocks a generator yielding 2 batches/chunks to the tested function + """ + format = request.param + if format == "json": + + def data_chunks(): + chunk_1 = [ + {"id": idx, "number": number} + for idx, number in enumerate(range(0, 3)) + ] + chunk_2 = [ + {"id": idx, "number": number} + for idx, number in enumerate(range(100, 97, -1), start=3) + ] + + yield chunk_1 + yield chunk_2 + + yield data_chunks() + elif format == "csv": + + def data_chunks(): + chunk_1 = "id,number\n" + "\n".join( + f"{idx},{number}" for idx, number in enumerate(range(0, 3)) + ) + chunk_2 = "id,number\n" + "\n".join( + f"{idx},{number}" + for idx, number in enumerate(range(100, 97, -1), start=3) + ) + + yield chunk_1 + yield chunk_2 + + yield data_chunks() + + # Parameters for test function, one for the fixture and one as the expected format + @pytest.mark.parametrize( + "mock_solr_generator, expected_format", + [("json", "json"), ("csv", "csv")], + indirect=["mock_solr_generator"], + ) + # Test the writer + def test_solr_downloader( + self, + mock_solr_generator, + batch_solr_generator_params, + expected_format, + tmp_path, + ): + # Define the data generator and path to the temporary file to write + solr_gen = mock_solr_generator + path = Path(tmp_path) + file = "test." + expected_format + test_file = path / file + + # Call the tested function + _solr_downloader( + params={**batch_solr_generator_params, "wt": expected_format}, + filename=test_file, + solr_generator=solr_gen, + ) + + # Read the downloaded file and check it contains the expected data for json and csv. + with open(test_file, "r", encoding="UTF-8") as f: + if expected_format == "json": + content = json.load(f) + assert content == [ + {"id": 0, "number": 0}, + {"id": 1, "number": 1}, + {"id": 2, "number": 2}, + {"id": 3, "number": 100}, + {"id": 4, "number": 99}, + {"id": 5, "number": 98}, + ] + # Load data into a df + test_df = pd.read_json(test_file) + + elif expected_format == "csv": + content = f.read() + + assert content == "id,number\n0,0\n1,1\n2,2\n3,100\n4,99\n5,98\n" + # Load data into a df + test_df = pd.read_csv(test_file) + + # Assert the structure of the final df + assert_frame_equal( + test_df, + pd.DataFrame( + { + "id": [0, 1, 2, 3, 4, 5], + "number": [0, 1, 2, 100, 99, 98], + } + ).reset_index(drop=True), + ) + + @pytest.mark.parametrize( + "request_format,content", + [ + ( + "json", + '[{"id": "1", "city": "Cape Town"},{"id": "2", "city": "Prague"}]', + ), + ( + "csv", + "id,city\n1,Cape Town\n2,Prague\n", + ), + ], + ) + def test_read_downloaded_file(self, request_format, content, temp_file_fixture): + # Write the file with corresponding content + temp_file_fixture.write_text(content) + + test_df = _read_downloaded_file(temp_file_fixture, request_format) + + # Assert the structure of the final df + assert_frame_equal( + test_df, + pd.DataFrame( + { + "id": [1, 2], + "city": ["Cape Town", "Prague"], + } + ).reset_index(drop=True), + ) + + def test_read_downloaded_file_memory_error(self, temp_file_fixture): + content = "id,city\n1,Cape Town\n2,Prague\n" + temp_file_fixture.write_text(content) + + # Create a mock that raises a memory error when called + with patch("pandas.read_csv", side_effect=MemoryError("Mock MemoryError")): + with pytest.raises( + MemoryError, match="Insuficient memory to read the file." 
+ ): + _ = _read_downloaded_file(temp_file_fixture, "csv") diff --git a/impc_api_helper/tests/test_iterator_solr_request.py b/impc_api_helper/tests/test_iterator_solr_request.py deleted file mode 100644 index e69de29..0000000