Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update ted_api.py #524

Merged
merged 1 commit into from
Mar 13, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 42 additions & 30 deletions ted_sws/notice_fetcher/adapters/ted_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import time
from datetime import date
from http import HTTPStatus
from typing import List, Generator
from typing import List, Generator, Callable, Optional

import requests
from requests import Response

from ted_sws import config
from ted_sws.event_manager.services.log import log_error
from ted_sws.event_manager.services.log import log_error, log_warning
from ted_sws.notice_fetcher.adapters.ted_api_abc import TedAPIAdapterABC, RequestAPI

DOCUMENTS_PER_PAGE = 100
Expand All @@ -30,6 +31,34 @@
ENGLISH_LANGUAGE_CONTENT_KEY = "ENG"
DOCUMENT_NOTICE_ID_KEY = "ND"

# Extra HTTP headers merged over the `requests` default headers for outgoing API calls.
CUSTOM_HEADER = {'User-Agent': 'TED-SWS-Pipeline-Fetcher'}
# Default upper bound on the number of retries after a request returns a non-OK status.
MAX_RETRIES = 5
# Default multiplier for the retry delay: the n-th retry waits backoff_factor * n seconds.
DEFAULT_BACKOFF_FACTOR = 1


def execute_request_with_retries(request_lambda: Callable,
                                 max_retries: int = MAX_RETRIES,
                                 backoff_factor: float = DEFAULT_BACKOFF_FACTOR) -> Response:
    """
        Run a request and repeat it until it answers with HTTP 200 (OK) or the
        retry budget is used up.

        Before the n-th retry the function sleeps for ``backoff_factor * n``
        seconds, so the delay grows linearly with the number of retries.
    :param request_lambda: zero-argument callable that performs the request and
        returns an object exposing ``status_code``
    :param max_retries: maximum number of retries after the initial request
    :param backoff_factor: multiplier applied to the retry number to get the sleep time
    :return: the last response received, whether or not its status is OK
    """
    retries_done = 0
    while True:
        response = request_lambda()
        if response.status_code == HTTPStatus.OK:
            return response
        if retries_done >= max_retries:
            # Budget exhausted: hand the failing response back so the caller decides what to do.
            log_warning(f"Max retries exceeded, retried {max_retries} times!")
            return response
        retries_done += 1
        time_to_sleep = backoff_factor * retries_done
        log_warning(f"Request returned status code {response.status_code}, retrying in {time_to_sleep} seconds!")
        time.sleep(time_to_sleep)


def get_configured_custom_headers(custom_header: Optional[dict] = None) -> dict:
    """
        Build the request headers: the ``requests`` default headers, optionally
        overridden or extended by the given custom entries.
    :param custom_header: extra header entries to merge over the defaults;
        ignored when it is None or empty
    :return: the resulting headers mapping
    """
    configured_headers = requests.utils.default_headers()
    if not custom_header:
        return configured_headers
    configured_headers.update(custom_header)
    return configured_headers


class TedRequestAPI(RequestAPI):

Expand All @@ -40,15 +69,9 @@ def __call__(self, api_url: str, api_query: dict) -> dict:
:param api_query:
:return: dict
"""

response = requests.post(api_url, json=api_query)
try_again_request_count = 0
while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
try_again_request_count += 1
time.sleep(try_again_request_count * 0.1)
response = requests.post(api_url, json=api_query)
if try_again_request_count > 5:
break
headers = get_configured_custom_headers(CUSTOM_HEADER)
response = execute_request_with_retries(
request_lambda=lambda: requests.post(api_url, json=api_query, headers=headers))
if response.ok:
response_content = json.loads(response.text)
return response_content
Expand Down Expand Up @@ -108,14 +131,9 @@ def _retrieve_document_content(self, document_content: dict) -> str:
log_error(exception_message)
raise Exception(exception_message)
xml_document_content_link = xml_links[MULTIPLE_LANGUAGE_CONTENT_KEY]
response = requests.get(xml_document_content_link)
try_again_request_count = 0
while response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
try_again_request_count += 1
time.sleep(try_again_request_count * 0.1)
response = requests.get(xml_document_content_link)
if try_again_request_count > 5:
break
headers = get_configured_custom_headers(CUSTOM_HEADER)
response = execute_request_with_retries(
request_lambda=lambda: requests.get(xml_document_content_link, headers=headers))
if response.ok:
return response.text
else:
Expand All @@ -142,17 +160,11 @@ def get_generator_by_query(self, query: dict, result_fields: dict = None, load_c
response_body = self.request_api(api_url=self.ted_api_url, api_query=query)
documents_content += response_body[RESPONSE_RESULTS]

for document_content in documents_content:
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content
else:
for document_content in documents_content:
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content
for document_content in documents_content:
if load_content:
document_content[DOCUMENT_CONTENT] = self._retrieve_document_content(document_content)
del document_content[LINKS_TO_CONTENT_KEY]
yield document_content

def get_by_query(self, query: dict, result_fields: dict = None, load_content: bool = True) -> List[dict]:
"""
Expand Down
Loading