From 494e386cab90c06db9c1d9be6925d11e76411962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 09:39:05 +0100 Subject: [PATCH 1/4] Fix year_limited deduplication year_limited decorator (which appropriately splits API calls that should be limited to a year per call) used deduplication based on index timestamp. This was probably due to API partial matching (request one day but receive full month) and potential overlap of year blocks at the ends of interval. This commit introduces truncation of each year block to its nominal start and end. Index deduplication is removed. This should remove index duplicates stemming from partial matching and interval overlap but keep the duplicates served by the API (e.g. due to corrections). --- entsoe/decorators.py | 14 +++++++++++++- tests.py | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 1cfeeb1..361a036 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -98,6 +98,19 @@ def year_wrapper(*args, start=None, end=None, **kwargs): for _start, _end in blocks: try: frame = func(*args, start=_start, end=_end, **kwargs) + # Due to partial matching func may return data indexed by + # timestamps outside _start and _end. In order to avoid + # (unintentionally) repeating records, frames are truncated to + # left-open intervals. Additionally, second disjunct forces the + # earliest block to be a closed interval. + # + # If there are repeating records in a single frame (e.g. due + # to corrections) then the result will also have them. 
+ interval_mask = ( + ((frame.index > _start) & (frame.index <= _end)) + | (frame.index == start) + ) + frame = frame.loc[interval_mask] except NoMatchingDataError: logger.debug(f"NoMatchingDataError: between {_start} and {_end}") frame = None @@ -108,7 +121,6 @@ def year_wrapper(*args, start=None, end=None, **kwargs): raise NoMatchingDataError df = pd.concat(frames, sort=True) - df = df.loc[~df.index.duplicated(keep='first')] return df return year_wrapper diff --git a/tests.py b/tests.py index 75c7a00..ce1e7c2 100644 --- a/tests.py +++ b/tests.py @@ -135,6 +135,27 @@ def test_query_procured_balancing_capacity(self): ) self.assertIsInstance(ts, pd.DataFrame) + def test_year_limited_truncation(self): + """ + This is a specific example of polish operator correcting the data + i.e. there was an additional monthly auction for this period. + This results in duplicated time indices. + + source: https://www.pse.pl/web/pse-eng/cross-border-electricity-exchange/auction-office/rzeszow-chmielnicka-interconnection/auction-results # noqa + """ + start = pd.Timestamp('2023-07-17 00:00:00', tz='Europe/Warsaw') + end = pd.Timestamp('2023-08-01 00:00:00', tz='Europe/Warsaw') + ts = self.client.query_offered_capacity( + 'UA_IPS', 'PL', + start=start, end=end, + contract_marketagreement_type='A03', + implicit=False + ) + total_hours = int((end - start).total_seconds()/60/60) + # Expected behaviour is to keep both initial data and corrections + # and leave the deduplication to the user. + self.assertEqual(total_hours*2, ts.shape[0]) + if __name__ == '__main__': unittest.main() From 1e058cd566f92f4fcddd2cf209f008deb687d24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 10:25:35 +0100 Subject: [PATCH 2/4] Fix type checking in year_limited Calls to type and literal comparisons were changed to isinstance as recommended by docs of type built-in. 
--- entsoe/decorators.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 361a036..c85c229 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -88,7 +88,10 @@ def year_wrapper(*args, start=None, end=None, **kwargs): if start is None or end is None: raise Exception('Please specify the start and end date explicity with start= when calling this ' 'function') - if type(start) != pd.Timestamp or type(end) != pd.Timestamp: + if ( + not isinstance(start, pd.Timestamp) + or not isinstance(end, pd.Timestamp) + ): raise Exception('Please use a timezoned pandas object for start and end') if start.tzinfo is None or end.tzinfo is None: raise Exception('Please use a timezoned pandas object for start and end') From 1370246f3aadf24006e49f0ce06167dd28a9e717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 10:43:16 +0100 Subject: [PATCH 3/4] Add code quality fixes for decorators PEP8 compliance, logging and import sorting. 
--- entsoe/decorators.py | 52 ++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index c85c229..918d0c2 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -1,13 +1,13 @@ -import sys +import logging +from functools import wraps from socket import gaierror from time import sleep -import requests -from functools import wraps -from .exceptions import NoMatchingDataError, PaginationError + import pandas as pd -import logging +import requests -from .misc import year_blocks, day_blocks +from .exceptions import NoMatchingDataError, PaginationError +from .misc import day_blocks, year_blocks logger = logging.getLogger(__name__) @@ -24,8 +24,10 @@ def retry_wrapper(*args, **kwargs): result = func(*args, **kwargs) except (requests.ConnectionError, gaierror) as e: error = e - print("Connection Error, retrying in {} seconds".format( - self.retry_delay), file=sys.stderr) + logger.warning( + "Connection Error, " + f"retrying in {self.retry_delay} seconds" + ) sleep(self.retry_delay) continue else: @@ -53,9 +55,11 @@ def pagination_wrapper(*args, start, end, **kwargs): return pagination_wrapper + def documents_limited(n): def decorator(func): - """Deals with calls where you cannot query more than n documents at a time, by offsetting per n documents""" + """Deals with calls where you cannot query more than n documents at a + time, by offsetting per n documents""" @wraps(func) def documents_wrapper(*args, **kwargs): @@ -80,21 +84,27 @@ def documents_wrapper(*args, **kwargs): def year_limited(func): - """Deals with calls where you cannot query more than a year, by splitting - the call up in blocks per year""" + """Deals with calls where you cannot query more than a year, + by splitting the call up in blocks per year""" @wraps(func) def year_wrapper(*args, start=None, end=None, **kwargs): if start is None or end is None: - raise Exception('Please specify the start and end 
date explicity with start= when calling this ' - 'function') + raise Exception( + 'Please specify the start and end date explicity with' + 'start= when calling this function' + ) if ( not isinstance(start, pd.Timestamp) or not isinstance(end, pd.Timestamp) ): - raise Exception('Please use a timezoned pandas object for start and end') + raise Exception( + 'Please use a timezoned pandas object for start and end' + ) if start.tzinfo is None or end.tzinfo is None: - raise Exception('Please use a timezoned pandas object for start and end') + raise Exception( + 'Please use a timezoned pandas object for start and end' + ) blocks = year_blocks(start, end) frames = [] @@ -115,7 +125,9 @@ def year_wrapper(*args, start=None, end=None, **kwargs): ) frame = frame.loc[interval_mask] except NoMatchingDataError: - logger.debug(f"NoMatchingDataError: between {_start} and {_end}") + logger.debug( + f"NoMatchingDataError: between {_start} and {_end}" + ) frame = None frames.append(frame) @@ -130,8 +142,8 @@ def year_wrapper(*args, start=None, end=None, **kwargs): def day_limited(func): - """Deals with calls where you cannot query more than a year, by splitting - the call up in blocks per year""" + """Deals with calls where you cannot query more than a year, + by splitting the call up in blocks per year""" @wraps(func) def day_wrapper(*args, start, end, **kwargs): @@ -141,7 +153,9 @@ def day_wrapper(*args, start, end, **kwargs): try: frame = func(*args, start=_start, end=_end, **kwargs) except NoMatchingDataError: - print(f"NoMatchingDataError: between {_start} and {_end}", file=sys.stderr) + logger.debug( + f"NoMatchingDataError: between {_start} and {_end}" + ) frame = None frames.append(frame) From 27f4ccaf44af07747573387b9d7fb64418bfb114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 2 Jan 2024 23:37:03 +0100 Subject: [PATCH 4/4] Change documents_limited deduplication The documents_limited decorator appropriately splits queries that exceed the limit of 
100 documents (as per "Transparency Platform RESTful API - user guide"). These splits occur before the data are tabulated and their later alignment is not straightforward. This commit changes duplicate removal based on index, to picking last valid value for each column within groups based on index. This is not an ideal solution but seems to work for the issues at hand. Firstly - if any duplicated indices are returned by the API then they are dropped invisibly for the user. Secondly - arguably, the splitting and concatenation should happen before tabulation. It would make more sense and be more efficient. --- entsoe/decorators.py | 11 ++++++++++- tests.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 918d0c2..45656ba 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -77,12 +77,21 @@ def documents_wrapper(*args, **kwargs): raise NoMatchingDataError df = pd.concat(frames, sort=True) - df = df.loc[~df.index.duplicated(keep='first')] + # For same indices pick last valid value + if df.index.has_duplicates: + df = df.groupby(df.index).agg(deduplicate_documents_limited) return df return documents_wrapper return decorator +def deduplicate_documents_limited(group): + if group.shape[0] == 1: + return group + else: + return group.ffill().iloc[[-1]] + + def year_limited(func): """Deals with calls where you cannot query more than a year, by splitting the call up in blocks per year""" diff --git a/tests.py b/tests.py index ce1e7c2..4b777ae 100644 --- a/tests.py +++ b/tests.py @@ -156,6 +156,26 @@ def test_year_limited_truncation(self): # and leave the deduplication to the user. 
self.assertEqual(total_hours*2, ts.shape[0]) + def test_documents_limited_truncation(self): + ts = pd.DatetimeIndex( + ["2022-03-01", "2022-03-11", "2022-03-21", "2022-04-01"], + tz="Europe/Berlin" + ) + part_dfs = [] + for i in range(len(ts) - 1): + df = self.client.query_contracted_reserve_prices( + 'DE_LU', start=ts[i], end=ts[i+1], + type_marketagreement_type='A01' + ) + part_dfs.append(df) + df_parts = pd.concat(part_dfs) + df_full = self.client.query_contracted_reserve_prices( + 'DE_LU', start=ts[0], end=ts[-1], + type_marketagreement_type='A01' + ) + self.assertEqual(df_parts.shape, df_full.shape) + self.assertTrue(all(df_parts.isna().sum() == df_full.isna().sum())) + if __name__ == '__main__': unittest.main()