From 494e386cab90c06db9c1d9be6925d11e76411962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 09:39:05 +0100 Subject: [PATCH 1/4] Fix year_limited deduplication year_limited decorator (which appropriately splits API calls that should be limited to a year per call) used deduplication based on index timestamp. This was probably due to API partial matching (request one day but receive full month) and potential overlap of year blocks at the ends of interval. This commit introduces truncation of each year block to its nominal start and end. Index deduplication is removed. This should remove index duplicates stemming from partial matching and interval overlap but keep the duplicates served by the API (e.g. due to corrections). --- entsoe/decorators.py | 14 +++++++++++++- tests.py | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 1cfeeb1..361a036 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -98,6 +98,19 @@ def year_wrapper(*args, start=None, end=None, **kwargs): for _start, _end in blocks: try: frame = func(*args, start=_start, end=_end, **kwargs) + # Due to partial matching func may return data indexed by + # timestamps outside _start and _end. In order to avoid + # (unintentionally) repeating records, frames are truncated to + # left-open intervals. Additionally, second disjunct forces the + # earliest block to be a closed interval. + # + # If there are repeating records in a single frame (e.g. due + # to corrections) then the result will also have them. 
+ interval_mask = ( + ((frame.index > _start) & (frame.index <= _end)) + | (frame.index == start) + ) + frame = frame.loc[interval_mask] except NoMatchingDataError: logger.debug(f"NoMatchingDataError: between {_start} and {_end}") frame = None @@ -108,7 +121,6 @@ def year_wrapper(*args, start=None, end=None, **kwargs): raise NoMatchingDataError df = pd.concat(frames, sort=True) - df = df.loc[~df.index.duplicated(keep='first')] return df return year_wrapper diff --git a/tests.py b/tests.py index 75c7a00..ce1e7c2 100644 --- a/tests.py +++ b/tests.py @@ -135,6 +135,27 @@ def test_query_procured_balancing_capacity(self): ) self.assertIsInstance(ts, pd.DataFrame) + def test_year_limited_truncation(self): + """ + This is a specific example of polish operator correcting the data + i.e. there was an additional monthly auction for this period. + This results in duplicated time indices. + + source: https://www.pse.pl/web/pse-eng/cross-border-electricity-exchange/auction-office/rzeszow-chmielnicka-interconnection/auction-results # noqa + """ + start = pd.Timestamp('2023-07-17 00:00:00', tz='Europe/Warsaw') + end = pd.Timestamp('2023-08-01 00:00:00', tz='Europe/Warsaw') + ts = self.client.query_offered_capacity( + 'UA_IPS', 'PL', + start=start, end=end, + contract_marketagreement_type='A03', + implicit=False + ) + total_hours = int((end - start).total_seconds()/60/60) + # Expected behaviour is to keep both initial data and corrections + # and leave the deduplication to the user. + self.assertEqual(total_hours*2, ts.shape[0]) + if __name__ == '__main__': unittest.main() From 1e058cd566f92f4fcddd2cf209f008deb687d24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 10:25:35 +0100 Subject: [PATCH 2/4] Fix type checking in year_limited Calls to type and literal comparisons were changed to isinstance as recommended by docs of type built-in. 
--- entsoe/decorators.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 361a036..c85c229 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -88,7 +88,10 @@ def year_wrapper(*args, start=None, end=None, **kwargs): if start is None or end is None: raise Exception('Please specify the start and end date explicity with start= when calling this ' 'function') - if type(start) != pd.Timestamp or type(end) != pd.Timestamp: + if ( + not isinstance(start, pd.Timestamp) + or not isinstance(end, pd.Timestamp) + ): raise Exception('Please use a timezoned pandas object for start and end') if start.tzinfo is None or end.tzinfo is None: raise Exception('Please use a timezoned pandas object for start and end') From 1370246f3aadf24006e49f0ce06167dd28a9e717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 26 Dec 2023 10:43:16 +0100 Subject: [PATCH 3/4] Add code quality fixes for decorators PEP8 compliance, logging and import sorting. 
--- entsoe/decorators.py | 52 ++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index c85c229..918d0c2 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -1,13 +1,13 @@ -import sys +import logging +from functools import wraps from socket import gaierror from time import sleep -import requests -from functools import wraps -from .exceptions import NoMatchingDataError, PaginationError + import pandas as pd -import logging +import requests -from .misc import year_blocks, day_blocks +from .exceptions import NoMatchingDataError, PaginationError +from .misc import day_blocks, year_blocks logger = logging.getLogger(__name__) @@ -24,8 +24,10 @@ def retry_wrapper(*args, **kwargs): result = func(*args, **kwargs) except (requests.ConnectionError, gaierror) as e: error = e - print("Connection Error, retrying in {} seconds".format( - self.retry_delay), file=sys.stderr) + logger.warning( + "Connection Error, " + f"retrying in {self.retry_delay} seconds" + ) sleep(self.retry_delay) continue else: @@ -53,9 +55,11 @@ def pagination_wrapper(*args, start, end, **kwargs): return pagination_wrapper + def documents_limited(n): def decorator(func): - """Deals with calls where you cannot query more than n documents at a time, by offsetting per n documents""" + """Deals with calls where you cannot query more than n documents at a + time, by offsetting per n documents""" @wraps(func) def documents_wrapper(*args, **kwargs): @@ -80,21 +84,27 @@ def documents_wrapper(*args, **kwargs): def year_limited(func): - """Deals with calls where you cannot query more than a year, by splitting - the call up in blocks per year""" + """Deals with calls where you cannot query more than a year, + by splitting the call up in blocks per year""" @wraps(func) def year_wrapper(*args, start=None, end=None, **kwargs): if start is None or end is None: - raise Exception('Please specify the start and end 
date explicity with start= when calling this ' - 'function') + raise Exception( + 'Please specify the start and end date explicity with' + 'start= when calling this function' + ) if ( not isinstance(start, pd.Timestamp) or not isinstance(end, pd.Timestamp) ): - raise Exception('Please use a timezoned pandas object for start and end') + raise Exception( + 'Please use a timezoned pandas object for start and end' + ) if start.tzinfo is None or end.tzinfo is None: - raise Exception('Please use a timezoned pandas object for start and end') + raise Exception( + 'Please use a timezoned pandas object for start and end' + ) blocks = year_blocks(start, end) frames = [] @@ -115,7 +125,9 @@ def year_wrapper(*args, start=None, end=None, **kwargs): ) frame = frame.loc[interval_mask] except NoMatchingDataError: - logger.debug(f"NoMatchingDataError: between {_start} and {_end}") + logger.debug( + f"NoMatchingDataError: between {_start} and {_end}" + ) frame = None frames.append(frame) @@ -130,8 +142,8 @@ def year_wrapper(*args, start=None, end=None, **kwargs): def day_limited(func): - """Deals with calls where you cannot query more than a year, by splitting - the call up in blocks per year""" + """Deals with calls where you cannot query more than a year, + by splitting the call up in blocks per year""" @wraps(func) def day_wrapper(*args, start, end, **kwargs): @@ -141,7 +153,9 @@ def day_wrapper(*args, start, end, **kwargs): try: frame = func(*args, start=_start, end=_end, **kwargs) except NoMatchingDataError: - print(f"NoMatchingDataError: between {_start} and {_end}", file=sys.stderr) + logger.debug( + f"NoMatchingDataError: between {_start} and {_end}" + ) frame = None frames.append(frame) From 27f4ccaf44af07747573387b9d7fb64418bfb114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poche=C4=87?= Date: Tue, 2 Jan 2024 23:37:03 +0100 Subject: [PATCH 4/4] Change documents_limited deduplication The documents_limited decorator appropriately splits queries that exceed the limit of 
100 documents (as per "Transparency Platform RESTful API - user guide"). These splits occur before the data are tabulated and their later alignment is not straightforward. This commit changes duplicate removal based on index, to picking last valid value for each column within groups based on index. This is not an ideal solution but seems to work for the issues at hand. Firstly - if any duplicated indices are returned by the API then they are dropped invisibly for the user. Secondly - arguably, the splitting and concatenation should happen before tabulation. It would make more sense and be more efficient. --- entsoe/decorators.py | 11 ++++++++++- tests.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/entsoe/decorators.py b/entsoe/decorators.py index 918d0c2..45656ba 100644 --- a/entsoe/decorators.py +++ b/entsoe/decorators.py @@ -77,12 +77,21 @@ def documents_wrapper(*args, **kwargs): raise NoMatchingDataError df = pd.concat(frames, sort=True) - df = df.loc[~df.index.duplicated(keep='first')] + # For same indices pick last valid value + if df.index.has_duplicates: + df = df.groupby(df.index).agg(deduplicate_documents_limited) return df return documents_wrapper return decorator +def deduplicate_documents_limited(group): + if group.shape[0] == 1: + return group + else: + return group.ffill().iloc[[-1]] + + def year_limited(func): """Deals with calls where you cannot query more than a year, by splitting the call up in blocks per year""" diff --git a/tests.py b/tests.py index ce1e7c2..4b777ae 100644 --- a/tests.py +++ b/tests.py @@ -156,6 +156,26 @@ def test_year_limited_truncation(self): # and leave the deduplication to the user. 
self.assertEqual(total_hours*2, ts.shape[0]) + def test_documents_limited_truncation(self): + ts = pd.DatetimeIndex( + ["2022-03-01", "2022-03-11", "2022-03-21", "2022-04-01"], + tz="Europe/Berlin" + ) + part_dfs = [] + for i in range(len(ts) - 1): + df = self.client.query_contracted_reserve_prices( + 'DE_LU', start=ts[i], end=ts[i+1], + type_marketagreement_type='A01' + ) + part_dfs.append(df) + df_parts = pd.concat(part_dfs) + df_full = self.client.query_contracted_reserve_prices( + 'DE_LU', start=ts[0], end=ts[-1], + type_marketagreement_type='A01' + ) + self.assertEqual(df_parts.shape, df_full.shape) + self.assertTrue(all(df_parts.isna().sum() == df_full.isna().sum())) + if __name__ == '__main__': unittest.main()