Change documents_limited deduplication

The documents_limited decorator appropriately splits queries that exceed the limit of 100 documents (as per "Transparency Platform RESTful API - user guide"). These splits occur before the data are tabulated, and aligning the resulting frames afterwards is not straightforward. This commit changes deduplication from dropping rows with a duplicated index to picking, within each group of rows that share an index, the last valid value for each column. This is not an ideal solution, but it seems to work for the issues at hand. Firstly, if any duplicated indices are returned by the API itself, they are dropped invisibly to the user. Secondly, the splitting and concatenation should arguably happen before tabulation; that would make more sense and be more efficient.
EnergieID · Jan 2, 2024 · 27f4cca · 27f4cca
1 parent 1370246
commit 27f4cca
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 1 deletion.
diff --git a/entsoe/decorators.py b/entsoe/decorators.py
@@ -77,12 +77,21 @@ def documents_wrapper(*args, **kwargs):
                 raise NoMatchingDataError
 
             df = pd.concat(frames, sort=True)
-            df = df.loc[~df.index.duplicated(keep='first')]
+            # For same indices pick last valid value
+            if df.index.has_duplicates:
+                df = df.groupby(df.index).agg(deduplicate_documents_limited)
             return df
         return documents_wrapper
     return decorator
 
 
+def deduplicate_documents_limited(group):
+    if group.shape[0] == 1:
+        return group
+    else:
+        return group.ffill().iloc[[-1]]
+
+
 def year_limited(func):
     """Deals with calls where you cannot query more than a year,
     by splitting the call up in blocks per year"""

diff --git a/tests.py b/tests.py
@@ -156,6 +156,26 @@ def test_year_limited_truncation(self):
         # and leave the deduplication to the user.
         self.assertEqual(total_hours*2, ts.shape[0])
 
+    def test_documents_limited_truncation(self):
+        ts = pd.DatetimeIndex(
+            ["2022-03-01", "2022-03-11", "2022-03-21", "2022-04-01"],
+            tz="Europe/Berlin"
+        )
+        part_dfs = []
+        for i in range(len(ts) - 1):
+            df = self.client.query_contracted_reserve_prices(
+                'DE_LU', start=ts[i], end=ts[i+1],
+                type_marketagreement_type='A01'
+            )
+            part_dfs.append(df)
+        df_parts = pd.concat(part_dfs)
+        df_full = self.client.query_contracted_reserve_prices(
+            'DE_LU', start=ts[0], end=ts[-1],
+            type_marketagreement_type='A01'
+        )
+        self.assertEqual(df_parts.shape, df_full.shape)
+        self.assertTrue(all(df_parts.isna().sum() == df_full.isna().sum()))
+
 
 if __name__ == '__main__':
     unittest.main()