Skip to content

Commit

Permalink
Change documents_limited deduplication
Browse files Browse the repository at this point in the history
The documents_limited decorator appropriately splits queries that exceed
the limit of 100 documents (as per "Transparency Platform RESTful API -
user guide"). These splits occur before the data are tabulated and their
later alignment is not straightforward.

This commit replaces index-based duplicate removal with picking the last
valid value for each column within each group of rows that share an
index. This is not an ideal solution, but it seems to work for the issues
at hand. It has two drawbacks. Firstly, if the API itself returns any
duplicated indices, they are dropped silently, without the user noticing.
Secondly, the splitting and concatenation should arguably happen before
tabulation; that would make more sense and be more efficient.
  • Loading branch information
pee-po committed Jan 2, 2024
1 parent 1370246 commit 27f4cca
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
11 changes: 10 additions & 1 deletion entsoe/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,21 @@ def documents_wrapper(*args, **kwargs):
raise NoMatchingDataError

df = pd.concat(frames, sort=True)
df = df.loc[~df.index.duplicated(keep='first')]
# For same indices pick last valid value
if df.index.has_duplicates:
df = df.groupby(df.index).agg(deduplicate_documents_limited)
return df
return documents_wrapper
return decorator


def deduplicate_documents_limited(group):
    """Collapse a group of rows sharing one index value into a single row.

    Passed as the aggregation callback to ``df.groupby(df.index).agg(...)``
    in ``documents_limited``.

    ``group`` is a pandas Series or DataFrame (anything offering ``shape``,
    ``ffill`` and ``iloc``). A group with zero or one row is returned
    unchanged. Otherwise the result is a one-row object of the same type,
    labelled like the last row of ``group``, holding the last non-missing
    value of each column (or a missing value if the column has none).
    """
    # Nothing to merge for zero or one row. Returning early also avoids
    # calling iloc[[-1]] on an empty object, which raises IndexError.
    if group.shape[0] <= 1:
        return group
    # Forward-filling carries the most recent valid value down to the last
    # row, so that row alone holds the "last valid value" of every column.
    return group.ffill().iloc[[-1]]


def year_limited(func):
"""Deals with calls where you cannot query more than a year,
by splitting the call up in blocks per year"""
Expand Down
20 changes: 20 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,26 @@ def test_year_limited_truncation(self):
# and leave the deduplication to the user.
self.assertEqual(total_hours*2, ts.shape[0])

def test_documents_limited_truncation(self):
    """Compare one long query against the same range fetched piecewise.

    Contracted reserve prices for 'DE_LU' are requested once per pair of
    consecutive boundaries and once for the whole range. The concatenated
    pieces must have the same shape and the same per-column count of
    missing values as the single full-range result.
    """
    boundaries = pd.DatetimeIndex(
        ["2022-03-01", "2022-03-11", "2022-03-21", "2022-04-01"],
        tz="Europe/Berlin"
    )

    # One request per window between neighbouring boundaries.
    windows = zip(boundaries[:-1], boundaries[1:])
    pieces = [
        self.client.query_contracted_reserve_prices(
            'DE_LU', start=window_start, end=window_end,
            type_marketagreement_type='A01'
        )
        for window_start, window_end in windows
    ]
    df_parts = pd.concat(pieces)

    # The same range fetched in a single call.
    df_full = self.client.query_contracted_reserve_prices(
        'DE_LU', start=boundaries[0], end=boundaries[-1],
        type_marketagreement_type='A01'
    )

    self.assertEqual(df_parts.shape, df_full.shape)
    nan_counts_match = df_parts.isna().sum() == df_full.isna().sum()
    self.assertTrue(all(nan_counts_match))


# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()

0 comments on commit 27f4cca

Please sign in to comment.