Skip to content

Commit

Permalink
Prune cache_eclsum from fmu-ensemble
Browse files Browse the repository at this point in the history
  • Loading branch information
berland committed Feb 9, 2021
1 parent 9830f63 commit 243b50d
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 134 deletions.
23 changes: 2 additions & 21 deletions src/fmu/ensemble/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,6 @@ def load_smry(
time_index="raw",
column_keys=None,
stacked=True,
cache_eclsum=True,
start_date=None,
end_date=None,
include_restart=True,
Expand Down Expand Up @@ -742,9 +741,6 @@ def load_smry(
by vector name, and with realization index as columns.
This only works when time_index is the same for all
realizations. Not implemented yet!
cache_eclsum (boolean): Boolean for whether we should cache the EclSum
objects. Set to False if you cannot keep all EclSum files in
memory simultaneously
start_date (str or date): First date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand Down Expand Up @@ -772,7 +768,6 @@ def load_smry(
realization.load_smry(
time_index=time_index,
column_keys=column_keys,
cache_eclsum=cache_eclsum,
start_date=start_date,
end_date=end_date,
include_restart=include_restart,
Expand Down Expand Up @@ -963,7 +958,6 @@ def get_smry_dates(
normalize=True,
start_date=None,
end_date=None,
cache_eclsum=True,
include_restart=True,
):
"""Return list of datetimes for an ensemble according to frequency
Expand Down Expand Up @@ -999,13 +993,9 @@ def get_smry_dates(
# Build list of list of eclsum dates
eclsumsdates = []
for _, realization in self.realizations.items():
if realization.get_eclsum(
cache=cache_eclsum, include_restart=include_restart
):
if realization.get_eclsum(include_restart=include_restart):
eclsumsdates.append(
realization.get_eclsum(
cache=cache_eclsum, include_restart=include_restart
).dates
realization.get_eclsum(include_restart=include_restart).dates
)
return unionize_smry_dates(eclsumsdates, freq, normalize, start_date, end_date)

Expand All @@ -1014,7 +1004,6 @@ def get_smry_stats(
column_keys=None,
time_index="monthly",
quantiles=None,
cache_eclsum=True,
start_date=None,
end_date=None,
):
Expand All @@ -1037,8 +1026,6 @@ def get_smry_stats(
to compute. Quantiles refer to scientific standard, which
is opposite to the oil industry convention.
Ask for p10 if you need the oil industry p90.
cache_eclsum: boolean for whether to keep the loaded EclSum
object in memory after data has been loaded.
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand Down Expand Up @@ -1070,7 +1057,6 @@ def get_smry_stats(
dframe = self.get_smry(
time_index=time_index,
column_keys=column_keys,
cache_eclsum=cache_eclsum,
start_date=start_date,
end_date=end_date,
)
Expand Down Expand Up @@ -1323,7 +1309,6 @@ def get_smry(
self,
time_index=None,
column_keys=None,
cache_eclsum=True,
start_date=None,
end_date=None,
include_restart=True,
Expand All @@ -1342,9 +1327,6 @@ def get_smry(
a wanted frequencey for dates, daily, weekly, monthly, yearly,
that will be send to get_smry_dates()
column_keys: list of column key wildcards
cache_eclsum: boolean for whether to cache the EclSum
objects. Defaults to True. Set to False if
not enough memory to keep all summary files in memory.
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand Down Expand Up @@ -1379,7 +1361,6 @@ def get_smry(
dframe = realization.get_smry(
time_index=time_index,
column_keys=column_keys,
cache_eclsum=cache_eclsum,
include_restart=include_restart,
)
dframe.insert(0, "REAL", index)
Expand Down
23 changes: 2 additions & 21 deletions src/fmu/ensemble/ensembleset.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,6 @@ def load_smry(
self,
time_index="raw",
column_keys=None,
cache_eclsum=True,
start_date=None,
end_date=None,
):
Expand All @@ -595,9 +594,6 @@ def load_smry(
If a string is supplied, that string is attempted used
via get_smry_dates() in order to obtain a time index.
column_keys: list of column key wildcards
cache_eclsum: Boolean for whether we should cache the EclSum
objects. Set to False if you cannot keep all EclSum files in
memory simultaneously
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand All @@ -616,7 +612,6 @@ def load_smry(
ensemble.load_smry(
time_index=time_index,
column_keys=column_keys,
cache_eclsum=cache_eclsum,
start_date=start_date,
end_date=end_date,
)
Expand All @@ -630,7 +625,6 @@ def get_smry(
self,
time_index=None,
column_keys=None,
cache_eclsum=False,
start_date=None,
end_date=None,
):
Expand All @@ -646,11 +640,6 @@ def get_smry(
If a string is supplied, that string is attempted used
via get_smry_dates() in order to obtain a time index.
column_keys: list of column key wildcards
cache_eclsum: boolean for whether to cache the EclSum
objects. Defaults to False. Set to True if
there is enough memory to keep all realizations summary
files in memory at once. This will speed up subsequent
operations
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand All @@ -666,18 +655,14 @@ def get_smry(
"""
smrylist = []
for _, ensemble in self._ensembles.items():
smry = ensemble.get_smry(
time_index, column_keys, cache_eclsum, start_date, end_date
)
smry = ensemble.get_smry(time_index, column_keys, start_date, end_date)
smry.insert(0, "ENSEMBLE", ensemble.name)
smrylist.append(smry)
if smrylist:
return pd.concat(smrylist, sort=False)
return pd.DataFrame()

def get_smry_dates(
self, freq="monthly", cache_eclsum=True, start_date=None, end_date=None
):
def get_smry_dates(self, freq="monthly", start_date=None, end_date=None):
"""Return list of datetimes from an ensembleset
Datetimes from each realization in each ensemble can
Expand All @@ -689,9 +674,6 @@ def get_smry_dates(
yield the sorted union of all valid timesteps for
all realizations. Other valid options are
'daily', 'monthly' and 'yearly'.
cache_eclsum: Boolean for whether we should cache the EclSum
objects. Set to False if you cannot keep all EclSum files in
memory simultaneously
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand All @@ -709,7 +691,6 @@ def get_smry_dates(
rawdates = rawdates.union(
ensemble.get_smry_dates(
freq="report",
cache_eclsum=cache_eclsum,
start_date=start_date,
end_date=end_date,
)
Expand Down
50 changes: 8 additions & 42 deletions src/fmu/ensemble/realization.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,6 @@ def __init__(
)
self.eclfiles = None # ecl2df.EclFiles object

self._eclsum = None # Placeholder for caching
self._eclsum_include_restart = None # Flag for cached object

# The datastore for internalized data. Dictionary
# indexed by filenames (local to the realization).
# values in the dictionary can be either dicts or dataframes
Expand Down Expand Up @@ -894,7 +891,7 @@ def get_eclfiles(self):
return None
return ecl2df.EclFiles(data_filename)

def get_eclsum(self, cache=True, include_restart=True):
def get_eclsum(self, include_restart=True):
"""
Fetch the Eclipse Summary file from the realization
and return as a libecl EclSum object
Expand All @@ -908,20 +905,13 @@ def get_eclsum(self, cache=True, include_restart=True):
turning off autodiscovery is strongly recommended.
Arguments:
cache: boolean indicating whether we should keep an
object reference to the EclSum object. Set to
false if you need to conserve memory.
include_restart: boolean sent to libecl for whether restart
files should be traversed.
Returns:
EclSum: object representing the summary file. None if
nothing was found.
"""
if cache and self._eclsum: # Return cached object if available
if self._eclsum_include_restart == include_restart:
return self._eclsum

unsmry_file_row = self.files[self.files.FILETYPE == "UNSMRY"]
unsmry_filename = None
if len(unsmry_file_row) == 1:
Expand Down Expand Up @@ -952,11 +942,6 @@ def get_eclsum(self, cache=True, include_restart=True):
# or if SMSPEC is missing.
logger.warning("Failed to create summary instance from %s", unsmry_filename)
return None

if cache:
self._eclsum = eclsum
self._eclsum_include_restart = include_restart

return eclsum

def load_smry(self, **kwargs):
Expand Down Expand Up @@ -994,7 +979,6 @@ def get_smry(
self,
time_index=None,
column_keys=None,
cache_eclsum=True,
start_date=None,
end_date=None,
include_restart=True,
Expand All @@ -1013,8 +997,6 @@ def get_smry(
to these. If a date in ISO-8601 format is supplied, that is
used as a single date.
column_keys: list of column key wildcards. None means everything.
cache_eclsum: boolean for whether to keep the loaded EclSum
object in memory after data has been loaded.
start_date: str or date with first date to include.
Dates prior to this date will be dropped, supplied
start_date will always be included. Overridden if time_index
Expand All @@ -1032,7 +1014,7 @@ def get_smry(
if self.get_eclfiles() is None:
return pd.DataFrame()
try:
dframe = ecl2df.summary.df(
return ecl2df.summary.df(
self.get_eclfiles(),
time_index=time_index,
column_keys=column_keys,
Expand All @@ -1043,17 +1025,6 @@ def get_smry(
paramfile=None,
datetime=datetimeindex,
)
if cache_eclsum:
if self.get_eclfiles():
# This is necessary for tests to pass, but might not
# be the way to do it since ecl2df should take full
# responsibility for the eclsum objects.
self._eclsum = self.get_eclfiles().get_eclsum()
else:
# Do this to ensure that we cut the rope to the EclSum object
# Can be critical for garbage collection
self._eclsum = None
return dframe
except OSError:
# Missing or bogus UNSMRY file
return pd.DataFrame()
Expand Down Expand Up @@ -1121,7 +1092,7 @@ def _glob_smry_keys(self, column_keys):
keys = set()
for key in column_keys:
if isinstance(key, str):
keys = keys.union(set(self._eclsum.keys(key)))
keys = keys.union(set(self.get_eclsum().keys(key)))
return list(keys)

def get_volumetric_rates(self, column_keys=None, time_index=None, time_unit=None):
Expand All @@ -1142,24 +1113,19 @@ def get_smryvalues(self, props_wildcard=None):
a dataframe with values. Raw times from UNSMRY.
Empty dataframe if no summary file data available
"""
if not self._eclsum: # check if it is cached
self.get_eclsum()

if not self._eclsum:
return pd.DataFrame()

props = self._glob_smry_keys(props_wildcard)

if "numpy_vector" in dir(self._eclsum):
if "numpy_vector" in dir(self.get_eclsum()):
data = {
prop: self._eclsum.numpy_vector(prop, report_only=False)
prop: self.get_eclsum().numpy_vector(prop, report_only=False)
for prop in props
}
else: # get_values() is deprecated in newer libecl
data = {
prop: self._eclsum.get_values(prop, report_only=False) for prop in props
prop: self.get_eclsum().get_values(prop, report_only=False)
for prop in props
}
dates = self._eclsum.get_dates(report_only=False)
dates = self.get_eclsum().get_dates(report_only=False)
return pd.DataFrame(data=data, index=dates)

def get_smry_dates(
Expand Down
50 changes: 0 additions & 50 deletions tests/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,56 +828,6 @@ def test_nonexisting():
assert not nopermission


def test_eclsumcaching():
"""Test caching of eclsum"""

if "__file__" in globals():
# Easen up copying test code into interactive sessions
testdir = os.path.dirname(os.path.abspath(__file__))
else:
testdir = os.path.abspath(".")

dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
ens = ScratchEnsemble("reektest", dirs)

# The problem here is if you load in a lot of UNSMRY files
# and the Python process keeps them in memory. Not sure
# how to check in code that an object has been garbage collected
# but for garbage collection to work, at least the realization
# _eclsum variable must be None.

ens.load_smry()
# Default is to do caching, so these will not be None:
assert all([x._eclsum for (idx, x) in ens.realizations.items()])

# If we redo this operation, the same objects should all
# be None afterwards:
ens.load_smry(cache_eclsum=None)
assert not any([x._eclsum for (idx, x) in ens.realizations.items()])

ens.get_smry()
assert all([x._eclsum for (idx, x) in ens.realizations.items()])

ens.get_smry(cache_eclsum=False)
assert not any([x._eclsum for (idx, x) in ens.realizations.items()])

ens.get_smry_stats()
assert all([x._eclsum for (idx, x) in ens.realizations.items()])

ens.get_smry_stats(cache_eclsum=False)
assert not any([x._eclsum for (idx, x) in ens.realizations.items()])

ens.get_smry_dates()
assert all([x._eclsum for (idx, x) in ens.realizations.items()])

# Clear the cached objects because the statement above has cached it..
for _, realization in ens.realizations.items():
realization._eclsum = None

ens.get_smry_dates(cache_eclsum=False)
assert not any([x._eclsum for (idx, x) in ens.realizations.items()])


def test_filedescriptors():
"""Test how filedescriptors are used.
Expand Down

0 comments on commit 243b50d

Please sign in to comment.