Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fill bad time intervals with fake data #782

Merged
merged 24 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
76b5e56
Add side parameter to find_nearest
matteobachetti Dec 8, 2023
5b50e65
Add function to randomize data in small bad time intervals
matteobachetti Dec 8, 2023
f74b9cc
Add changelog
matteobachetti Dec 8, 2023
b91a8d3
Pay attention to edges
matteobachetti Dec 8, 2023
3e6e4b4
Rename options
matteobachetti Dec 8, 2023
b2e9817
Fix syntax
matteobachetti Dec 8, 2023
1352e9d
Test that only the attributes we want are randomized
matteobachetti Dec 8, 2023
4d444d4
Test case with no btis
matteobachetti Dec 8, 2023
9c52405
Add fake data to docs
matteobachetti Dec 9, 2023
2b2eebb
Fix plotting
matteobachetti Dec 11, 2023
fc1e77c
Fix deprecation in plot
matteobachetti Dec 13, 2023
988dc30
Fix deprecation in plot
matteobachetti Dec 14, 2023
7447456
Fix corner cases and warnings
matteobachetti Dec 14, 2023
11cf78e
Improve docstring
matteobachetti Jan 3, 2024
11c01d8
Test more keywords and more bad intervals
matteobachetti Jan 4, 2024
32b6570
Top-level import when appropriate
matteobachetti Jan 11, 2024
47d99d7
Add warning about applying the technique only to *very* short gaps
matteobachetti Jan 11, 2024
973fcc8
Change uniform to even, to avoid confusion between uniformly distribu…
matteobachetti Jan 11, 2024
8133062
Cleanup
matteobachetti Jan 11, 2024
8f8243d
Further warning
matteobachetti Jan 11, 2024
aac044b
Fix docstring
matteobachetti Jan 11, 2024
ee47f7e
Change buffer default behavior
matteobachetti Jan 11, 2024
51a3369
Improve compatibility with Numpy 2.0
matteobachetti Jan 11, 2024
de2ee3e
Fix rst issue in docstring
matteobachetti Jan 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changes/782.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add function to randomize data in small bad time intervals
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Current Capabilities
* simulating a light curve from another light curve and a 1-d (time) or 2-d (time-energy) impulse response
* simulating an event list from a given light curve _and_ with a given energy spectrum
* Good Time Interval operations
* Filling gaps in light curves with statistically sound fake data

2. Fourier methods
~~~~~~~~~~~~~~~~~~
Expand Down
227 changes: 204 additions & 23 deletions stingray/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,27 @@
from astropy.time import Time, TimeDelta
from astropy.units import Quantity

from stingray.utils import sqsum
from .io import _can_save_longdouble, _can_serialize_meta
from .utils import (
sqsum,
assign_value_if_none,
make_nd_into_arrays,
make_1d_arrays_into_nd,
get_random_state,
find_nearest,
rebin_data,
)
from .gti import (
create_gti_mask,
check_gtis,
cross_two_gtis,
join_gtis,
gti_border_bins,
get_btis,
merge_gtis,
check_separate,
append_gtis,
)

from typing import TYPE_CHECKING, Type, TypeVar, Union

Expand Down Expand Up @@ -478,7 +497,6 @@ def to_pandas(self) -> DataFrame:

"""
from pandas import DataFrame
from .utils import make_nd_into_arrays

data = {}
array_attrs = self.array_attrs() + [self.main_array_attr] + self.internal_array_attrs()
Expand Down Expand Up @@ -515,7 +533,6 @@ def from_pandas(cls: Type[Tso], ts: DataFrame) -> Tso:

"""
import re
from .utils import make_1d_arrays_into_nd

cls = cls()

Expand Down Expand Up @@ -1023,7 +1040,6 @@ def __getitem__(self, index):
ts_new : :class:`StingrayObject` object
The new :class:`StingrayObject` object with the set of selected data.
"""
from .utils import assign_value_if_none

if isinstance(index, (int, np.integer)):
start = index
Expand Down Expand Up @@ -1077,7 +1093,7 @@ class StingrayTimeseries(StingrayObject):

dt: float
The time resolution of the time series. Can be a scalar or an array attribute (useful
for non-uniformly sampled data or events from different instruments)
for non-evenly sampled data or events from different instruments)

mjdref : float
The MJD used as a reference for the time array.
Expand Down Expand Up @@ -1112,7 +1128,7 @@ class StingrayTimeseries(StingrayObject):

dt: float
The time resolution of the measurements. Can be a scalar or an array attribute (useful
for non-uniformly sampled data or events from different instruments)
for non-evenly sampled data or events from different instruments)

mjdref : float
The MJD used as a reference for the time array.
Expand Down Expand Up @@ -1200,8 +1216,6 @@ def gti(self, value):

@property
def mask(self):
from .gti import create_gti_mask

if self._mask is None:
self._mask = create_gti_mask(self.time, self.gti, dt=self.dt)
return self._mask
Expand Down Expand Up @@ -1292,7 +1306,6 @@ def apply_gtis(self, new_gti=None, inplace: bool = True):

"""
# I import here to avoid the risk of circular imports
from .gti import check_gtis, create_gti_mask

if new_gti is None:
new_gti = self.gti
Expand Down Expand Up @@ -1324,7 +1337,6 @@ def split_by_gti(self, gti=None, min_points=2):
list_of_tss : list
A list of :class:`StingrayTimeseries` objects, one for each GTI segment
"""
from .gti import gti_border_bins, create_gti_mask

if gti is None:
gti = self.gti
Expand Down Expand Up @@ -1530,8 +1542,6 @@ def _operation_with_other_obj(
other = other.change_mjdref(self.mjdref)

if not np.array_equal(self.gti, other.gti):
from .gti import cross_two_gtis

warnings.warn(
"The good time intervals in the two time series are different. Data outside the "
"common GTIs will be discarded."
Expand Down Expand Up @@ -1634,8 +1644,6 @@ def __getitem__(self, index):
>>> assert np.allclose(ts[2].counts, [33])
>>> assert np.allclose(ts[:2].counts, [11, 22])
"""
from .utils import assign_value_if_none
from .gti import cross_two_gtis

new_ts = super().__getitem__(index)
step = 1
Expand Down Expand Up @@ -1721,7 +1729,6 @@ def truncate(self, start=0, stop=None, method="index"):

def _truncate_by_index(self, start, stop):
"""Private method for truncation using index values."""
from .gti import cross_two_gtis

new_ts = self.apply_mask(slice(start, stop))

Expand Down Expand Up @@ -1835,7 +1842,6 @@ def _join_timeseries(self, others, strategy="intersection", ignore_meta=[]):
`ts_new` : :class:`StingrayTimeseries` object
The resulting :class:`StingrayTimeseries` object.
"""
from .gti import check_separate, cross_gtis, append_gtis

new_ts = type(self)()

Expand Down Expand Up @@ -1869,8 +1875,6 @@ def _join_timeseries(self, others, strategy="intersection", ignore_meta=[]):

all_objs = [self] + others

from .gti import merge_gtis

# Check if none of the GTIs was already initialized.
all_gti = [obj._gti for obj in all_objs if obj._gti is not None]

Expand Down Expand Up @@ -2039,7 +2043,6 @@ def rebin(self, dt_new=None, f=None, method="sum"):
ts_new: :class:`StingrayTimeseries` object
The :class:`StingrayTimeseries` object with the new, binned time series.
"""
from .utils import rebin_data

if f is None and dt_new is None:
raise ValueError("You need to specify at least one between f and " "dt_new")
Expand Down Expand Up @@ -2134,6 +2137,162 @@ def sort(self, reverse=False, inplace=False):
mask = mask[::-1]
return self.apply_mask(mask, inplace=inplace)

def fill_bad_time_intervals(
    self,
    max_length=None,
    attrs_to_randomize=None,
    buffer_size=None,
    even_sampling=None,
    seed=None,
):
    """Fill short bad time intervals with random data.

    .. warning::
        This method is only appropriate for *very short* bad time intervals. The simulated data
        are basically white noise, so they are able to alter the statistical properties of
        variable data. For very short gaps in the data, the effect of these small
        injections of white noise should be negligible. How short depends on the single case,
        the user is urged not to use the method as a black box and make simulations to measure
        its effect. If you have long bad time intervals, you should use more advanced
        techniques, not currently available in Stingray for this use case, such as Gaussian
        Processes. In particular, please verify that the values of ``max_length`` and
        ``buffer_size`` are adequate to your case.

    To fill the gaps in all but the time points (i.e., flux measures, energies), we take the
    ``buffer_size`` (by default, the largest value between 100 and the estimated samples in
    a ``max_length``-long gap) valid data points closest to the gap and repeat them randomly
    with the same empirical statistical distribution. So, if the `my_fancy_attr` attribute, in
    the 100 points of the buffer, has 30 times 10, 10 times 9, and 60 times 11, there will be
    *on average* 30% of 10, 60% of 11, and 10% of 9 in the simulated data.

    Times are treated differently depending on the fact that the time series is evenly
    sampled or not. If it is not, the times are simulated from a uniform distribution with the
    same count rate found in the buffer. Otherwise, times just follow the same grid used
    inside GTIs. Using the evenly sampled or not is decided based on the ``even_sampling``
    parameter. If left to ``None``, the time series is considered evenly sampled if
    ``self.dt`` is greater than zero and the median separation between subsequent times is
    within 1% of the time resolution.

    Other Parameters
    ----------------
    max_length : float
        Maximum length of a bad time interval to be filled. If None, the criterion is bad
        time intervals shorter than 1/100th of the longest good time interval.
    attrs_to_randomize : list of str, default None
        List of array_attrs to randomize. If ``None``, all array_attrs are randomized.
        It should not include ``time`` and ``_mask``, which are treated separately.
    buffer_size : int, default 100
        Number of good data points to use, on each side of the bad time interval, to
        estimate the empirical distribution the random data are drawn from.
    even_sampling : bool, default None
        Force the treatment of the data as evenly sampled or not. If None, the data are
        considered evenly sampled if ``self.dt`` is larger than zero and the median
        separation between subsequent times is within 1% of ``self.dt``.
    seed : int, default None
        Random seed to use for the simulation. If None, a random seed is generated.

    Returns
    -------
    new_obj : same type as ``self``
        A new object with the bad time intervals (shorter than ``max_length``) filled
        with simulated data, and the GTIs extended accordingly.
    """

    rs = get_random_state(seed)

    if attrs_to_randomize is None:
        attrs_to_randomize = self.array_attrs() + self.internal_array_attrs()
    else:
        # Work on a copy: the ``.remove()`` calls below must not mutate the
        # list object passed in by the caller.
        attrs_to_randomize = list(attrs_to_randomize)
    for attr in ("time", "_mask"):
        if attr in attrs_to_randomize:
            attrs_to_randomize.remove(attr)

    attrs_to_leave_alone = [
        a
        for a in self.array_attrs() + self.internal_array_attrs()
        if a not in attrs_to_randomize
    ]

    if max_length is None:
        max_length = np.max(self.gti[:, 1] - self.gti[:, 0]) / 100

    btis = get_btis(self.gti, self.time[0], self.time[-1])
    if len(btis) == 0:
        logging.info("No bad time intervals to fill")
        return copy.deepcopy(self)
    # All per-gap indices below refer to this masked (good-data-only) array.
    filtered_times = self.time[self.mask]

    new_times = [filtered_times.copy()]
    new_attrs = {}
    mean_data_separation = np.median(np.diff(filtered_times))
    if even_sampling is None:
        # The time series is considered evenly sampled if the median separation between
        # subsequent times is within 1% of the time resolution
        even_sampling = False
        if self.dt > 0 and np.isclose(mean_data_separation, self.dt, rtol=0.01):
            even_sampling = True
        logging.info(f"Data are {'not' if not even_sampling else ''} evenly sampled")

    if even_sampling:
        est_samples_in_gap = int(max_length / self.dt)
    else:
        est_samples_in_gap = int(max_length / mean_data_separation)

    if buffer_size is None:
        buffer_size = max(100, est_samples_in_gap)

    added_gtis = []

    total_filled_time = 0
    for bti in btis:
        length = bti[1] - bti[0]
        if length > max_length:
            continue
        logging.info(f"Filling bad time interval {bti} ({length:.4f} s)")
        # Pad the new GTI slightly so the simulated edge samples fall inside it.
        epsilon = 1e-5 * length
        added_gtis.append([bti[0] - epsilon, bti[1] + epsilon])
        filt_low_t, filt_low_idx = find_nearest(filtered_times, bti[0])
        filt_hig_t, filt_hig_idx = find_nearest(filtered_times, bti[1], side="right")
        # Buffer boundaries in the filtered arrays, clipped at the start of the
        # data. The same indices are reused for every attribute, once each
        # attribute is masked the same way as ``filtered_times``.
        buf_lo_idx = max(filt_low_idx - buffer_size, 0)
        buf_hi_idx = filt_hig_idx + buffer_size
        if even_sampling:
            # Evenly sampled data: keep the sampling grid inside the gap.
            local_new_times = np.arange(bti[0] + self.dt / 2, bti[1], self.dt)
            nevents = local_new_times.size
        else:
            low_time_arr = filtered_times[buf_lo_idx:filt_low_idx]
            high_time_arr = filtered_times[filt_hig_idx:buf_hi_idx]

            # Average count rate on the two sides of the gap. Use the number of
            # samples in each buffer (``size``), not ``np.count_nonzero``: a
            # sample falling exactly at t=0 must still be counted.
            ctrate = (
                low_time_arr.size / (filt_low_t - low_time_arr[0])
                + high_time_arr.size / (high_time_arr[-1] - filt_hig_t)
            ) / 2
            nevents = rs.poisson(ctrate * (bti[1] - bti[0]))
            local_new_times = rs.uniform(bti[0], bti[1], nevents)
        new_times.append(local_new_times)

        for attr in attrs_to_randomize:
            # Mask the attribute like ``filtered_times`` so that
            # ``filt_low_idx``/``filt_hig_idx`` (computed on the filtered time
            # array) select the matching samples, and use the same clipped
            # buffer bounds as the times (the original
            # ``max(buffer_size - filt_low_idx, 0)`` had the operands swapped).
            filtered_attr = getattr(self, attr)[self.mask]
            low_arr = filtered_attr[buf_lo_idx:filt_low_idx]
            high_arr = filtered_attr[filt_hig_idx:buf_hi_idx]
            if attr not in new_attrs:
                new_attrs[attr] = [filtered_attr]
            # Resample the empirical distribution of the buffer values.
            new_attrs[attr].append(rs.choice(np.concatenate([low_arr, high_arr]), nevents))
        for attr in attrs_to_leave_alone:
            if attr not in new_attrs:
                new_attrs[attr] = [getattr(self, attr)[self.mask]]
            if attr == "_mask":
                # Simulated samples are, by construction, "good" data.
                new_attrs[attr].append(np.ones(nevents, dtype=bool))
            else:
                # Attributes we were asked not to randomize get NaN fillers.
                new_attrs[attr].append(np.zeros(nevents) + np.nan)
        total_filled_time += length

    logging.info(f"A total of {total_filled_time} s of data were simulated")

    new_gtis = join_gtis(self.gti, added_gtis)
    new_times = np.concatenate(new_times)
    order = np.argsort(new_times)
    new_obj = type(self)()
    new_obj.time = new_times[order]

    for attr in self.meta_attrs():
        setattr(new_obj, attr, getattr(self, attr))

    for attr, values in new_attrs.items():
        setattr(new_obj, attr, np.concatenate(values)[order])
    new_obj.gti = new_gtis
    return new_obj

def plot(
self,
attr,
Expand All @@ -2145,6 +2304,7 @@ def plot(
save=False,
filename=None,
plot_btis=True,
axis_limits=None,
):
"""
Plot the time series using ``matplotlib``.
Expand All @@ -2168,6 +2328,10 @@ def plot(
could be ``['Time (s)', 'Counts (s^-1)']``
ax : ``matplotlib.pyplot.axis`` object
Axis to be used for plotting. Defaults to creating a new one.
axis_limits : list, tuple, string, default ``None``
Parameter to set axis properties of the ``matplotlib`` figure. For example
it can be a list like ``[xmin, xmax, ymin, ymax]`` or any other
acceptable argument for the ``matplotlib.pyplot.axis()`` method.
title : str, default ``None``
The title of the plot.
marker : str, default '-'
Expand All @@ -2182,17 +2346,23 @@ def plot(
Plot the bad time intervals as red areas on the plot
"""
import matplotlib.pyplot as plt
from .gti import get_btis

if ax is None:
plt.figure(attr)
ax = plt.gca()

if labels is None:
valid_labels = (isinstance(labels, Iterable) and not isinstance(labels, str)) and len(
labels
) == 2
if labels is not None and not valid_labels:
warnings.warn("``labels`` must be an iterable with two labels for x and y axes.")

if labels is None or not valid_labels:
labels = ["Time (s)"] + [attr]

ylabel = labels[1]
xlabel = labels[0]
ylabel = labels[1]
# Default values for labels

ax.plot(self.time, getattr(self, attr), marker, ds="steps-mid", label=attr, zorder=10)

Expand All @@ -2202,11 +2372,15 @@ def plot(
getattr(self, attr),
yerr=getattr(self, attr + "_err"),
fmt="o",
zorder=10,
)

ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)

if axis_limits is not None:
ax.set_xlim(axis_limits[0], axis_limits[1])
ax.set_ylim(axis_limits[2], axis_limits[3])
if title is not None:
ax.set_title(title)

Expand All @@ -2221,7 +2395,14 @@ def plot(
tend = max(self.time[-1] + self.dt / 2, self.gti[-1, 1])
btis = get_btis(self.gti, tstart, tend)
for bti in btis:
plt.axvspan(bti[0], bti[1], alpha=0.5, color="r", zorder=10)
plt.axvspan(
bti[0],
bti[1],
alpha=0.5,
facecolor="r",
zorder=1,
edgecolor="none",
)
return ax


Expand Down
Loading