updated epoching and tutorials

ncc-brain · Sep 25, 2024 · b6a59bc · b6a59bc
1 parent 44d5e35
commit b6a59bc
Show file tree

Hide file tree

Showing 2 changed files with 416 additions and 5,179 deletions.
diff --git a/pyneon/preprocess/epoch.py b/pyneon/preprocess/epoch.py
@@ -9,16 +9,59 @@
 import pandas as pd
 
 class Epoch:
-    def __init__(self, data: pd.DataFrame, times: pd.DataFrame):
+    """
+    Class to create and manage epochs in the data streams.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Data stream to create epochs from. Must contain a 'timestamp [ns]' or 'start timestamp [ns]' column.
+    times_df : pd.DataFrame, optional
+        DataFrame containing epoch information with the following columns:
+        - 't_ref': Reference time of the epoch, in nanoseconds.
+        - 't_before': Time before the reference time to start the epoch, in nanoseconds.
+        - 't_after': Time after the reference time to end the epoch, in nanoseconds.
+        - 'description': Description or label associated with the epoch.
+        If provided, `t_ref`, `t_before`, `t_after`, `description`, `global_t_ref`, and `time_unit` are ignored.
+    t_ref : np.ndarray or list, optional
+        Array or list of reference times for the epochs. Units specified by `time_unit`.
+    t_before : float, np.ndarray, or list, optional
+        Time before the reference time to start the epoch, in **seconds**.
+    t_after : float, np.ndarray, or list, optional
+        Time after the reference time to end the epoch, in **seconds**.
+    description : str, np.ndarray, or list, optional
+        Description or label associated with the epoch.
+    global_t_ref : int or float, optional
+        Global reference time to be added to each reference time in `t_ref`. Units specified by `time_unit`. Default is 0.
+    time_unit : str, optional
+        Unit of time for the reference times and `global_t_ref` ('ns' for nanoseconds or 's' for seconds). Default is 'ns'.
+
+    Notes
+    -----
+    - If `times_df` is provided, it is used to create epochs, and the other time-related parameters are ignored.
+    - If `times_df` is not provided, `t_ref`, `t_before`, `t_after`, and `description` must be provided.
+    - The `t_before` and `t_after` parameters are always expected in **seconds** and will be converted to nanoseconds internally.
+    """
+
+    def __init__(self, data: pd.DataFrame,
+    times_df: Union[pd.DataFrame, None] = None,
+    t_ref: Union[np.ndarray, None] = None,
+    t_before: Union[np.ndarray, Number, None] = None,
+    t_after: Union[np.ndarray, Number, None] = None,
+    description: Union[np.ndarray, None] = None,
+    global_t_ref: Union[int, float] = 0,
+    time_unit: str = "ns",
+    ):
+
         self.data = data
-        self.times = times
+        self.times = times_df
 
         # Check if data is uniformly sampled
         ts_diff = data["timestamp [ns]"].diff().dropna().unique()
         self.uniform_data = len(ts_diff) == 1
 
         # Create epochs
-        self.epochs, self.data = create_epoch(data, times)
+        self.epochs, self.data = create_epoch(data, times_df, t_ref, t_before, t_after, description, global_t_ref, time_unit)
 
         # Check epoch lengths
         data_len = self.epochs["epoch data"].apply(lambda x: x.shape[0])
@@ -37,11 +80,32 @@ def __init__(self, data: pd.DataFrame, times: pd.DataFrame):
         if self.equal_length:
             self.window_length = self.epochs["t_before"].iloc[0] + self.epochs["t_after"].iloc[0]
 
-
-    def to_numpy(self, sampling_rate=100):
+    def to_numpy(self, sampling_rate=100, columns=None):
         """
         Converts epochs into a NumPy array with dimensions (n_epochs, n_times, n_channels).
         Resamples epochs to a fixed sampling rate.
+
+        Parameters
+        ----------
+        sampling_rate : int
+            The sampling rate to resample the data to, in **Hz** (samples per second).
+        columns : list of str, optional
+            List of column names to extract from the DataFrame. If None, all columns except 't_rel' are used.
+
+        Returns
+        -------
+        epochs_np : np.ndarray
+            NumPy array of shape (n_epochs, n_times, n_channels).
+        info : dict
+            A dictionary containing:
+            - 'column_ids': List of provided column names.
+            - 't_rel': The common time grid, in nanoseconds.
+            - 'nan_status': String indicating whether NaN values were found in the data.
+
+        Notes
+        -----
+        - The time grid (`t_rel`) is in nanoseconds.
+        - If `NaN` values are present after interpolation, they are noted in `nan_status`.
         """
         # Ensure there are epochs to process
         if len(self.epochs) == 0:
@@ -55,11 +119,20 @@ def to_numpy(self, sampling_rate=100):
         t_before = self.epochs['t_before'].iloc[0]
         t_after = self.epochs['t_after'].iloc[0]
         total_duration = t_after + t_before
-        n_times = int(total_duration /1e9 * sampling_rate) + 1
+        n_times = int(total_duration / 1e9 * sampling_rate) + 1
         common_times = np.linspace(-t_before, t_after, n_times)
 
-        # Assume all epochs have the same data columns (excluding 't_rel')
-        data_columns = self.epochs.iloc[0]['epoch data'].columns.drop('t_rel')
+        # Select the relevant data columns
+        if columns is None:
+            # If no columns are provided, use all columns except 't_rel'
+            data_columns = self.epochs.iloc[0]['epoch data'].columns.drop('t_rel')
+        else:
+            # Use the explicitly provided columns
+            data_columns = [col for col in columns if col in self.epochs.iloc[0]['epoch data'].columns]
+
+        if len(data_columns) == 0:
+            raise ValueError("None of the provided columns exist in the epoch data.")
+
         n_channels = len(data_columns)
 
         # Initialize the NumPy array
@@ -81,108 +154,158 @@ def to_numpy(self, sampling_rate=100):
                 )
                 epochs_np[i, :, idx] = interp_values
 
-        return epochs_np
+
+        # check if there are any NaN values in the data
+        nan_flag = np.isnan(epochs_np).any()
+        if nan_flag:
+            nan_text = "NaN values were found in the data."
+        else:
+            nan_text = "No NaN values were found in the data."
+
+        # Return an object holding the column ids, times, and data
+        info = {
+            'column_ids': data_columns,
+            't_rel': common_times,
+            'nan_status': nan_text
+        }
+        print(nan_text)
+
+        return epochs_np, info
+
 
 
     def __len__(self):
         return len(self.epochs)
 
 
-
 def create_epoch(
     data: pd.DataFrame,
     times_df: Union[pd.DataFrame, None] = None,
-    t_ref: Union[np.ndarray, None] = None,
-    t_before: Union[np.ndarray, Number, None] = None,
-    t_after: Union[np.ndarray, Number, None] = None,
-    description: Union[np.ndarray, None] = None,
-    ):
+    t_refs: Union[list, np.ndarray, None] = None,
+    t_before: Union[np.ndarray, float, None] = None,
+    t_after: Union[np.ndarray, float, None] = None,
+    description: Union[np.ndarray, str, None] = None,
+    global_t_ref: Union[int, float] = 0,
+    time_unit: str = "ns",
+):
     """
-    Create epochs in the data stream(s) based on the input epochs dataframe.
+    Create epochs in the data streams based on the input epochs DataFrame or provided times.
 
     Parameters
     ----------
     data : pd.DataFrame
-        Data stream to create epochs from. Must contain a 'timestamp [ns]' column.
-    times : pd.DataFrame
-        DataFrame containing the epochs information with the following columns:
-        - 't_ref': Reference time of the epoch in seconds.
-        - 't_before': Time before the reference time to start the epoch, in seconds.
-        - 't_after': Time after the reference time to end the epoch, in seconds.
-        - 'description': Message or label associated with the epoch.
+        Data stream to create epochs from. Must contain a 'timestamp [ns]' or 'start timestamp [ns]' column.
+    times_df : pd.DataFrame, optional
+        DataFrame containing epoch information with the following columns:
+        - 't_ref': Reference time of the epoch, in nanoseconds.
+        - 't_before': Time before the reference time to start the epoch, in nanoseconds.
+        - 't_after': Time after the reference time to end the epoch, in nanoseconds.
+        - 'description': Description or label associated with the epoch.
+        If provided, other time-related parameters are ignored.
+    t_refs : list or np.ndarray, optional
+        List or array of reference times for the epochs. Units specified by `time_unit`.
+    t_before : float, np.ndarray, or list, optional
+        Time before the reference time to start the epoch, in **seconds**.
+    t_after : float, np.ndarray, or list, optional
+        Time after the reference time to end the epoch, in **seconds**.
+    description : str, np.ndarray, or list, optional
+        Description or label associated with the epoch.
+    global_t_ref : int or float, optional
+        Global reference time to be added to each reference time in `t_refs`. Units specified by `time_unit`. Default is 0.
+    time_unit : str, optional
+        Unit of time for the reference times and `global_t_ref` ('ns' for nanoseconds or 's' for seconds). Default is 'ns'.
 
     Returns
     -------
-    data : pd.DataFrame
-        Data stream with appended ``epoch id``, ``'t_rel'``, and ``description``.
-    epochs_data : pd.DataFrame
+    epochs : pd.DataFrame
         DataFrame where each row corresponds to an epoch, containing the data belonging to the epoch as a nested DataFrame.
+        Columns include:
+        - 'epoch id': Unique identifier for the epoch.
+        - 't_ref': Reference time of the epoch, in nanoseconds.
+        - 't_before': Time before the reference time to start the epoch, in nanoseconds.
+        - 't_after': Time after the reference time to end the epoch, in nanoseconds.
+        - 'description': Description or label associated with the epoch.
+        - 'epoch data': DataFrame containing the data within the epoch.
+    annotated_data : pd.DataFrame
+        Original data with added columns:
+        - 'epoch id': Identifier of the epoch to which the data point belongs.
+        - 't_rel': Time relative to the epoch reference time, in nanoseconds.
+        - 'description': Description or label associated with the epoch.
+
+    Notes
+    -----
+    - If `times_df` is provided, it is used to create epochs, and other time-related parameters are ignored.
+    - If `times_df` is not provided, `t_refs`, `t_before`, `t_after`, and `description` must be provided.
+    - The `t_before` and `t_after` parameters are always expected in **seconds** and will be converted to nanoseconds internally.
     """
 
+    # Determine the timestamp column name
     if "timestamp [ns]" in data.columns:
         ts_name = "timestamp [ns]"
     elif "start timestamp [ns]" in data.columns:
         ts_name = "start timestamp [ns]"
     else:
         raise ValueError("Data must contain a 'timestamp [ns]' or 'start timestamp [ns]' column.")
-
+
+    # Generate event_times DataFrame
     if times_df is not None:
         # Ensure the DataFrame has the required columns
         if not all(col in times_df.columns for col in ['t_ref', 't_before', 't_after', 'description']):
-            raise ValueError("DataFrame must contain 't_ref', 't_before', 't_after', and 'description' columns")
-        # Extract the columns from the DataFrame
-        t_ref = times_df['t_ref'].to_numpy()
-        t_before = times_df['t_before'].to_numpy()
-        t_after = times_df['t_after'].to_numpy()
-        description = times_df['description'].to_numpy()
+            raise ValueError("times_df must contain 't_ref', 't_before', 't_after', and 'description' columns")
+        event_times = times_df
     else:
         # Ensure the input arrays are not None
-        if any(x is None for x in [t_ref, t_before, t_after, description]):
-            raise ValueError("t_ref, t_before, t_after, and description must be provided if times_df is None")
-        n_epoch = len(t_ref)
-        other_info = []
-        for x in [t_before, t_after, description]:
-            # If an array is provided, ensure it has the same length as t_ref
-            if isinstance(x, np.ndarray):
-                if len(x) != n_epoch:
-                    raise ValueError("If a numpy array is provided, it must have the same length as t_ref")
-                other_info.append(x)
-            # If a single value is provided, populate the list with the same value for each epoch
-            else:
-                other_info.append(np.repeat(x, n_epoch))
-        t_before, t_after, description = other_info
+        if any(x is None for x in [t_refs, t_before, t_after, description]):
+            raise ValueError("t_refs, t_before, t_after, and description must be provided if times_df is None")
+        # Use construct_event_times to create the event_times DataFrame
+        event_times = construct_event_times(
+            t_refs=t_refs,
+            t_before=t_before,
+            t_after=t_after,
+            description=description,
+            global_t_ref=global_t_ref,
+            time_unit=time_unit,
+        )
 
     # Initialize lists to collect data
     annotated_data = data.copy()
     epochs = pd.DataFrame(
         columns=["epoch id", "t_ref", "t_before", "t_after", "description", "epoch data"]
     )
-
-    for i, (t_ref_i, t_before_i, t_after_i, description_i) in enumerate(zip(t_ref, t_before, t_after, description)):
+
+    # Iterate over each event time to create epochs
+    for i, row in event_times.iterrows():
+        t_ref_i = row['t_ref']
+        t_before_i = row['t_before']
+        t_after_i = row['t_after']
+        description_i = row['description']
+
         start_time = t_ref_i - t_before_i
         end_time = t_ref_i + t_after_i
         mask = (data[ts_name] >= start_time) & (data[ts_name] <= end_time)
-        if mask.empty:
+
+        if not mask.any():
             continue
+
         annotated_data.loc[mask, "epoch id"] = i
         annotated_data.loc[mask, "description"] = description_i
         annotated_data.loc[mask, "t_rel"] = data.loc[mask, ts_name] - t_ref_i
+
         local_data = data.loc[mask].copy()
         local_data["t_rel"] = local_data[ts_name] - t_ref_i
         local_data.reset_index(drop=True, inplace=True)
+
         epochs.at[i, "epoch id"] = i
         epochs.at[i, "t_ref"] = t_ref_i
         epochs.at[i, "t_before"] = t_before_i
         epochs.at[i, "t_after"] = t_after_i
         epochs.at[i, "description"] = description_i
         epochs.at[i, "epoch data"] = local_data
 
-    # Drop rows where 'data' is empty
-    for i, epoch in epochs.iterrows():
-        if epoch["epoch data"].empty:
-            epochs.drop(i, inplace=True)
+    # Drop rows where 'epoch data' is empty
+    epochs = epochs.dropna(subset=["epoch data"]).reset_index(drop=True)
 
-    # set datatypes of the columns
+    # Set data types of the columns
     epochs = epochs.astype(
         {
             "epoch id": "Int32",
@@ -207,22 +330,36 @@ def extract_event_times(
     event_name: str = "all",
 ) -> pd.DataFrame:
     """
-    Extract the timestamps of the events from the data stream.
+    Construct event times from a list or array of reference times.
 
     Parameters
     ----------
-    data : pd.DataFrame
-        Data stream to extract event timestamps from. Must contain a 'timestamp [ns]' column.
-    t_before : float
-        Time before the event to start the epoch, in seconds.
-    t_after : float
-        Time after the event to end the epoch, in seconds.
-    event_name : str, optional
-        Name of the event to extract. If 'all', extract all events. Only relevant if a concat stream or event data is provided.
+    t_refs : list or np.ndarray
+        List or array of reference times. Units specified by `time_unit`.
+    t_before : float, np.ndarray, or list
+        Time before the reference time to start the epoch, in **seconds**.
+    t_after : float, np.ndarray, or list
+        Time after the reference time to end the epoch, in **seconds**.
+    description : str, np.ndarray, or list
+        Description or label associated with the epoch.
+    global_t_ref : int or float, optional
+        Global reference time to be added to each reference time in `t_refs`. Units specified by `time_unit`. Default is 0.
+    time_unit : str, optional
+        Unit of time for the reference times and `global_t_ref` ('ns' for nanoseconds or 's' for seconds). Default is 'ns'.
+
     Returns
     -------
     event_times : pd.DataFrame
-        DataFrame containing the timestamps of the events.
+        DataFrame containing the constructed event times with columns:
+        - 't_ref': Reference time of the event, in nanoseconds.
+        - 't_before': Time before the reference time to start the epoch, in nanoseconds.
+        - 't_after': Time after the reference time to end the epoch, in nanoseconds.
+        - 'description': Description or label associated with the event.
+
+    Notes
+    -----
+    - The `t_refs` and `global_t_ref` are combined and converted to nanoseconds according to `time_unit`.
+    - The `t_before` and `t_after` parameters are always expected in **seconds** and will be converted to nanoseconds internally.
     """
 
     if "start timestamp [ns]" not in event_data.columns: