From 8b00838be62a9adb2334ad7a46e9a468429294e8 Mon Sep 17 00:00:00 2001
From: Scott Staniewicz
Date: Wed, 27 Sep 2023 20:40:32 -0400
Subject: [PATCH] Separate `opera_utils.py` (#132)

* separate more opera code from main config

* fix tests and references to `opera_utils`, add changelog entry
---
 CHANGELOG.md                           |   6 +
 src/dolphin/opera_utils.py             | 224 +++++++++++++++++++++++++
 src/dolphin/workflows/_cli_config.py   |  13 +-
 src/dolphin/workflows/_utils.py        | 202 +---------------------
 src/dolphin/workflows/config.py        |  46 +----
 src/dolphin/workflows/s1_disp.py       |   3 +-
 src/dolphin/workflows/wrapped_phase.py |   5 +-
 tests/test_workflows_config.py         |   8 +-
 tests/test_workflows_opera_utils.py    |  47 ++++++
 tests/test_workflows_s1_disp.py        |   5 +-
 tests/test_workflows_utils.py          |  45 -----
 11 files changed, 302 insertions(+), 302 deletions(-)
 create mode 100644 src/dolphin/opera_utils.py
 create mode 100644 tests/test_workflows_opera_utils.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b55f52e5..3e82e80e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Unreleased
 
+**Changed**
+
+- Moved all `OPERA_` variables to a new module `dolphin.opera_utils`.
+  - Other OPERA-specific quirks have been moved to the separate `disp-s1` repo,
+    but the remaining functions are the ones most broadly useful to `sweets`
+    and other users working with burst SLCs.
 
 # [0.4.1](https://github.com/isce-framework/dolphin/compare/v0.4.0...v0.4.1) - 2023-09-08
 
diff --git a/src/dolphin/opera_utils.py b/src/dolphin/opera_utils.py
new file mode 100644
index 00000000..f8f1876d
--- /dev/null
+++ b/src/dolphin/opera_utils.py
@@ -0,0 +1,224 @@
+from __future__ import annotations
+
+import itertools
+import json
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Pattern, Sequence, Union
+
+import h5py
+from shapely import geometry, ops, wkt
+
+from dolphin._log import get_log
+from dolphin._types import Filename
+
+logger = get_log(__name__)
+
+
+# Specific to OPERA CSLC products:
+OPERA_DATASET_ROOT = "/"
+OPERA_DATASET_NAME = f"{OPERA_DATASET_ROOT}/data/VV"
+OPERA_IDENTIFICATION = f"{OPERA_DATASET_ROOT}/identification"
+
+# It should match either of these within a filename:
+# t087_185684_iw2 (which comes from COMPASS)
+# T087-165495-IW3 (which is the official product naming scheme)
+# e.g.
+# OPERA_L2_CSLC-S1_T078-165495-IW3_20190906T232711Z_20230101T100506Z_S1A_VV_v1.0.h5
+
+OPERA_BURST_RE = re.compile(
+    r"(?P<prefix>.*?)(?P<track>\d{3})[-_](?P<burst_id>\d{6})[-_](?P<subswath>iw[1-3])",
+    re.IGNORECASE,
+)
+
+
+def group_by_burst(
+    file_list: Sequence[Filename],
+    burst_id_fmt: Union[str, Pattern[str]] = OPERA_BURST_RE,
+    minimum_images: int = 2,
+) -> dict[str, list[Path]]:
+    """Group Sentinel CSLC files by burst.
+
+    Parameters
+    ----------
+    file_list: list[Filename]
+        list of paths of CSLC files
+    burst_id_fmt: str
+        format of the burst id in the filename.
+        Default is [`OPERA_BURST_RE`][dolphin.opera_utils.OPERA_BURST_RE]
+    minimum_images: int
+        Minimum number of SLCs needed to run the workflow for each burst.
+        If there are fewer SLCs in a burst, it will be skipped and
+        a warning will be logged.
+
+    Returns
+    -------
+    dict
+        Key is the burst id of the SLC acquisition.
+        Value is a list of Paths for that burst:
+        {
+            't087_185678_iw2': [Path(...), Path(...),],
+            't087_185678_iw3': [Path(...),... ],
+        }
+    """
+
+    def get_burst_id(filename):
+        if not (m := re.search(burst_id_fmt, str(filename))):
+            raise ValueError(f"Could not parse burst id from {filename}")
+        return m.group()
+
+    def sort_by_burst_id(file_list):
+        """Sort files by burst id."""
+        burst_ids = [get_burst_id(f) for f in file_list]
+        file_burst_tups = sorted(
+            [(Path(f), d) for f, d in zip(file_list, burst_ids)],
+            # use the date or dates as the key
+            key=lambda f_d_tuple: f_d_tuple[1],  # type: ignore
+        )
+        # Unpack the sorted pairs with new sorted values
+        file_list, burst_ids = zip(*file_burst_tups)  # type: ignore
+        return file_list
+
+    if not file_list:
+        return {}
+
+    sorted_file_list = sort_by_burst_id(file_list)
+    # Now collapse into groups, sorted by the burst_id
+    grouped_images = {
+        burst_id: list(g)
+        for burst_id, g in itertools.groupby(
+            sorted_file_list, key=lambda x: get_burst_id(x)
+        )
+    }
+    # Make sure that each burst has at least the minimum number of SLCs
+    out = {}
+    for burst_id, slc_list in grouped_images.items():
+        if len(slc_list) < minimum_images:
+            logger.warning(
+                f"Skipping burst {burst_id} because it has only {len(slc_list)} SLCs."
+                f" Minimum number of SLCs is {minimum_images}"
+            )
+        else:
+            out[burst_id] = slc_list
+    return out
+
+
+def get_cslc_polygon(
+    opera_file: Filename, buffer_degrees: float = 0.0
+) -> Union[geometry.Polygon, None]:
+    """Get the bounding polygon of a single CSLC file.
+
+    Parameters
+    ----------
+    opera_file : Filename
+        Path to one COMPASS/OPERA CSLC file.
+    buffer_degrees : float, optional
+        Buffer the polygon by this many degrees, by default 0.0
+    """
+    dset_name = f"{OPERA_IDENTIFICATION}/bounding_polygon"
+    with h5py.File(opera_file) as hf:
+        if dset_name not in hf:
+            logger.debug(f"Could not find {dset_name} in {opera_file}")
+            return None
+        wkt_str = hf[dset_name][()].decode("utf-8")
+    return wkt.loads(wkt_str).buffer(buffer_degrees)
+
+
+def get_union_polygon(
+    opera_file_list: Sequence[Filename], buffer_degrees: float = 0.0
+) -> geometry.Polygon:
+    """Get the union of the bounding polygons of the given files.
+
+    Parameters
+    ----------
+    opera_file_list : list[Filename]
+        list of COMPASS SLC filenames.
+    buffer_degrees : float, optional
+        Buffer the polygons by this many degrees, by default 0.0
+    """
+    polygons = [get_cslc_polygon(f, buffer_degrees) for f in opera_file_list]
+    polygons = [p for p in polygons if p is not None]
+
+    if len(polygons) == 0:
+        raise ValueError("No polygons found in the given file list.")
+    # Union all the polygons
+    return ops.unary_union(polygons)
+
+
+def make_nodata_mask(
+    opera_file_list: Sequence[Filename],
+    out_file: Filename,
+    buffer_pixels: int = 0,
+    overwrite: bool = False,
+):
+    """Make a boolean raster mask from the union of nodata polygons.
+
+    Parameters
+    ----------
+    opera_file_list : list[Filename]
+        list of COMPASS SLC filenames.
+    out_file : Filename
+        Output filename.
+    buffer_pixels : int, optional
+        Number of pixels to buffer the union polygon by, by default 0.
+        Note that buffering will *decrease* the number of pixels marked as nodata.
+        This is to be more conservative and avoid masking possibly valid pixels.
+ overwrite : bool, optional + Overwrite the output file if it already exists, by default False + """ + from dolphin import io + + if Path(out_file).exists(): + if not overwrite: + logger.debug(f"Skipping {out_file} since it already exists.") + return + else: + logger.info(f"Overwriting {out_file} since overwrite=True.") + Path(out_file).unlink() + + # Check these are the right format to get nodata polygons + try: + test_f = f"NETCDF:{opera_file_list[0]}:{OPERA_DATASET_NAME}" + # convert pixels to degrees lat/lon + gt = io.get_raster_gt(test_f) + # TODO: more robust way to get the pixel size... this is a hack + # maybe just use pyproj to warp lat/lon to meters and back? + dx_meters = gt[1] + dx_degrees = dx_meters / 111000 + buffer_degrees = buffer_pixels * dx_degrees + except RuntimeError: + raise ValueError(f"Unable to open {test_f}") + + # Get the union of all the polygons and convert to a temp geojson + union_poly = get_union_polygon(opera_file_list, buffer_degrees=buffer_degrees) + # convert shapely polygon to geojson + + # Make a dummy raster from the first file with all 0s + # This will get filled in with the polygon rasterization + cmd = ( + f"gdal_calc.py --quiet --outfile {out_file} --type Byte -A" + f" NETCDF:{opera_file_list[0]}:{OPERA_DATASET_NAME} --calc 'numpy.nan_to_num(A)" + " * 0' --creation-option COMPRESS=LZW --creation-option TILED=YES" + " --creation-option BLOCKXSIZE=256 --creation-option BLOCKYSIZE=256" + ) + logger.info(cmd) + subprocess.check_call(cmd, shell=True) + + with tempfile.TemporaryDirectory() as tmpdir: + temp_vector_file = Path(tmpdir) / "temp.geojson" + with open(temp_vector_file, "w") as f: + f.write( + json.dumps( + { + "geometry": geometry.mapping(union_poly), + "properties": {"id": 1}, + } + ) + ) + + # Now burn in the union of all polygons + cmd = f"gdal_rasterize -q -burn 1 {temp_vector_file} {out_file}" + logger.info(cmd) + subprocess.check_call(cmd, shell=True) diff --git a/src/dolphin/workflows/_cli_config.py b/src/dolphin/workflows/_cli_config.py index c5ba8019..2f1565ec 100644 --- a/src/dolphin/workflows/_cli_config.py +++ b/src/dolphin/workflows/_cli_config.py @@ -7,13 +7,7 @@ from pathlib import Path from typing import Optional, Union -from .config import ( - OPERA_DATASET_NAME, - InterferogramNetworkType, - ShpMethod, - Workflow, - WorkflowName, -) +from .config import InterferogramNetworkType, ShpMethod, Workflow, WorkflowName def create_config( @@ -126,10 +120,7 @@ def get_parser(subparser=None, subcommand_name="run"): inputs.add_argument( "-sds", "--subdataset", - help=( - "Subdataset to use from HDF5/NetCDF files. For OPERA CSLC NetCDF files, if" - f" None is passed, the default is {OPERA_DATASET_NAME}." 
- ), + help="Subdataset to use from HDF5/NetCDF files.", ) inputs.add_argument( "--amplitude-mean-files", diff --git a/src/dolphin/workflows/_utils.py b/src/dolphin/workflows/_utils.py index ea2d9d09..dd9444cb 100644 --- a/src/dolphin/workflows/_utils.py +++ b/src/dolphin/workflows/_utils.py @@ -1,214 +1,14 @@ from __future__ import annotations -import itertools -import json -import re -import subprocess -import tempfile from pathlib import Path -from typing import Pattern, Sequence, Union -import h5py -from shapely import geometry, ops, wkt - -from dolphin import io from dolphin._log import get_log -from dolphin._types import Filename -from .config import OPERA_BURST_RE, OPERA_DATASET_NAME, OPERA_IDENTIFICATION, Workflow +from .config import Workflow logger = get_log(__name__) -def group_by_burst( - file_list: Sequence[Filename], - burst_id_fmt: Union[str, Pattern[str]] = OPERA_BURST_RE, - minimum_images: int = 2, -) -> dict[str, list[Path]]: - """Group Sentinel CSLC files by burst. - - Parameters - ---------- - file_list: list[Filename] - list of paths of CSLC files - burst_id_fmt: str - format of the burst id in the filename. - Default is [`OPERA_BURST_RE`][dolphin.workflows.config.OPERA_BURST_RE] - minimum_images: int - Minimum number of SLCs needed to run the workflow for each burst. - If there are fewer SLCs in a burst, it will be skipped and - a warning will be logged. - - Returns - ------- - dict - key is the burst id of the SLC acquisition - Value is a list of Paths on that burst: - { - 't087_185678_iw2': [Path(...), Path(...),], - 't087_185678_iw3': [Path(...),... ], - } - """ - - def get_burst_id(filename): - m = re.search(burst_id_fmt, str(filename)) - if not m: - raise ValueError(f"Could not parse burst id from {filename}") - return m.group() - - def sort_by_burst_id(file_list): - """Sort files by burst id.""" - burst_ids = [get_burst_id(f) for f in file_list] - file_burst_tups = sorted( - [(Path(f), d) for f, d in zip(file_list, burst_ids)], - # use the date or dates as the key - key=lambda f_d_tuple: f_d_tuple[1], # type: ignore - ) - # Unpack the sorted pairs with new sorted values - file_list, burst_ids = zip(*file_burst_tups) # type: ignore - return file_list - - if not file_list: - return {} - - sorted_file_list = sort_by_burst_id(file_list) - # Now collapse into groups, sorted by the burst_id - grouped_images = { - burst_id: list(g) - for burst_id, g in itertools.groupby( - sorted_file_list, key=lambda x: get_burst_id(x) - ) - } - # Make sure that each burst has at least the minimum number of SLCs - out = {} - for burst_id, slc_list in grouped_images.items(): - if len(slc_list) < minimum_images: - logger.warning( - f"Skipping burst {burst_id} because it has only {len(slc_list)} SLCs." - f"Minimum number of SLCs is {minimum_images}" - ) - else: - out[burst_id] = slc_list - return out - - -def get_cslc_polygon( - opera_file: Filename, buffer_degrees: float = 0.0 -) -> Union[geometry.Polygon, None]: - """Get the union of the bounding polygons of the given files. - - Parameters - ---------- - opera_file : list[Filename] - list of COMPASS SLC filenames. 
- buffer_degrees : float, optional - Buffer the polygons by this many degrees, by default 0.0 - """ - dset_name = f"{OPERA_IDENTIFICATION}/bounding_polygon" - with h5py.File(opera_file) as hf: - if dset_name not in hf: - logger.debug(f"Could not find {dset_name} in {opera_file}") - return None - wkt_str = hf[dset_name][()].decode("utf-8") - return wkt.loads(wkt_str).buffer(buffer_degrees) - - -def get_union_polygon( - opera_file_list: Sequence[Filename], buffer_degrees: float = 0.0 -) -> geometry.Polygon: - """Get the union of the bounding polygons of the given files. - - Parameters - ---------- - opera_file_list : list[Filename] - list of COMPASS SLC filenames. - buffer_degrees : float, optional - Buffer the polygons by this many degrees, by default 0.0 - """ - polygons = [get_cslc_polygon(f, buffer_degrees) for f in opera_file_list] - polygons = [p for p in polygons if p is not None] - - if len(polygons) == 0: - raise ValueError("No polygons found in the given file list.") - # Union all the polygons - return ops.unary_union(polygons) - - -def make_nodata_mask( - opera_file_list: Sequence[Filename], - out_file: Filename, - buffer_pixels: int = 0, - overwrite: bool = False, -): - """Make a boolean raster mask from the union of nodata polygons. - - Parameters - ---------- - opera_file_list : list[Filename] - list of COMPASS SLC filenames. - out_file : Filename - Output filename. - buffer_pixels : int, optional - Number of pixels to buffer the union polygon by, by default 0. - Note that buffering will *decrease* the numba of pixels marked as nodata. - This is to be more conservative to not mask possible valid pixels. - overwrite : bool, optional - Overwrite the output file if it already exists, by default False - """ - if Path(out_file).exists(): - if not overwrite: - logger.debug(f"Skipping {out_file} since it already exists.") - return - else: - logger.info(f"Overwriting {out_file} since overwrite=True.") - Path(out_file).unlink() - - # Check these are the right format to get nodata polygons - try: - test_f = f"NETCDF:{opera_file_list[0]}:{OPERA_DATASET_NAME}" - # convert pixels to degrees lat/lon - gt = io.get_raster_gt(test_f) - # TODO: more robust way to get the pixel size... this is a hack - # maybe just use pyproj to warp lat/lon to meters and back? 
- dx_meters = gt[1] - dx_degrees = dx_meters / 111000 - buffer_degrees = buffer_pixels * dx_degrees - except RuntimeError: - raise ValueError(f"Unable to open {test_f}") - - # Get the union of all the polygons and convert to a temp geojson - union_poly = get_union_polygon(opera_file_list, buffer_degrees=buffer_degrees) - # convert shapely polygon to geojson - - # Make a dummy raster from the first file with all 0s - # This will get filled in with the polygon rasterization - cmd = ( - f"gdal_calc.py --quiet --outfile {out_file} --type Byte -A" - f" NETCDF:{opera_file_list[0]}:{OPERA_DATASET_NAME} --calc 'numpy.nan_to_num(A)" - " * 0' --creation-option COMPRESS=LZW --creation-option TILED=YES" - " --creation-option BLOCKXSIZE=256 --creation-option BLOCKYSIZE=256" - ) - logger.info(cmd) - subprocess.check_call(cmd, shell=True) - - with tempfile.TemporaryDirectory() as tmpdir: - temp_vector_file = Path(tmpdir) / "temp.geojson" - with open(temp_vector_file, "w") as f: - f.write( - json.dumps( - { - "geometry": geometry.mapping(union_poly), - "properties": {"id": 1}, - } - ) - ) - - # Now burn in the union of all polygons - cmd = f"gdal_rasterize -q -burn 1 {temp_vector_file} {out_file}" - logger.info(cmd) - subprocess.check_call(cmd, shell=True) - - def _create_burst_cfg( cfg: Workflow, burst_id: str, diff --git a/src/dolphin/workflows/config.py b/src/dolphin/workflows/config.py index 7ee22acf..e795377a 100644 --- a/src/dolphin/workflows/config.py +++ b/src/dolphin/workflows/config.py @@ -1,6 +1,5 @@ from __future__ import annotations -import re from datetime import datetime from glob import glob from pathlib import Path @@ -30,16 +29,6 @@ logger = get_log(__name__) -# Specific to OPERA CSLC products: -OPERA_DATASET_ROOT = "/" -OPERA_DATASET_NAME = f"{OPERA_DATASET_ROOT}/data/VV" -OPERA_IDENTIFICATION = f"{OPERA_DATASET_ROOT}/identification" - -# for example, t087_185684_iw2 -OPERA_BURST_RE = re.compile( - r"t(?P\d{3})_(?P\d{6})_(?Piw[1-3])" -) - class PsOptions(BaseModel, extra="forbid"): """Options for the PS pixel selection portion of the workflow.""" @@ -229,11 +218,7 @@ class InputOptions(BaseModel, extra="forbid"): subdataset: Optional[str] = Field( None, - description=( - "If passing HDF5/NetCDF files, subdataset to use from CSLC files. " - f"If not specified, but all `cslc_file_list` looks like {OPERA_BURST_RE}, " - f" will use {OPERA_DATASET_NAME} as the subdataset." - ), + description="If passing HDF5/NetCDF files, subdataset to use from CSLC files. ", ) cslc_date_fmt: str = Field( "%Y%m%d", @@ -420,12 +405,6 @@ def _check_input_file_list(cls, v): return list(v) - @staticmethod - def _is_opera_file_list(cslc_file_list): - return all( - re.search(OPERA_BURST_RE, str(f)) is not None for f in cslc_file_list - ) - @model_validator(mode="after") def _check_slc_files_exist(self) -> "Workflow": file_list = self.cslc_file_list @@ -435,11 +414,11 @@ def _check_slc_files_exist(self) -> "Workflow": input_options = self.input_options date_fmt = input_options.cslc_date_fmt # Filter out files that don't have dates in the filename - file_matching_date = [Path(f) for f in file_list if get_dates(f, fmt=date_fmt)] - if len(file_matching_date) < len(file_list): + files_matching_date = [Path(f) for f in file_list if get_dates(f, fmt=date_fmt)] + if len(files_matching_date) < len(file_list): raise ValueError( - f"Found {len(file_matching_date)} files with dates in the filename" - f" out of {len(file_list)} files." 
+ f"Found {len(files_matching_date)} files with dates like {date_fmt} in" + f" the filename out of {len(file_list)} files." ) ext = file_list[0].suffix @@ -447,18 +426,9 @@ def _check_slc_files_exist(self) -> "Workflow": if ext in [".h5", ".nc"]: subdataset = input_options.subdataset if subdataset is None: - if self._is_opera_file_list(file_list): - # Assume that the user forgot to set the subdataset, and set it to the - # default OPERA dataset name - logger.info( - "CSLC files look like OPERA files, setting subdataset to" - f" {OPERA_DATASET_NAME}." - ) - subdataset = input_options.subdataset = OPERA_DATASET_NAME - else: - raise ValueError( - "Must provide subdataset name for input HDF5 files." - ) + raise ValueError( + "Must provide subdataset name for input NetCDF/HDF5 files." + ) # Coerce the file_list to a sorted list of Path objects self.cslc_file_list = [ diff --git a/src/dolphin/workflows/s1_disp.py b/src/dolphin/workflows/s1_disp.py index 0d11b559..ff95e883 100755 --- a/src/dolphin/workflows/s1_disp.py +++ b/src/dolphin/workflows/s1_disp.py @@ -9,10 +9,11 @@ from dolphin import __version__ from dolphin._background import DummyProcessPoolExecutor from dolphin._log import get_log, log_runtime +from dolphin.opera_utils import group_by_burst from dolphin.utils import get_max_memory_usage, set_num_threads from . import stitch_and_unwrap, wrapped_phase -from ._utils import _create_burst_cfg, _remove_dir_if_empty, group_by_burst +from ._utils import _create_burst_cfg, _remove_dir_if_empty from .config import Workflow diff --git a/src/dolphin/workflows/wrapped_phase.py b/src/dolphin/workflows/wrapped_phase.py index cdcbeaf4..c4844b1c 100644 --- a/src/dolphin/workflows/wrapped_phase.py +++ b/src/dolphin/workflows/wrapped_phase.py @@ -7,8 +7,9 @@ from dolphin._background import NvidiaMemoryWatcher from dolphin._log import get_log, log_runtime from dolphin.interferogram import Network +from dolphin.opera_utils import make_nodata_mask -from . import _utils, sequential, single +from . 
import sequential, single from .config import Workflow @@ -54,7 +55,7 @@ def run(cfg: Workflow, debug: bool = False) -> tuple[list[Path], Path, Path, Pat # Make the nodata mask from the polygons, if we're using OPERA CSLCs try: nodata_mask_file = cfg.work_directory / "nodata_mask.tif" - _utils.make_nodata_mask( + make_nodata_mask( vrt_stack.file_list, out_file=nodata_mask_file, buffer_pixels=200 ) except Exception as e: diff --git a/tests/test_workflows_config.py b/tests/test_workflows_config.py index fc67f855..9f25bf9e 100644 --- a/tests/test_workflows_config.py +++ b/tests/test_workflows_config.py @@ -240,7 +240,7 @@ def test_input_date_sort(dir_with_2_slcs): def test_input_opera_cslc(tmp_path, slc_stack): - """Check that we recognize the OPERA filename format and don't need a subdataset.""" + """Check that we recognize the OPERA filename format.""" # Make a file with the OPERA name like OPERA_BURST_RE # r"t(?P\d{3})_(?P\d{6})_(?Piw[1-3])" start_date = 20220101 @@ -260,9 +260,11 @@ def test_input_opera_cslc(tmp_path, slc_stack): ) file_list.append(Path(fname)) - opts = config.Workflow(cslc_file_list=file_list) + opts = config.Workflow( + cslc_file_list=file_list, input_options=dict(subdataset="/data/VV") + ) assert opts.cslc_file_list == file_list - assert opts.input_options.subdataset == config.OPERA_DATASET_NAME + assert opts.input_options.subdataset == "/data/VV" def test_input_cslc_empty(): diff --git a/tests/test_workflows_opera_utils.py b/tests/test_workflows_opera_utils.py new file mode 100644 index 00000000..2d6686af --- /dev/null +++ b/tests/test_workflows_opera_utils.py @@ -0,0 +1,47 @@ +import random +from itertools import chain +from pathlib import Path + +import pytest + +from dolphin.opera_utils import group_by_burst + + +def test_group_by_burst(): + expected = { + "t087_185678_iw2": [ + Path("t087_185678_iw2/20180210/t087_185678_iw2_20180210_VV.h5"), + Path("t087_185678_iw2/20180318/t087_185678_iw2_20180318_VV.h5"), + Path("t087_185678_iw2/20180423/t087_185678_iw2_20180423_VV.h5"), + ], + "t087_185678_iw3": [ + Path("t087_185678_iw3/20180210/t087_185678_iw3_20180210_VV.h5"), + Path("t087_185678_iw3/20180318/t087_185678_iw3_20180318_VV.h5"), + Path("t087_185678_iw3/20180517/t087_185678_iw3_20180517_VV.h5"), + ], + "t087_185679_iw1": [ + Path("t087_185679_iw1/20180210/t087_185679_iw1_20180210_VV.h5"), + Path("t087_185679_iw1/20180318/t087_185679_iw1_20180318_VV.h5"), + ], + } + in_files = list(chain.from_iterable(expected.values())) + + assert group_by_burst(in_files) == expected + + # Any order should work + random.shuffle(in_files) + # but the order of the lists of each key may be different + for burst, file_list in group_by_burst(in_files).items(): + assert sorted(file_list) == sorted(expected[burst]) + + +def test_group_by_burst_non_opera(): + with pytest.raises(ValueError, match="Could not parse burst id"): + group_by_burst(["20200101.slc", "20200202.slc"]) + # A combination should still error + group_by_burst( + [ + "20200101.slc", + Path("t087_185679_iw1/20180210/t087_185679_iw1_20180210_VV.h5"), + ] + ) diff --git a/tests/test_workflows_s1_disp.py b/tests/test_workflows_s1_disp.py index bf6f0b1e..5accd07e 100644 --- a/tests/test_workflows_s1_disp.py +++ b/tests/test_workflows_s1_disp.py @@ -7,6 +7,7 @@ import pytest from make_netcdf import create_test_nc +from dolphin.opera_utils import OPERA_DATASET_NAME from dolphin.workflows import config, s1_disp # 'Grid size 49 will likely result in GPU under-utilization due to low occupancy.' 
@@ -28,7 +29,7 @@ def opera_slc_files(tmp_path, slc_stack) -> list[Path]: d.mkdir() file_list = [] - *group_parts, ds_name = config.OPERA_DATASET_NAME.split("/") + *group_parts, ds_name = OPERA_DATASET_NAME.split("/") group = "/".join(group_parts) for burst_id in ["t087_185683_iw2", "t087_185684_iw2"]: for i in range(len(slc_stack)): @@ -54,6 +55,7 @@ def test_s1_disp_run_single(opera_slc_files: list[Path], tmpdir): cfg = config.Workflow( workflow_name=config.WorkflowName.SINGLE, cslc_file_list=opera_slc_files, + input_options=dict(subdataset="/data/VV"), interferogram_network=dict( network_type=config.InterferogramNetworkType.MANUAL_INDEX, indexes=[(0, -1)], @@ -73,6 +75,7 @@ def test_s1_disp_run_stack(opera_slc_files: list[Path], tmpdir): cfg = config.Workflow( workflow_name=config.WorkflowName.STACK, cslc_file_list=opera_slc_files, + input_options=dict(subdataset="/data/VV"), phase_linking=dict( ministack_size=500, ), diff --git a/tests/test_workflows_utils.py b/tests/test_workflows_utils.py index 73d7f6aa..4818e88e 100644 --- a/tests/test_workflows_utils.py +++ b/tests/test_workflows_utils.py @@ -1,56 +1,11 @@ -import random -from itertools import chain -from pathlib import Path - import numpy as np import pytest from osgeo import gdal from dolphin import stack -from dolphin.workflows import group_by_burst from dolphin.workflows.single import setup_output_folder -def test_group_by_burst(): - expected = { - "t087_185678_iw2": [ - Path("t087_185678_iw2/20180210/t087_185678_iw2_20180210_VV.h5"), - Path("t087_185678_iw2/20180318/t087_185678_iw2_20180318_VV.h5"), - Path("t087_185678_iw2/20180423/t087_185678_iw2_20180423_VV.h5"), - ], - "t087_185678_iw3": [ - Path("t087_185678_iw3/20180210/t087_185678_iw3_20180210_VV.h5"), - Path("t087_185678_iw3/20180318/t087_185678_iw3_20180318_VV.h5"), - Path("t087_185678_iw3/20180517/t087_185678_iw3_20180517_VV.h5"), - ], - "t087_185679_iw1": [ - Path("t087_185679_iw1/20180210/t087_185679_iw1_20180210_VV.h5"), - Path("t087_185679_iw1/20180318/t087_185679_iw1_20180318_VV.h5"), - ], - } - in_files = list(chain.from_iterable(expected.values())) - - assert group_by_burst(in_files) == expected - - # Any order should work - random.shuffle(in_files) - # but the order of the lists of each key may be different - for burst, file_list in group_by_burst(in_files).items(): - assert sorted(file_list) == sorted(expected[burst]) - - -def test_group_by_burst_non_opera(): - with pytest.raises(ValueError, match="Could not parse burst id"): - group_by_burst(["20200101.slc", "20200202.slc"]) - # A combination should still error - group_by_burst( - [ - "20200101.slc", - Path("t087_185679_iw1/20180210/t087_185679_iw1_20180210_VV.h5"), - ] - ) - - def test_setup_output_folder(tmpdir, tiled_file_list): vrt_stack = stack.VRTStack(tiled_file_list, outfile=tmpdir / "stack.vrt") out_file_list = setup_output_folder(vrt_stack, driver="GTiff", dtype=np.complex64)
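
Note on downstream usage (illustration only, not part of the patch): after this refactor the only change most callers need is the import path, e.g. `from dolphin.opera_utils import group_by_burst` instead of pulling it from `dolphin.workflows._utils`, plus an explicit `subdataset`, since OPERA auto-detection was removed from `Workflow`. The sketch below is a minimal, hypothetical example: the `cslc_dir` glob and output path are made up, while the function signatures, the `buffer_pixels=200` call, and the `/data/VV` subdataset come from the diff and tests above.

    from pathlib import Path

    from dolphin.opera_utils import group_by_burst, make_nodata_mask
    from dolphin.workflows import config

    # Hypothetical stack of COMPASS/OPERA CSLC files (burst id in each filename).
    cslc_files = sorted(Path("cslc_dir").glob("t087_*_iw*/**/*.h5"))

    # Group by burst id; bursts with fewer than `minimum_images` SLCs are
    # skipped with a warning.
    files_by_burst = group_by_burst(cslc_files, minimum_images=2)

    # Rasterize the union of the CSLC bounding polygons into a byte mask,
    # matching the call wrapped_phase.py now makes through dolphin.opera_utils.
    make_nodata_mask(cslc_files, out_file="nodata_mask.tif", buffer_pixels=200)

    # The subdataset must now be passed explicitly for NetCDF/HDF5 inputs,
    # as the updated tests do.
    cfg = config.Workflow(
        cslc_file_list=cslc_files,
        input_options=dict(subdataset="/data/VV"),
    )

`group_by_burst` returns a dict mapping burst id to the list of Paths for that burst, so per-burst configs can be built the same way `s1_disp.py` does with `_create_burst_cfg`.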