From 882805a1bef8a176320d6e265219efc3185a8b86 Mon Sep 17 00:00:00 2001
From: Feda Curic <feda.curic@gmail.com>
Date: Fri, 23 Aug 2024 08:38:07 +0200
Subject: [PATCH] Don't return duplicate data from export tool

Happens when there are multiple experiments that have ensembles with the same name.
---
 src/ert/gui/ertwidgets/listeditbox.py         | 36 ++++----
 src/ert/gui/tools/export/export_panel.py      | 22 ++++-
 .../jobs/internal-gui/scripts/csv_export.py   | 88 ++++++++++---------
 .../scripts/gen_data_rft_export.py            | 49 ++++++-----
 tests/unit_tests/gui/test_csv_export.py       | 68 +++++++++++---
 5 files changed, 168 insertions(+), 95 deletions(-)

diff --git a/src/ert/gui/ertwidgets/listeditbox.py b/src/ert/gui/ertwidgets/listeditbox.py
index 64ef9cca3f0..92aeadc1dc2 100644
--- a/src/ert/gui/ertwidgets/listeditbox.py
+++ b/src/ert/gui/ertwidgets/listeditbox.py
@@ -1,4 +1,5 @@
-from typing import Iterable, List, Optional
+from typing import Dict, Iterable, Optional
+from uuid import UUID
 
 from qtpy.QtCore import QSize, Qt
 from qtpy.QtGui import QIcon, QKeyEvent
@@ -86,13 +87,14 @@ class ListEditBox(QWidget):
     NO_ITEMS_SPECIFIED_MSG = "The list must contain at least one item or * (for all)."
     DEFAULT_MSG = "A list of comma separated ensemble names or * for all."
 
-    def __init__(self, possible_items: List[str]) -> None:
+    def __init__(self, possible_items: Dict[UUID, str]) -> None:
         QWidget.__init__(self)
 
         self._editing = True
-        self._possible_items = possible_items
+        self._possible_items_dict = possible_items
+        self._possible_items = list(possible_items.values())
 
-        self._list_edit_line = AutoCompleteLineEdit(possible_items, self)
+        self._list_edit_line = AutoCompleteLineEdit(self._possible_items, self)
         self._list_edit_line.setMinimumWidth(350)
 
         layout = QHBoxLayout()
@@ -127,21 +129,27 @@ def getListText(self) -> str:
         text = "".join(text.split())
         return text
 
-    def getItems(self) -> List[str]:
+    def getItems(self) -> Dict[UUID, str]:
         text = self.getListText()
         items = text.split(",")
 
         if len(items) == 1 and items[0] == "*":
-            items = self._possible_items
+            return self._possible_items_dict
 
-        return [item for item in items if len(item) > 0]
+        result = {}
+        for item in items:
+            item = item.strip()
+            for uuid, name in self._possible_items_dict.items():
+                if name == item:
+                    result[uuid] = name
+                    break
+
+        return result
 
     def validateList(self) -> None:
         """Called whenever the list is modified"""
         palette = self._list_edit_line.palette()
-
         items = self.getItems()
-
         valid = True
         message = ""
 
@@ -149,19 +157,17 @@ def validateList(self) -> None:
             valid = False
             message = ListEditBox.NO_ITEMS_SPECIFIED_MSG
         else:
-            for item in items:
-                if item not in self._possible_items:
+            for _, name in items.items():
+                if name not in self._possible_items_dict.values():
                     valid = False
-                    message = ListEditBox.ITEM_DOES_NOT_EXIST_MSG % item
+                    message = ListEditBox.ITEM_DOES_NOT_EXIST_MSG % name
+                    break
 
         validity_type = ValidationSupport.WARNING
-
         color = ValidationSupport.ERROR_COLOR if not valid else self._valid_color
-
         self._validation_support.setValidationMessage(message, validity_type)
         self._list_edit_line.setToolTip(message)
         palette.setColor(self._list_edit_line.backgroundRole(), color)
-
         self._list_edit_line.setPalette(palette)
 
         if valid:
diff --git a/src/ert/gui/tools/export/export_panel.py b/src/ert/gui/tools/export/export_panel.py
index f4241441ed8..03c1a1c33df 100644
--- a/src/ert/gui/tools/export/export_panel.py
+++ b/src/ert/gui/tools/export/export_panel.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING, Optional
 
 from qtpy.QtWidgets import QCheckBox, QWidget
@@ -23,6 +24,7 @@ def __init__(
         storage: LocalStorage,
         parent: Optional[QWidget] = None,
     ) -> None:
+        self.storage = storage
         description = "The CSV export requires some information before it starts:"
         super().__init__("export", description, parent)
 
@@ -37,9 +39,12 @@ def __init__(
         )
         design_matrix_path_chooser = PathChooser(self.design_matrix_path_model)
 
-        self.list_edit = ListEditBox(
-            [ensemble.name for ensemble in storage.ensembles if ensemble.has_data()]
-        )
+        ensemble_with_data_dict = {
+            ensemble.id: ensemble.name
+            for ensemble in storage.ensembles
+            if ensemble.has_data()
+        }
+        self.list_edit = ListEditBox(ensemble_with_data_dict)
 
         self.drop_const_columns_check = QCheckBox()
         self.drop_const_columns_check.setChecked(False)
@@ -60,7 +65,16 @@ def output_path(self) -> Optional[str]:
 
     @property
     def ensemble_list(self) -> str:
-        return ",".join(self.list_edit.getItems())
+        ensembles = {
+            str(ensemble.id): {
+                "ensemble_name": ensemble.name,
+                "experiment_name": ensemble.experiment.name,
+            }
+            for ensemble in self.storage.ensembles
+            if ensemble.name in self.list_edit.getItems().values()
+        }
+
+        return json.dumps(ensembles)
 
     @property
     def design_matrix_path(self) -> Optional[str]:
diff --git a/src/ert/resources/workflows/jobs/internal-gui/scripts/csv_export.py b/src/ert/resources/workflows/jobs/internal-gui/scripts/csv_export.py
index 829b9665ab4..18d5b340f8c 100644
--- a/src/ert/resources/workflows/jobs/internal-gui/scripts/csv_export.py
+++ b/src/ert/resources/workflows/jobs/internal-gui/scripts/csv_export.py
@@ -1,3 +1,4 @@
+import json
 import os
 
 import pandas
@@ -22,9 +23,9 @@ class CSVExportJob(ErtScript):
 
     Optional arguments:
 
-    ensemble_list: a comma separated list of ensembles to export (no spaces allowed)
-               if no list is provided the current ensemble is exported
-               a single * can be used to export all ensembles
+    ensemble_list: a JSON string mapping ensemble UUID strings to dictionaries
+                   with the keys "ensemble_name" and "experiment_name".
+                   A single * can be used to export all ensembles
 
     design_matrix: a path to a file containing the design matrix
 
@@ -57,17 +58,19 @@ def run(
         design_matrix_path = None if len(workflow_args) < 3 else workflow_args[2]
         _ = True if len(workflow_args) < 4 else workflow_args[3]
         drop_const_cols = False if len(workflow_args) < 5 else workflow_args[4]
-        ensembles = []
         facade = LibresFacade(ert_config)
 
-        ensembles = ensemble_list.split(",")
+        ensemble_dict = json.loads(ensemble_list) if ensemble_list else {}
 
-        if ensemble_list is None or len(ensembles) == 0:
-            ensembles = "default"
+        # Use the keys (UUIDs as strings) to get ensembles
+        ensembles = []
+        for ensemble_id in ensemble_dict:
+            ensemble = self.storage.get_ensemble(ensemble_id)
+            ensembles.append(ensemble)
 
         if design_matrix_path is not None:
             if not os.path.exists(design_matrix_path):
-                raise UserWarning("The design matrix file does not exists!")
+                raise UserWarning("The design matrix file does not exist!")
 
             if not os.path.isfile(design_matrix_path):
                 raise UserWarning("The design matrix is not a file!")
@@ -75,41 +78,44 @@ def run(
         data = pandas.DataFrame()
 
         for ensemble in ensembles:
-            ensemble = ensemble.strip()
-
             try:
-                ensemble = self.storage.get_ensemble_by_name(ensemble)
-            except KeyError as exc:
-                raise UserWarning(f"The ensemble '{ensemble}' does not exist!") from exc
-
-            if not ensemble.has_data():
-                raise UserWarning(f"The ensemble '{ensemble}' does not have any data!")
-
-            ensemble_data = ensemble.load_all_gen_kw_data()
+                if not ensemble.has_data():
+                    raise UserWarning(
+                        f"The ensemble '{ensemble.name}' does not have any data!"
+                    )
+
+                ensemble_data = ensemble.load_all_gen_kw_data()
+
+                if design_matrix_path is not None:
+                    design_matrix_data = loadDesignMatrix(design_matrix_path)
+                    if not design_matrix_data.empty:
+                        ensemble_data = ensemble_data.join(
+                            design_matrix_data, how="outer"
+                        )
+
+                misfit_data = facade.load_all_misfit_data(ensemble)
+                if not misfit_data.empty:
+                    ensemble_data = ensemble_data.join(misfit_data, how="outer")
+
+                summary_data = ensemble.load_all_summary_data()
+                if not summary_data.empty:
+                    ensemble_data = ensemble_data.join(summary_data, how="outer")
+                else:
+                    ensemble_data["Date"] = None
+                    ensemble_data.set_index(["Date"], append=True, inplace=True)
+
+                ensemble_data["Iteration"] = ensemble.iteration
+                ensemble_data["Ensemble"] = ensemble.name
+                ensemble_data.set_index(
+                    ["Ensemble", "Iteration"], append=True, inplace=True
+                )
+
+                data = pandas.concat([data, ensemble_data])
 
-            if design_matrix_path is not None:
-                design_matrix_data = loadDesignMatrix(design_matrix_path)
-                if not design_matrix_data.empty:
-                    ensemble_data = ensemble_data.join(design_matrix_data, how="outer")
-
-            misfit_data = facade.load_all_misfit_data(ensemble)
-            if not misfit_data.empty:
-                ensemble_data = ensemble_data.join(misfit_data, how="outer")
-
-            summary_data = ensemble.load_all_summary_data()
-            if not summary_data.empty:
-                ensemble_data = ensemble_data.join(summary_data, how="outer")
-            else:
-                ensemble_data["Date"] = None
-                ensemble_data.set_index(["Date"], append=True, inplace=True)
-
-            ensemble_data["Iteration"] = ensemble.iteration
-            ensemble_data["Ensemble"] = ensemble.name
-            ensemble_data.set_index(
-                ["Ensemble", "Iteration"], append=True, inplace=True
-            )
-
-            data = pandas.concat([data, ensemble_data])
+            except KeyError as exc:
+                raise UserWarning(
+                    f"The ensemble '{ensemble.name}' does not exist!"
+                ) from exc
 
         data = data.reorder_levels(["Realization", "Iteration", "Date", "Ensemble"])
         if drop_const_cols:
diff --git a/src/ert/resources/workflows/jobs/internal-gui/scripts/gen_data_rft_export.py b/src/ert/resources/workflows/jobs/internal-gui/scripts/gen_data_rft_export.py
index c04c426252a..9ac47046252 100644
--- a/src/ert/resources/workflows/jobs/internal-gui/scripts/gen_data_rft_export.py
+++ b/src/ert/resources/workflows/jobs/internal-gui/scripts/gen_data_rft_export.py
@@ -1,4 +1,5 @@
 import contextlib
+import json
 import os
 
 import numpy
@@ -77,16 +78,8 @@ def run(
         storage,
         workflow_args,
     ):
-        """The run method will export the RFT's for all wells and all ensembles.
+        """The run method will export the RFT's for all wells and all ensembles."""
 
-        The successful operation of this method hinges on two naming
-        conventions:
-
-          1. All the GEN_DATA RFT observations have key RFT_$WELL
-          2. The trajectory files are in $trajectory_path/$WELL.txt
-             or $trajectory_path/$WELL_R.txt
-
-        """
         output_file = workflow_args[0]
         trajectory_path = workflow_args[1]
         ensemble_list = None if len(workflow_args) < 3 else workflow_args[2]
@@ -94,20 +87,18 @@ def run(
 
         wells = set()
 
-        ensemble_names = []
-        if ensemble_list is not None:
-            ensemble_names = ensemble_list.split(",")
+        # Parse the ensemble_list from JSON string to dictionary
+        ensemble_dict = json.loads(ensemble_list) if ensemble_list else {}
 
-        if len(ensemble_names) == 0:
+        if not ensemble_dict:
             raise UserWarning("No ensembles given to load from")
 
         data = []
-        for ensemble_name in ensemble_names:
-            ensemble_name = ensemble_name.strip()
-            ensemble_data = []
+        for ensemble_id, ensemble_info in ensemble_dict.items():
+            ensemble_name = ensemble_info["ensemble_name"]
 
             try:
-                ensemble = storage.get_ensemble_by_name(ensemble_name)
+                ensemble = storage.get_ensemble(ensemble_id)
             except KeyError as exc:
                 raise UserWarning(
                     f"The ensemble '{ensemble_name}' does not exist!"
@@ -130,6 +121,7 @@ def run(
                     " GENERAL_OBSERVATIONS starting with RFT_*"
                 )
 
+            ensemble_data = []
             for obs_key in obs_keys:
                 well = obs_key.replace("RFT_", "")
                 wells.add(well)
@@ -155,8 +147,6 @@ def run(
                     columns=realizations,
                 )
 
-                realizations = ensemble.get_realization_list_with_responses()
-
                 # Trajectory
                 trajectory_file = os.path.join(trajectory_path, f"{well}.txt")
                 if not os.path.isfile(trajectory_file):
@@ -223,8 +213,12 @@ def getArguments(self, parent, storage):
         trajectory_chooser = PathChooser(trajectory_model)
         trajectory_chooser.setObjectName("trajectory_chooser")
 
-        all_ensemble_list = [ensemble.name for ensemble in storage.ensembles]
-        list_edit = ListEditBox(all_ensemble_list)
+        ensemble_with_data_dict = {
+            ensemble.id: ensemble.name
+            for ensemble in storage.ensembles
+            if ensemble.has_data()
+        }
+        list_edit = ListEditBox(ensemble_with_data_dict)
         list_edit.setObjectName("list_of_ensembles")
 
         drop_const_columns_check = QCheckBox()
@@ -244,12 +238,21 @@ def getArguments(self, parent, storage):
         success = dialog.showAndTell()
 
         if success:
-            ensemble_list = ",".join(list_edit.getItems())
+            ensemble_list = {
+                str(ensemble.id): {
+                    "ensemble_name": ensemble.name,
+                    "experiment_name": ensemble.experiment.name,
+                }
+                for ensemble in storage.ensembles
+                if ensemble.name in list_edit.getItems().values()
+            }
             with contextlib.suppress(ValueError):
                 return [
                     output_path_model.getPath(),
                     trajectory_model.getPath(),
-                    ensemble_list,
+                    json.dumps(
+                        ensemble_list
+                    ),  # Return the ensemble list as a JSON string
                     drop_const_columns_check.isChecked(),
                 ]
 
diff --git a/tests/unit_tests/gui/test_csv_export.py b/tests/unit_tests/gui/test_csv_export.py
index 2b391feda7c..70597d819f4 100644
--- a/tests/unit_tests/gui/test_csv_export.py
+++ b/tests/unit_tests/gui/test_csv_export.py
@@ -1,27 +1,25 @@
+import contextlib
 import os
+import shutil
 from pathlib import Path
 
+import pandas as pd
 import pytest
 from qtpy.QtCore import Qt, QTimer
-from qtpy.QtWidgets import (
-    QMessageBox,
-)
+from qtpy.QtWidgets import QComboBox, QMessageBox, QWidget
 
 from ert.gui.ertwidgets.listeditbox import ListEditBox
 from ert.gui.ertwidgets.pathchooser import PathChooser
+from ert.gui.simulation.experiment_panel import EnsembleExperimentPanel, ExperimentPanel
+from ert.gui.simulation.run_dialog import RunDialog
 from ert.gui.tools.export.export_panel import ExportDialog
 from ert.libres_facade import LibresFacade
+from ert.run_models import EnsembleExperiment
 
-from .conftest import (
-    get_child,
-    wait_for_child,
-)
+from .conftest import get_child, wait_for_child
 
 
-@pytest.mark.parametrize("ensemble_select", ["default_0", "*"])
-def test_csv_export(esmda_has_run, qtbot, ensemble_select):
-    gui = esmda_has_run
-
+def export_data(gui, qtbot, ensemble_select, export_path="output.csv"):
     file_name = None
 
     def handle_export_dialog():
@@ -53,9 +51,13 @@ def handle_finished_box():
     QTimer.singleShot(3000, handle_finished_box)
 
     gui.tools["Export data"].trigger()
-    assert file_name == "output.csv"
+    assert file_name == export_path
     qtbot.waitUntil(lambda: os.path.exists(file_name))
 
+    return file_name
+
+
+def verify_exported_content(file_name, gui, ensemble_select):
     file_content = Path(file_name).read_text(encoding="utf-8")
     ensemble_names = [ensemble_select]
     if ensemble_select == "*":
@@ -72,3 +74,45 @@ def handle_finished_box():
                 f",{name},{gen_kw_data.iloc[i]['COEFFS:a']},{gen_kw_data.iloc[i]['COEFFS:b']},{gen_kw_data.iloc[i]['COEFFS:c']},{misfit_data.iloc[i]['MISFIT:POLY_OBS']},{misfit_data.iloc[i]['MISFIT:TOTAL']}"
                 in file_content
             )
+
+
+@pytest.mark.parametrize("ensemble_select", ["default_0", "*"])
+def test_csv_export(esmda_has_run, qtbot, ensemble_select):
+    gui = esmda_has_run
+
+    file_name = export_data(gui, qtbot, ensemble_select)
+    verify_exported_content(file_name, gui, ensemble_select)
+
+
+def run_experiment_and_export(gui, qtbot):
+    experiment_panel = get_child(gui, ExperimentPanel)
+    simulation_mode_combo = get_child(experiment_panel, QComboBox)
+    simulation_mode_combo.setCurrentText(EnsembleExperiment.name())
+    ensemble_experiment_panel = get_child(experiment_panel, EnsembleExperimentPanel)
+    ensemble_experiment_panel._ensemble_name_field.setText("iter-0")
+
+    # Avoids run path dialog
+    with contextlib.suppress(FileNotFoundError):
+        shutil.rmtree("poly_out")
+
+    run_experiment = get_child(experiment_panel, QWidget, name="run_experiment")
+    qtbot.mouseClick(run_experiment, Qt.LeftButton)
+
+    run_dialog = wait_for_child(gui, qtbot, RunDialog)
+    qtbot.waitUntil(run_dialog.done_button.isVisible, timeout=100000)
+    qtbot.waitUntil(lambda: run_dialog._tab_widget.currentWidget() is not None)
+    qtbot.mouseClick(run_dialog.done_button, Qt.LeftButton)
+
+
+def test_that_export_tool_does_not_produce_duplicate_data(
+    ensemble_experiment_has_run_no_failure, qtbot
+):
+    gui = ensemble_experiment_has_run_no_failure
+
+    run_experiment_and_export(gui, qtbot)
+
+    file_name = export_data(gui, qtbot, "*")
+
+    df = pd.read_csv(file_name)
+    # Make sure data is not duplicated.
+    assert df.iloc[0]["COEFFS:a"] != df.iloc[20]["COEFFS:a"]