From 9e9b3d372a1e11da2f9333dafa025c32a0dcf28b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Diego=20Alonso=20=C3=81lvarez?=
 <d.alonso-alvarez@imperial.ac.uk>
Date: Fri, 4 Aug 2023 12:48:22 +0200
Subject: [PATCH] :memo: Add missing docstrings

---
 validated/functions.py | 103 +++++++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 41 deletions(-)

diff --git a/validated/functions.py b/validated/functions.py
index d8b924b0..cd93448f 100755
--- a/validated/functions.py
+++ b/validated/functions.py
@@ -237,18 +237,18 @@ def measurement_to_df(
 
 
 def join_data_and_preprocess(
-    data: Sequence[pd.DataFrame], maximum: float, minimum: float, fields: Sequence
+    data: Sequence[pd.DataFrame], maximum: Decimal, minimum: Decimal, fields: Sequence
 ) -> pd.DataFrame:
-    """
+    """Joins data frames and performs some validation of the data limits.
 
     Args:
-        data:
-        maximum:
-        minimum:
-        fields:
+        data: List of data frames to join and pre-process together.
+        maximum: Maximum value expected for the variable.
+        minimum: Minimum value expected for the variable.
+        fields: Data fields that are expected in this data.
 
     Returns:
-
+        The joint data frame.
     """
     joined = pd.concat(data).sort_values(
         by=["time_truncated", "is_validated", "id"], ascending=[True, False, False]
@@ -275,13 +275,19 @@ def join_data_and_preprocess(
 
 
 def select_values(joined: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
+    """Select the first values with a given time, labelling them in the dataframe.
+
+    When the times are truncated to the appropriate resolution, several entries might
+    have the same time. This function selects only the first of such duplicated entries,
+    providing as output the original dataframe with the selected entries labelled and
+    also the selected entries themselves as a separate dataframe.
 
     Args:
-        joined:
+        joined: The data frame with all the data.
 
     Returns:
-
+        The joined dataframe with the selected entries labelled and the dataframe only
+        with the selected entries.
     """
     selected = joined.drop_duplicates("time_truncated", keep="first")
     selected.reset_index(drop=True, inplace=True)
@@ -293,21 +299,26 @@ def select_values(joined: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
         indicator=False,
     )
     joined["is_selected"].fillna(False, inplace=True)
-    return selected
+    return joined, selected
 
 
 def verify_time_lapse_status(
     joined: pd.DataFrame, selected: pd.DataFrame, period: float
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """
+    """Verifies if the period of time entries is correct, labelling them appropriately.
 
     Args:
-        joined:
-        selected:
-        period:
+        joined: The full dataframe with the joined data.
+        selected: The subset of the selected data.
+        period: The expected period for the measurements.
 
     Returns:
-
+        Both input dataframes with updated columns:
+            - time_lapse: with the time separation between entries.
+            - time_lapse_status: flag that indicates if the period is correct (1), too
+                small (0) or too large (2)
+            - lagged_value: with the variable value of the next record, used to identify
+                suspicious changes in value.
     """
     selected["time_lapse"] = selected["time_truncated"] - selected[
         "time_truncated"
@@ -334,36 +345,38 @@ def verify_time_lapse_status(
 
 
 def flag_value_difference_error(
-    data: pd.DataFrame, diff_error: Decimal
+    joined: pd.DataFrame, diff_error: Decimal
 ) -> pd.DataFrame:
-    """
+    """Identifies suspicious values based on the difference with the lagged ones.
 
     Args:
-        data:
-        diff_error:
+        joined: The full dataframe with the joined data.
+        diff_error: The value difference allowed for this variable.
 
     Returns:
-
+        The input dataframe with two new columns:
+            - value_difference: With the difference in values.
+            - value_difference_error: Flag indicating if the difference is significant.
     """
-    data["value_difference"] = data["value"] - data["lagged_value"]
-    data["value_difference_error"] = np.where(
-        data["value_difference"].abs().gt(diff_error),
+    joined["value_difference"] = joined["value"] - joined["lagged_value"]
+    joined["value_difference_error"] = np.where(
+        joined["value_difference"].abs().gt(diff_error),
         True,
         False,
     )
-    return data
+    return joined
 
 
 def normalize_column_names(data: Sequence[pd.DataFrame], old: str, new: str) -> None:
-    """
+    """Switch column names between two patterns.
 
-    Args:
-        data:
-        old:
-        new:
-
-    Returns:
+    Used to normalize dataframes to a common column name pattern before a calculation
+    and to return them to the correct names afterward.
 
+    Args:
+        data: List of dataframes to be normalized.
+        old: The string to be looked for in the column names.
+        new: The replacement.
     """
     for df in data:
         for col in list(df):
@@ -376,17 +389,25 @@ def basic_calculations(
     variable: Variable,
     start_time: Union[datetime, str],
     end_time: Union[datetime, str],
-    minimum: float,
-    maximum: float,
+    minimum: Decimal,
+    maximum: Decimal,
 ):
-    """
-    Returns sub-hourly table with some calculations
-    It is used in main report for Validation interface and it is also called for "save_to_validated" function/request
-    """
-    # TODO: Is this needed?
-    minimum = float(minimum)
-    maximum = float(maximum)
+    """Returns sub-hourly tables with some validation information.
+
+    It is used in main report for Validation interface, and it is also called for
+    "save_to_validated" function/request.
+
+    Args:
+        station: Station of interest.
+        variable: Variable of interest.
+        start_time: Start time.
+        end_time: End time.
+        maximum: Maximum value expected for the variable.
+        minimum: Minimum value expected for the variable.
 
+    Returns:
+
+    """
     tx_period = DeltaT.objects.get(station__station_id=station.station_id).delta_t
 
     validated = validated_to_df(station, variable, start_time, end_time)