feat: refactor and complete readme
MDavidson17 committed Apr 3, 2023
1 parent 51e9386 commit 60421b2
Showing 2 changed files with 164 additions and 110 deletions.
42 changes: 37 additions & 5 deletions tools/README.md
@@ -1,16 +1,48 @@
# Tools

-This folder contains single use scripts which have been used to assist in running argo workflows.
-The scripts are stored in this folder if it is thought they may become useful again in the future.
This folder contains single-use scripts which have been used to assist with various Argo tasks.
The scripts should be stored in this folder if they may become useful again in the future.

## generate-argo-commands-imagery.py

**Date:** 14/02/2023

**Related Jira Tickets:** [TDE-632](https://toitutewhenua.atlassian.net/jira/software/c/projects/TDE/boards/768/backlog?atlOrigin=eyJpIjoiNjVkNmMyNmNmNGJlNDIzOGI2YmIyMzViNzVkNDUwZjEiLCJwIjoiaiJ9); [TDE-631](https://toitutewhenua.atlassian.net/browse/TDE-631?atlOrigin=eyJpIjoiNDI5OGE5MGY5ZmUxNGUyNzkwZjdlYTcxOTg5ZmQ0MGUiLCJwIjoiaiJ9)

-**Description:** This script was generated to allow for the processing of numerous imagery datasets using the argo cli.
**Description:**
This script sets up the automated processing of numerous imagery datasets using the Argo CLI.

-**Additional Resources/links:**
**Setup:**

-- [CSV](https://linzsrm.sharepoint.com/:x:/r/sites/Topography/_layouts/15/Doc.aspx?sourcedoc=%7B508567E2-EF88-458B-9115-0FC719CAA540%7D&file=imagery-standardising-parameters-bulk-process.xlsx&action=default&mobileredirect=true)
Download the [parameters csv](https://linzsrm.sharepoint.com/:x:/r/sites/Topography/_layouts/15/Doc.aspx?sourcedoc=%7B508567E2-EF88-458B-9115-0FC719CAA540%7D&file=imagery-standardising-parameters-bulk-process.xlsx&action=default&mobileredirect=true) from SharePoint and store it as `imagery-standardising-parameters-bulk-process.csv` in `./tools/`.
_nb: you may have to convert this from xlsx to csv; this can be done in many places [online](https://cloudconvert.com/xlsx-to-csv), or locally as sketched below._
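
For example, a minimal local conversion sketch (assuming `pandas` and `openpyxl` are installed, and that you run it from the repository root):

```python
import pandas as pd

# read the SharePoint export and write the csv where the script expects it
pd.read_excel("imagery-standardising-parameters-bulk-process.xlsx").to_csv(
    "./tools/imagery-standardising-parameters-bulk-process.csv", index=False
)
```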

**Instructions:**

1. Update the `SOURCE` variable in `generate-argo-cli-commands.py`
2. Run:

```bash
cd ./tools
python3 generate-argo-cli-commands.py > log.txt
```
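
The log output is worth checking for skipped datasets; for example (assuming the structured logs were redirected to `log.txt` as above):

```bash
grep "skipped" log.txt
```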

**Output:**

- **region-year-scale.yaml:** workflow parameters for each dataset
- **standardise-publish.sh:** bash script to 'deploy' argo workflows
- **standardise-publish-import.sh:** bash script to 'deploy' argo workflows that also require a basemaps import
- **log.txt:** contains important logs about skipped datasets
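
For illustration, a generated parameters file might look like the following (hypothetical values; keys and quoting follow `_write_params` in the script below):

```yaml
"source": "s3://linz-raster-data-store/otago/otago_2017-2019_0.3m/"
"target": "s3://linz-imagery/otago/otago_2017-2019_0.3m/rgb/2193/"
"title": "Otago 0.3m Rural Aerial Photos (2017-2019)"
"description": "Orthophotography within the Otago region captured in the 2017-2019 flying season."
"start-datetime": "2017-11-01"
"end-datetime": "2019-04-30"
"licensor": "Toitū Te Whenua Land Information New Zealand"
"licensor-list": ""
"producer": "Aerial Surveys"
"scale": "300"
```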

**Submitting:**
`standardise-publish.sh` is set up and ready to go; just run:

```bash
sh standardise-publish.sh
```
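
To check what was submitted, you can list the workflows (assuming the same cluster and namespace as the submit commands):

```bash
argo list -n argo
```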

If created, `standardise-publish-import.sh` will require you to uncomment some lines in `standardising-publish-import.yaml`, then run:

```bash
sh standardise-publish-import.sh
```
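
For reference, each generated script is a series of `argo submit` calls built from the template in `generate-argo-cli-commands.py`; a sketch with a hypothetical dataset name:

```bash
#!/bin/bash

argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./otago-2017-2019-0-3m.yaml --generate-name ispi-otago-2017-2019-0-3m-
```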
232 changes: 127 additions & 105 deletions tools/generate-argo-cli-commands.py
@@ -1,14 +1,16 @@
import csv
-from typing import List, Optional
from typing import Dict, List, Optional

import yaml
from linz_logger import get_log

-# nb: CHANGE if working from a different source
-# SOURCE = "s3://linz-data-lake-raster-prod/"
# #######################################
# USER PARAMETERS:
SOURCE = "s3://linz-raster-data-store/"

PARAMETERS_CSV = "./imagery-standardising-parameters-bulk-process.csv"
# #######################################

# read in enums from workflow template
with open("../workflows/imagery/standardising.yaml", "r") as f:
    workflow = yaml.load(f, Loader=yaml.loader.SafeLoader)
    for parameter in workflow["spec"]["arguments"]["parameters"]:
@@ -19,9 +21,6 @@
        if parameter["name"] == "scale":
            SCALES = parameter["enum"]

-spi_list = []
-sp_list = []


def _format_date(date: str) -> str:
    fd_lst = date.split("/")
@@ -57,137 +56,159 @@ def _validate_licensor(licensor: str) -> Optional[str]:
    return None


-def _validate_producer(producer: str) -> Optional[str]:
# map the licensor CSV field onto the "licensor"/"licensor-list" workflow
# parameters; returns {} (and logs a warning) when the licensor is invalid
def _add_licensor(row: List[str], index: Dict[str, int]) -> Dict[str, str]:
    licensor = _validate_licensor(row[index["licensor"]])
    if not licensor:
        get_log().warning(
            "skipped: invalid licensor",
            licensor=row[index["licensor"]],
            source=row[index["source"]],
            title=row[index["title"]],
        )
        return {}
    elif licensor and ";" in licensor:
        return {"licensor-list": licensor, "licensor": ""}
    else:
        return {"licensor": licensor, "licensor-list": ""}


def _get_valid_producer(producer: str) -> Dict[str, str]:
    if producer in PRODUCERS:
-        return producer
        return {"producer": producer}
    elif producer == "NZ Aerial Mapping Ltd":
-        return "NZ Aerial Mapping"
        return {"producer": "NZ Aerial Mapping"}
    elif producer == "Aerial Surveys Ltd" or producer == "Aerial Surveys Limited":
-        return "Aerial Surveys"
        return {"producer": "Aerial Surveys"}
    elif producer == "AAM NZ Limited":
-        return "AAM NZ"
        return {"producer": "AAM NZ"}
    elif producer == "Landpro Ltd":
-        return "Landpro"
        return {"producer": "Landpro"}
    elif producer == "UAV Mapping NZ Ltd":
-        return "UAV Mapping NZ"
-    return None
        return {"producer": "UAV Mapping NZ"}
    return {}


-def _validate_scale(scale: str) -> Optional[str]:
def _get_valid_scale(scale: str) -> Dict[str, str]:
    if scale in SCALES:
-        return scale
-    return None
        return {"scale": scale}
    return {}


# map CSV column names to positional indices so rows can be read by field name
def _index_csv(header: List[str]) -> Dict[str, int]:
    ind = {}
    ind["comment"] = header.index("Comment")
    ind["source"] = header.index("source")
    ind["target"] = header.index("target")
    ind["scale"] = header.index("scale")
    ind["title"] = header.index("Title")
    ind["licensor"] = header.index("licensor(s)")
    ind["producer"] = header.index("producer(s)")
    ind["description"] = header.index("description")
    ind["startdate"] = header.index("start_datetime")
    ind["enddate"] = header.index("end_datetime")
    ind["basemaps"] = header.index("basemaps s3 path")
    return ind


# extra workflow parameters for datasets that also need a basemaps import
def _add_bm_params(target: str, row: List[str], index: Dict[str, int]) -> Dict[str, str]:
    get_log().info(
        "basemaps import required",
        source=row[index["source"]],
        title=row[index["title"]],
    )
    return {
        "category": "Urban Aerial Photos",
        "name": target.removesuffix("/rgb/2193/").split("/")[-1],
        "tile-matrix": "NZTM2000Quad/WebMercatorQuad",
        "blend": "20",
        "aligned-level": "6",
        "create-pull-request": "true",
    }


# a row is usable only if licensor, producer and scale all validated; the
# licensor warning is logged in _add_licensor, the others here (the keys may
# be absent entirely when a validator returned {}, hence .get())
def _validate_params(params: Dict[str, str], row: List[str], index: Dict[str, int]) -> bool:
    if not params.get("licensor") and not params.get("licensor-list"):
        return False
    if not params.get("scale"):
        get_log().warning(
            "skipped: invalid scale",
            scale=row[index["scale"]],
            source=row[index["source"]],
            title=row[index["title"]],
        )
        return False
    if not params.get("producer"):
        get_log().warning(
            "skipped: invalid producer",
            producer=row[index["producer"]],
            source=row[index["source"]],
            title=row[index["title"]],
        )
        return False
    return True


# write the workflow parameters to ./<file>.yaml; every value is quoted so
# argo treats them all as strings
def _write_params(params: Dict[str, str], file: str) -> None:
    with open(f"./{file}.yaml", "w", encoding="utf-8") as output:
        yaml.dump(
            params,
            output,
            default_flow_style=False,
            default_style='"',
            sort_keys=False,
            allow_unicode=True,
            width=1000,
        )


def main() -> None:
    spi_list = []
    sp_list = []

    # {0} is the parameter file passed via -f; {1} suffixes --generate-name
    command = "argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./{0}.yaml --generate-name ispi-{1}-\n"

    with open(PARAMETERS_CSV, "r") as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader)

-        ind_comment = header.index("Comment")
-        ind_source = header.index("source")
-        ind_target = header.index("target")
-        ind_scale = header.index("scale")
-        ind_title = header.index("Title")
-        ind_licensor = header.index("licensor(s)")
-        ind_producer = header.index("producer(s)")
-        ind_description = header.index("description")
-        ind_startdate = header.index("start_datetime")
-        ind_enddate = header.index("end_datetime")
-        ind_basemaps = header.index("basemaps s3 path")

-        command = "argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./{0}.yaml --generate-name ispi-{1}-\n"
        index = _index_csv(header)

        for row in reader:
-            if not row[ind_source].startswith(SOURCE):
            if not row[index["source"]].startswith(SOURCE):
                continue

-            if row[ind_comment] != "":
            if row[index["comment"]] != "":
                get_log().warning(
                    "skipped: comment",
-                    comment=row[ind_comment],
-                    source=row[ind_source],
-                    title=row[ind_title],
                    comment=row[index["comment"]],
                    source=row[index["source"]],
                    title=row[index["title"]],
                )
                continue

            # strip the "/rgb/2193/" suffix, then take the last path segment as the dataset name
            file_name = row[index["target"]].removesuffix("/rgb/2193/").split("/")[-1]
            formatted_file_name = file_name.replace("_", "-").replace(".", "-")

            params = {
-                "source": row[ind_source].rstrip("/") + "/",
-                "target": row[ind_target],
-                "scale": _validate_scale(row[ind_scale]),
-                "title": row[ind_title],
-                "description": row[ind_description],
-                "producer": _validate_producer(row[ind_producer]),
-                "start-datetime": _format_date(row[ind_startdate]),
-                "end-datetime": _format_date(row[ind_enddate]),
                "source": row[index["source"]].rstrip("/") + "/",
                "target": row[index["target"]],
                "title": row[index["title"]],
                "description": row[index["description"]],
                "start-datetime": _format_date(row[index["startdate"]]),
                "end-datetime": _format_date(row[index["enddate"]]),
            }

-            licensor = _validate_licensor(row[ind_licensor])
-            if licensor and ";" in licensor:
-                params["licensor-list"] = licensor
-                params["licensor"] = ""
-            else:
-                params["licensor"] = licensor
-                params["licensor-list"] = ""
            # merge in the validated fields; a validator that returns {} leaves
            # its key out of params, which _validate_params picks up below
            params = {**params, **_add_licensor(row, index)}
            params = {**params, **_get_valid_producer(row[index["producer"]])}
            params = {**params, **_get_valid_scale(row[index["scale"]])}

-            if not params["licensor"] and params["licensor-list"] == "":
-                get_log().warning(
-                    "skipped: invalid licensor",
-                    licensor=row[ind_licensor],
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                continue

-            if not params["producer"]:
-                get_log().warning(
-                    "skipped: invalid producer",
-                    producer=row[ind_producer],
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
            if not _validate_params(params, row, index):
                continue

-            if not params["scale"]:
-                get_log().warning(
-                    "skipped: invalid scale",
-                    scale=f"{row[ind_scale]}",
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                continue

-            file_name = row[ind_target].rstrip("/rgb/2193/").split("/")[-1]
-            formatted_file_name = file_name.replace("_", "-").replace(".", "-")

-            if row[ind_basemaps] == "":
-                get_log().info(
-                    "basemaps import required",
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                bm_params = {
-                    "category": "Urban Aerial Photos",
-                    "name": params["target"].rstrip("/rgb/2193/").split("/")[-1],
-                    "tile-matrix": "NZTM2000Quad/WebMercatorQuad",
-                    "blend": "20",
-                    "aligned-level": "6",
-                    "create-pull-request": "true"
-                }
-                params = {**params, **bm_params}
            if row[index["basemaps"]] == "":
                params = {**params, **_add_bm_params(params["target"], row, index)}
                spi_list.append(command.format(formatted_file_name, formatted_file_name))
            else:
                sp_list.append(command.format(formatted_file_name, formatted_file_name))

-            with open(f"./{formatted_file_name}.yaml", "w", encoding="utf-8") as output:
-                yaml.dump(
-                    params,
-                    output,
-                    default_flow_style=False,
-                    default_style='"',
-                    sort_keys=False,
-                    allow_unicode=True,
-                    width=1000,
-                )
            _write_params(params, formatted_file_name)

with open("standardise-publish.sh", "w") as script:
script.write("#!/bin/bash\n\n")
@@ -197,4 +218,5 @@ def main() -> None:
        script.write("#!/bin/bash\n\n")
        script.writelines(spi_list)


main()
