From 953aa0069bf61e652ea5fc527fa6a09264249c95 Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Tue, 14 May 2024 21:56:42 +0800 Subject: [PATCH 01/23] Add validation for multi-instance training job --- Form1.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Form1.cs b/Form1.cs index 0ab4d3d..02480b0 100644 --- a/Form1.cs +++ b/Form1.cs @@ -981,6 +981,10 @@ private void CalculateBatchSize() { idealBatchSize = -1; } + else if (!supportedInstances.Contains(instance)) + { + idealBatchSize = 16 * instanceCount; + } else { idealBatchSize = 16 * instanceCount * gpuCount; @@ -1047,6 +1051,12 @@ private bool ValidateTrainingParameters(string img_size, string batch_size, stri } } + if (!supportedInstances.Contains(selectedInstance)) + { + MessageBox.Show("Multi-instance training does not support instances with no GPU", "Validation Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + return false; + } + if (!Int32.TryParse(txtBatchSize.Text, out int batchSize)) { MessageBox.Show("Batch size must be an integer.", "Validation Error", MessageBoxButtons.OK, MessageBoxIcon.Error); From 556eeea48f347f38b8fd1146ca87693a8b0fb83b Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Tue, 14 May 2024 22:02:23 +0800 Subject: [PATCH 02/23] Fix validation for multi instance --- Form1.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Form1.cs b/Form1.cs index 02480b0..88a43b3 100644 --- a/Form1.cs +++ b/Form1.cs @@ -1051,7 +1051,7 @@ private bool ValidateTrainingParameters(string img_size, string batch_size, stri } } - if (!supportedInstances.Contains(selectedInstance)) + if (!supportedInstances.Contains(selectedInstance) && Int32.TryParse(instanceCount, out int instance) && instance > 1) { MessageBox.Show("Multi-instance training does not support instances with no GPU", "Validation Error", MessageBoxButtons.OK, MessageBoxIcon.Error); return false; From 46e05dbca7ec0b5c6d6f941d44d1a43f76606d09 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 06:09:46 +0800 Subject: [PATCH 03/23] Add detailed exception handler for `FailureReason` in SageMaker --- docker/yolov5-training/train_and_export.py | 23 +++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index dd82389..9dc25b6 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -147,9 +147,26 @@ def main(): if __name__ == "__main__": try: main() + except AssertionError as e: + with open("/opt/ml/output/failure", "w") as f: + instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." + f.write(str(e) + "\n" + instructions) + print(str(e)) + print(traceback.format_exc()) + sys.exit(1) except Exception as e: with open("/opt/ml/output/failure", "w") as f: - print(e) - f.write(str(e)) - f.write(traceback.format_exc()) + instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." + if "insufficient CUDA devices for DDP command" in str(e): + f.write("Insufficient/No CUDA devices for DDP Training.\n" + instructions) + print(str(e)) + print(traceback.format_exc()) + elif "CUDA out of memory" in str(e): + f.write("CUDA device out of memory.\n" + instructions) + print(str(e)) + print(traceback.format_exc()) + else: + f.write(str(e) + "\n" + instructions) + print(str(e)) + print(traceback.format_exc()) sys.exit(1) From b0f25d36455b3c675c7744bfbc9b3aeeec1b4609 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 06:31:42 +0800 Subject: [PATCH 04/23] Attempt capture error in a subprocess --- docker/yolov5-training/train_and_export.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 9dc25b6..7100ad2 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -35,10 +35,17 @@ def run_script(args, use_module=False): Returns: `None` """ - if use_module: - subprocess.run(["python3", "-m"] + args, check=True) - else: - subprocess.run(["python3"] + args, check=True) + try: + if use_module: + subprocess.run(["python3", "-m"] + args, check=True) + else: + subprocess.run(["python3"] + args, check=True) + except subprocess.CalledProcessError as e: + with open("/opt/ml/output/failure", "w") as f: + f.write(f"Error occurred in subprocess: {str(e)}") + print(str(e)) + print(traceback.format_exc()) + sys.exit(1) def parse_arguments(): parser = argparse.ArgumentParser( From ec91973c29285af8ca1102c8ff94bbfc276c0623 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 07:12:32 +0800 Subject: [PATCH 05/23] Set future revision to 11 --- LSC-Trainer.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LSC-Trainer.csproj b/LSC-Trainer.csproj index 89565f1..6b7f08b 100644 --- a/LSC-Trainer.csproj +++ b/LSC-Trainer.csproj @@ -23,7 +23,7 @@ false false true - 10 + 11 1.1.1.%2a false true From bcce19da4417ebec1f5916fd2c22a41b9b569fbb Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 07:58:14 +0800 Subject: [PATCH 06/23] Handle retrieval of specific subprocess error --- docker/yolov5-training/train_and_export.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 7100ad2..6144dfd 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -37,14 +37,22 @@ def run_script(args, use_module=False): """ try: if use_module: - subprocess.run(["python3", "-m"] + args, check=True) + result = subprocess.run(["python3", "-m"] + args, check=True, stderr=subprocess.PIPE) else: - subprocess.run(["python3"] + args, check=True) - except subprocess.CalledProcessError as e: + result = subprocess.run(["python3"] + args, check=True, stderr=subprocess.PIPE) + except (subprocess.CalledProcessError, AssertionError) as e: + error_message = e.stderr.decode('utf-8') if hasattr(e, 'stderr') else str(e) + instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." + error_message += "\n" + instructions with open("/opt/ml/output/failure", "w") as f: - f.write(f"Error occurred in subprocess: {str(e)}") - print(str(e)) - print(traceback.format_exc()) + if "FileNotFoundError" in error_message: + f.write(f"FileNotFoundError occurred in subprocess: {error_message}") + elif "AssertionError" in error_message: + f.write(f"AssertionError occurred in subprocess: {error_message}") + else: + f.write(f"Error occurred in subprocess: {str(e)}") + print(error_message) + print(traceback.format_exc()) sys.exit(1) def parse_arguments(): From c26abf02a630baf227aa82a4be2af71c9ad5a6ec Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 08:25:24 +0800 Subject: [PATCH 07/23] Add a debug when printing error message to see for actual message --- docker/yolov5-training/train_and_export.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 6144dfd..3cee082 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -50,9 +50,9 @@ def run_script(args, use_module=False): elif "AssertionError" in error_message: f.write(f"AssertionError occurred in subprocess: {error_message}") else: - f.write(f"Error occurred in subprocess: {str(e)}") - print(error_message) + f.write(f"Error occurred in subprocess: {error_message}") print(traceback.format_exc()) + print("Debug: " + error_message) sys.exit(1) def parse_arguments(): From 1337487531148458ca2e7d86cfd6456f9742d207 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 09:12:13 +0800 Subject: [PATCH 08/23] Remove redundant exceptions in main but capture in `run_script` instead --- docker/yolov5-training/train_and_export.py | 26 +--------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 3cee082..0c5b161 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -160,28 +160,4 @@ def main(): shutil.copy2("/opt/ml/output/data/results/weights/best.onnx", "/opt/ml/model/") if __name__ == "__main__": - try: - main() - except AssertionError as e: - with open("/opt/ml/output/failure", "w") as f: - instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." - f.write(str(e) + "\n" + instructions) - print(str(e)) - print(traceback.format_exc()) - sys.exit(1) - except Exception as e: - with open("/opt/ml/output/failure", "w") as f: - instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." - if "insufficient CUDA devices for DDP command" in str(e): - f.write("Insufficient/No CUDA devices for DDP Training.\n" + instructions) - print(str(e)) - print(traceback.format_exc()) - elif "CUDA out of memory" in str(e): - f.write("CUDA device out of memory.\n" + instructions) - print(str(e)) - print(traceback.format_exc()) - else: - f.write(str(e) + "\n" + instructions) - print(str(e)) - print(traceback.format_exc()) - sys.exit(1) + main() From d1c2f8942dc9d6a2d2fdcd6fe0997a8a15cad785 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 09:12:34 +0800 Subject: [PATCH 09/23] Attempt use disk as caching method for training/validating --- docker/yolov5-training/train_and_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 0c5b161..84fa04f 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -136,7 +136,7 @@ def main(): "--data", args.data, "--hyp", "/opt/ml/input/config/custom-hyps.yaml" if args.hyp == "Custom" else args.hyp, "--project", args.project, "--name", args.name, "--patience", args.patience, "--workers", args.workers, "--optimizer", args.optimizer, - "--device", args.device, "--cache", "--exist-ok", + "--device", args.device, "--cache", "disk", "--exist-ok", ] export_args = [ "/code/yolov5/export.py", "--img-size", args.img_size, From 41e7c343a080f31cd0c3a6ccd5185a6927a3b797 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 09:31:27 +0800 Subject: [PATCH 10/23] Re-enter subprocess' stdout --- docker/yolov5-training/train_and_export.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 84fa04f..174c020 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -37,9 +37,9 @@ def run_script(args, use_module=False): """ try: if use_module: - result = subprocess.run(["python3", "-m"] + args, check=True, stderr=subprocess.PIPE) + result = subprocess.run(["python3", "-m"] + args, check=True, stderr=subprocess.PIPE, stdout=sys.stdout) else: - result = subprocess.run(["python3"] + args, check=True, stderr=subprocess.PIPE) + result = subprocess.run(["python3"] + args, check=True, stderr=subprocess.PIPE, stdout=sys.stdout) except (subprocess.CalledProcessError, AssertionError) as e: error_message = e.stderr.decode('utf-8') if hasattr(e, 'stderr') else str(e) instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." From 555e749bf02b7ded418897bc1f4f3a0db9e0f478 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 09:47:07 +0800 Subject: [PATCH 11/23] Attempt maually gettting the error line only to be returned --- docker/yolov5-training/train_and_export.py | 33 ++++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 174c020..ea5f613 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -4,6 +4,7 @@ import json import sys import traceback +import re def get_hosts_and_node_rank(): """ @@ -37,23 +38,25 @@ def run_script(args, use_module=False): """ try: if use_module: - result = subprocess.run(["python3", "-m"] + args, check=True, stderr=subprocess.PIPE, stdout=sys.stdout) + subprocess.run(["python3", "-m"] + args, check=True) else: - result = subprocess.run(["python3"] + args, check=True, stderr=subprocess.PIPE, stdout=sys.stdout) - except (subprocess.CalledProcessError, AssertionError) as e: - error_message = e.stderr.decode('utf-8') if hasattr(e, 'stderr') else str(e) + subprocess.run(["python3"] + args, check=True) + except subprocess.CalledProcessError as e: instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." - error_message += "\n" + instructions - with open("/opt/ml/output/failure", "w") as f: - if "FileNotFoundError" in error_message: - f.write(f"FileNotFoundError occurred in subprocess: {error_message}") - elif "AssertionError" in error_message: - f.write(f"AssertionError occurred in subprocess: {error_message}") - else: - f.write(f"Error occurred in subprocess: {error_message}") - print(traceback.format_exc()) - print("Debug: " + error_message) - sys.exit(1) + with open("/opt/ml/output/failure", "w") as f: + error_message = str(e) + if "FileNotFoundError" in error_message: + error_line = re.search("FileNotFoundError.*", error_message).group() + f.write(f"FileNotFoundError occurred in subprocess: {error_line}\n{instructions}") + elif "AssertionError" in error_message: + error_line = re.search("AssertionError.*", error_message).group() + f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") + else: + f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") + + print(error_message) + print(traceback.format_exc()) + sys.exit(1) def parse_arguments(): parser = argparse.ArgumentParser( From fbcd129c03bdf3f64abca832f4310ddbc5655f1b Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 09:58:22 +0800 Subject: [PATCH 12/23] Fix unbound local error of error_message that was out of exception context --- docker/yolov5-training/train_and_export.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index ea5f613..4c65711 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -53,10 +53,9 @@ def run_script(args, use_module=False): f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") else: f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") - - print(error_message) - print(traceback.format_exc()) - sys.exit(1) + print(error_message) + print(traceback.format_exc()) + sys.exit(1) def parse_arguments(): parser = argparse.ArgumentParser( From c9a0aac02b934f759d1f4cbaa4afa63ed9de3c13 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 10:00:10 +0800 Subject: [PATCH 13/23] Move all context of opening failure file in exception context --- docker/yolov5-training/train_and_export.py | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 4c65711..ce899c2 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -43,19 +43,19 @@ def run_script(args, use_module=False): subprocess.run(["python3"] + args, check=True) except subprocess.CalledProcessError as e: instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." - with open("/opt/ml/output/failure", "w") as f: - error_message = str(e) - if "FileNotFoundError" in error_message: - error_line = re.search("FileNotFoundError.*", error_message).group() - f.write(f"FileNotFoundError occurred in subprocess: {error_line}\n{instructions}") - elif "AssertionError" in error_message: - error_line = re.search("AssertionError.*", error_message).group() - f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") - else: - f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") - print(error_message) - print(traceback.format_exc()) - sys.exit(1) + with open("/opt/ml/output/failure", "w") as f: + error_message = str(e) + if "FileNotFoundError" in error_message: + error_line = re.search("FileNotFoundError.*", error_message).group() + f.write(f"FileNotFoundError occurred in subprocess: {error_line}\n{instructions}") + elif "AssertionError" in error_message: + error_line = re.search("AssertionError.*", error_message).group() + f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") + else: + f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") + print(error_message) + print(traceback.format_exc()) + sys.exit(1) def parse_arguments(): parser = argparse.ArgumentParser( From aa6698d82ab26c26af8fe7cef340cb464b09a464 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 10:19:35 +0800 Subject: [PATCH 14/23] Attempt reusing stderr and stdout to output either error/output in subprocess --- docker/yolov5-training/train_and_export.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index ce899c2..6ccca44 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -38,13 +38,13 @@ def run_script(args, use_module=False): """ try: if use_module: - subprocess.run(["python3", "-m"] + args, check=True) + result = subprocess.run(["python3", "-m"] + args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: - subprocess.run(["python3"] + args, check=True) + result = subprocess.run(["python3"] + args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except subprocess.CalledProcessError as e: + error_message = e.stderr.decode('utf-8') # decode from bytes to string instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." with open("/opt/ml/output/failure", "w") as f: - error_message = str(e) if "FileNotFoundError" in error_message: error_line = re.search("FileNotFoundError.*", error_message).group() f.write(f"FileNotFoundError occurred in subprocess: {error_line}\n{instructions}") @@ -53,9 +53,12 @@ def run_script(args, use_module=False): f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") else: f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") - print(error_message) - print(traceback.format_exc()) - sys.exit(1) + print(error_message) + print(traceback.format_exc()) + sys.exit(1) + else: + output_message = result.stdout.decode('utf-8') # decode from bytes to string + print(output_message) def parse_arguments(): parser = argparse.ArgumentParser( From 9684656078f47f65da42e2861a5322ff651c66d3 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 10:32:23 +0800 Subject: [PATCH 15/23] Attempt fix by printing error_message incase the output was stored there --- docker/yolov5-training/train_and_export.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 6ccca44..810a0b3 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -47,18 +47,20 @@ def run_script(args, use_module=False): with open("/opt/ml/output/failure", "w") as f: if "FileNotFoundError" in error_message: error_line = re.search("FileNotFoundError.*", error_message).group() - f.write(f"FileNotFoundError occurred in subprocess: {error_line}\n{instructions}") + f.write(f"{error_line}\n{instructions}") elif "AssertionError" in error_message: error_line = re.search("AssertionError.*", error_message).group() - f.write(f"AssertionError occurred in subprocess: {error_line}\n{instructions}") + f.write(f"{error_line}\n{instructions}") else: - f.write(f"Error occurred in subprocess: {error_message}\n{instructions}") + f.write(f"{error_message}\n{instructions}") print(error_message) print(traceback.format_exc()) sys.exit(1) else: output_message = result.stdout.decode('utf-8') # decode from bytes to string + error_message = result.stderr.decode('utf-8') # decode from bytes to string print(output_message) + print(error_message) # print stderr in case the program writes its output to stderr def parse_arguments(): parser = argparse.ArgumentParser( From 7f68edbe9ef14ca0c58dfc2c8656ff721b27bc74 Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Wed, 15 May 2024 10:49:42 +0800 Subject: [PATCH 16/23] Show message boxes on upload error --- Functions/FileTransferUtility.cs | 91 +++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/Functions/FileTransferUtility.cs b/Functions/FileTransferUtility.cs index 7c4cc6f..8b25b1b 100644 --- a/Functions/FileTransferUtility.cs +++ b/Functions/FileTransferUtility.cs @@ -22,6 +22,8 @@ internal class FileTransferUtility : IFileTransferUtility private IUIUpdater UIUpdater { get; set; } private CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(); + + private bool isMessageBoxShown = false; public FileTransferUtility(IUIUpdater uIUpdater) { UIUpdater = uIUpdater; @@ -92,7 +94,7 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat using (TransferUtility transferUtility = new TransferUtility(s3Client)) { var uploadRequest = CreateUploadRequest(filePath, fileName, bucketName); - ConfigureProgressTracking(uploadRequest, progress, totalSize, UIUpdater,cancellationTokenSource.Token); + ConfigureProgressTracking(uploadRequest, progress, totalSize, UIUpdater, cancellationTokenSource.Token); await transferUtility.UploadAsync(uploadRequest, cancellationTokenSource.Token); @@ -101,14 +103,40 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat UIUpdater.UpdateTrainingStatus($"Uploading Files to S3", $"Uploading {totalUploaded}/{totalSize} - {overallPercentage}%"); } } - + LogUploadTime(startTime); return fileName; } catch (AmazonS3Exception e) { - LogError("Error uploading file to S3: ", e); + if (e.ErrorCode == "RequestTimeTooSkewed") + { + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: A file took too long to upload. The difference between the request time and the current time is too large.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + Console.WriteLine($"Error uploading file to S3: A file took too long to upload. The difference between the request time and the current time is too large."); + } + } + else + { + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: {e}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + LogError("Error uploading file to S3: ", e); + } + } + cancellationTokenSource.Cancel(); return null; } catch (OperationCanceledException e) @@ -118,7 +146,17 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat } catch (Exception e) { - LogError("Error uploading file to S3: ", e); + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: {e}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + LogError("Error uploading file to S3: ", e); + } + cancellationTokenSource.Cancel(); return null; } } @@ -164,12 +202,53 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, MemoryStream f } catch (AmazonS3Exception e) { - LogError("Error uploading file to S3: ", e); + if (e.ErrorCode == "RequestTimeTooSkewed") + { + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: A file took too long to upload. The difference between the request time and the current time is too large.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + Console.WriteLine($"Error uploading file to S3: A file took too long to upload. The difference between the request time and the current time is too large."); + } + } + else + { + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: {e}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + LogError("Error uploading file to S3: ", e); + } + } + cancellationTokenSource.Cancel(); + return null; + } + catch (OperationCanceledException e) + { + LogError("File Upload has been cancelled: ", e); return null; } catch (Exception e) { - LogError("Error uploading file to S3: ", e); + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error uploading file to S3: {e}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + LogError("Error uploading file to S3: ", e); + } + cancellationTokenSource.Cancel(); return null; } } From 28ae36d6ac02802560fe397cc91eccca0f947287 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 11:00:53 +0800 Subject: [PATCH 17/23] Revert back to not using stderr and stdout and simplify return instructions if error --- docker/yolov5-training/train_and_export.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/docker/yolov5-training/train_and_export.py b/docker/yolov5-training/train_and_export.py index 810a0b3..f3f1910 100644 --- a/docker/yolov5-training/train_and_export.py +++ b/docker/yolov5-training/train_and_export.py @@ -38,29 +38,15 @@ def run_script(args, use_module=False): """ try: if use_module: - result = subprocess.run(["python3", "-m"] + args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + subprocess.run(["python3", "-m"] + args, check=True) else: - result = subprocess.run(["python3"] + args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - except subprocess.CalledProcessError as e: - error_message = e.stderr.decode('utf-8') # decode from bytes to string + subprocess.run(["python3"] + args, check=True) + except Exception as e: instructions = "Please refer to your AWS Console Management -> SageMaker -> Training Jobs -> -> Monitor Section -> View Logs -> `/aws/sagemaker/TrainingJobs` Log group -> -> Select host `algo-1` for more information." with open("/opt/ml/output/failure", "w") as f: - if "FileNotFoundError" in error_message: - error_line = re.search("FileNotFoundError.*", error_message).group() - f.write(f"{error_line}\n{instructions}") - elif "AssertionError" in error_message: - error_line = re.search("AssertionError.*", error_message).group() - f.write(f"{error_line}\n{instructions}") - else: - f.write(f"{error_message}\n{instructions}") - print(error_message) + f.write(instructions) print(traceback.format_exc()) sys.exit(1) - else: - output_message = result.stdout.decode('utf-8') # decode from bytes to string - error_message = result.stderr.decode('utf-8') # decode from bytes to string - print(output_message) - print(error_message) # print stderr in case the program writes its output to stderr def parse_arguments(): parser = argparse.ArgumentParser( From b74219e4d6e76009737541ea497492b3218cc6a8 Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Wed, 15 May 2024 11:48:31 +0800 Subject: [PATCH 18/23] Handle amazon service exception --- Functions/FileTransferUtility.cs | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/Functions/FileTransferUtility.cs b/Functions/FileTransferUtility.cs index 8b25b1b..56afef2 100644 --- a/Functions/FileTransferUtility.cs +++ b/Functions/FileTransferUtility.cs @@ -11,6 +11,8 @@ using SharpCompress.Common; using SharpCompress.Readers; using System.Threading; +using System.Net; +using Amazon.Runtime; namespace LSC_Trainer.Functions { @@ -139,6 +141,29 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat cancellationTokenSource.Cancel(); return null; } + catch (AmazonServiceException e) + { + if (e.InnerException is WebException webEx && webEx.Status == WebExceptionStatus.NameResolutionFailure) + { + // Handle the NameResolutionFailure exception + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error in Tracking Training Job: Failed to resolve the hostname. Please check your network connection and the hostname.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + Console.WriteLine($"Error in Tracking Training Job: Failed to resolve the hostname. Please check your network connection and the hostname."); + } + } + else + { + LogError("Error uploading file to S3: An error occurred within the AWS SDK.", e); + } + cancellationTokenSource.Cancel(); + return null; + } catch (OperationCanceledException e) { LogError("File Upload has been cancelled: ", e); @@ -149,7 +174,7 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat if (!isMessageBoxShown) { isMessageBoxShown = true; - MessageBox.Show($"Error uploading file to S3: {e}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + MessageBox.Show($"Error uploading file to S3: {e.Message}", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); isMessageBoxShown = false; } else @@ -231,6 +256,29 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, MemoryStream f cancellationTokenSource.Cancel(); return null; } + catch (AmazonServiceException e) + { + if (e.InnerException is WebException webEx && webEx.Status == WebExceptionStatus.NameResolutionFailure) + { + // Handle the NameResolutionFailure exception + if (!isMessageBoxShown) + { + isMessageBoxShown = true; + MessageBox.Show($"Error in uploading file to S3: Failed to resolve the hostname. Please check your network connection and the hostname.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + isMessageBoxShown = false; + } + else + { + Console.WriteLine($"Error in uploading file to S3: Failed to resolve the hostname. Please check your network connection and the hostname."); + } + } + else + { + LogError("Error uploading file to S3: An error occurred within the AWS SDK.", e); + } + cancellationTokenSource.Cancel(); + return null; + } catch (OperationCanceledException e) { LogError("File Upload has been cancelled: ", e); @@ -254,7 +302,7 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, MemoryStream f } /// - /// Extracts the contents of a ZIP file into Memory Stream and uploads them to Amazon S3 asynchronously. + /// Gets the contents of a folder and uploads them to Amazon S3 asynchronously. /// /// The Amazon S3 client instance. /// The name of the S3 bucket where the files will be uploaded. From 7bdca257dae743acc23d257ea65732c23e571491 Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Wed, 15 May 2024 11:48:56 +0800 Subject: [PATCH 19/23] Show upload complete msg box on compleete --- Form1.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Form1.cs b/Form1.cs index 88a43b3..e76e97d 100644 --- a/Form1.cs +++ b/Form1.cs @@ -460,7 +460,8 @@ private void backgroundWorker_ProgressChanged(object sender, System.ComponentMod /// An instance of RunWorkerCompletedEventArgs containing event data. private void backgroundWorker_RunWorkerCompleted(object sender, System.ComponentModel.RunWorkerCompletedEventArgs e) { - MessageBox.Show("Upload completed!"); + if (progressBar.Value >= 100) + MessageBox.Show("Upload completed!"); progressBar.Value = 0; mainPanel.Enabled = true; logPanel.Enabled = true; From b7fd92745590bbc022098c5b219680594fdd6e59 Mon Sep 17 00:00:00 2001 From: Jhonray Acojedo Date: Wed, 15 May 2024 11:52:29 +0800 Subject: [PATCH 20/23] Fix error msg mistake --- Functions/FileTransferUtility.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Functions/FileTransferUtility.cs b/Functions/FileTransferUtility.cs index 56afef2..eceb715 100644 --- a/Functions/FileTransferUtility.cs +++ b/Functions/FileTransferUtility.cs @@ -149,12 +149,12 @@ public async Task UploadFileToS3(AmazonS3Client s3Client, string filePat if (!isMessageBoxShown) { isMessageBoxShown = true; - MessageBox.Show($"Error in Tracking Training Job: Failed to resolve the hostname. Please check your network connection and the hostname.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); + MessageBox.Show($"Error in uploading file to S3: Failed to resolve the hostname. Please check your network connection and the hostname.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error); isMessageBoxShown = false; } else { - Console.WriteLine($"Error in Tracking Training Job: Failed to resolve the hostname. Please check your network connection and the hostname."); + Console.WriteLine($"Error in uploading file to S3: Failed to resolve the hostname. Please check your network connection and the hostname."); } } else From c1dbd4673e25c6f5c3a2c55307984747ecbc0671 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 13:13:21 +0800 Subject: [PATCH 21/23] Set future revision to 12 --- LSC-Trainer.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LSC-Trainer.csproj b/LSC-Trainer.csproj index 6b7f08b..ba6738b 100644 --- a/LSC-Trainer.csproj +++ b/LSC-Trainer.csproj @@ -23,7 +23,7 @@ false false true - 11 + 12 1.1.1.%2a false true From d62510dca7120437a71abc6f44da31559e6424bb Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 13:19:51 +0800 Subject: [PATCH 22/23] Remove unused `totalUploaded` --- Functions/AWS_Helper.cs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Functions/AWS_Helper.cs b/Functions/AWS_Helper.cs index 9e00cff..e8bfe39 100644 --- a/Functions/AWS_Helper.cs +++ b/Functions/AWS_Helper.cs @@ -28,11 +28,6 @@ namespace LSC_Trainer.Functions /// public class AWS_Helper { - /// - /// Represents the total size of data uploaded. - /// - private static long totalUploaded = 0; - /// /// Validates the provided access key ID by retrieving the username associated with it using the IAM client. /// Updates the UserConnectionInfo.UserName property with the retrieved username if the key is valid. From 95f4dac7194e22e012a5fa25ab90d36ded311886 Mon Sep 17 00:00:00 2001 From: kerrlabajo Date: Wed, 15 May 2024 15:46:37 +0800 Subject: [PATCH 23/23] Restructure to accept dynamic main env vars --- Form1.cs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Form1.cs b/Form1.cs index e76e97d..86fc463 100644 --- a/Form1.cs +++ b/Form1.cs @@ -115,10 +115,11 @@ public MainForm(bool development) UserConnectionInfo.Region = Environment.GetEnvironmentVariable("REGION"); UserConnectionInfo.RoleArn = Environment.GetEnvironmentVariable("ROLE_ARN"); UserConnectionInfo.EcrUri = Environment.GetEnvironmentVariable("INTELLISYS_ECR_URI"); - UserConnectionInfo.SagemakerBucket = Environment.GetEnvironmentVariable("SAGEMAKER_BUCKET"); - UserConnectionInfo.DefaultDatasetURI = Environment.GetEnvironmentVariable("DEFAULT_DATASET_URI"); - UserConnectionInfo.CustomUploadsURI = Environment.GetEnvironmentVariable("CUSTOM_UPLOADS_URI"); - UserConnectionInfo.DestinationURI = Environment.GetEnvironmentVariable("DESTINATION_URI"); + + UserConnectionInfo.SagemakerBucket = $"sagemaker-{UserConnectionInfo.Region}-{UserConnectionInfo.AccountId}"; + UserConnectionInfo.DefaultDatasetURI = $"s3://{UserConnectionInfo.SagemakerBucket}/default-datasets/MMX059XA_COVERED5B/"; + UserConnectionInfo.CustomUploadsURI = $"s3://{UserConnectionInfo.SagemakerBucket}/users/{UserConnectionInfo.UserName}/custom-uploads/"; + UserConnectionInfo.DestinationURI = $"s3://{UserConnectionInfo.SagemakerBucket}/users/{UserConnectionInfo.UserName}/training-jobs/"; MessageBox.Show("Established Connection using ENV for Development", "Success", MessageBoxButtons.OK, MessageBoxIcon.Information); } else if (!development && UserConnectionInfo.AccountId == null && UserConnectionInfo.AccessKey == null && UserConnectionInfo.SecretKey == null && UserConnectionInfo.Region == null && UserConnectionInfo.RoleArn == null)