Merge pull request #62 from kerrlabajo/feat/docker-enhancement
Allow a generic dataset YAML config to support different types of datasets regardless of structure
kerrlabajo authored May 16, 2024
2 parents 95f4dac + cd1172e commit d210ebb
Showing 2 changed files with 33 additions and 4 deletions.
3 changes: 3 additions & 0 deletions docker/scripts/pull_build_push.sh
@@ -76,6 +76,9 @@ NEW_TAG="${VERSION}${TAG_BASE}"
# Authenticate Docker to ECR
aws ecr get-login-password --region ${AWS_REGION} | sudo docker login --username AWS --password-stdin ${ECR_URL}

# Check if the repository exists, if not create it
aws ecr describe-repositories --repository-names ${DOCKER_IMAGE} > /dev/null 2>&1 || aws ecr create-repository --repository-name ${DOCKER_IMAGE} > /dev/null 2>&1

# Build and push the image
sudo docker build -t ${ECR_URL}/${DOCKER_IMAGE}:${NEW_TAG} -f ../yolov5-training/Dockerfile ../yolov5-training
sudo docker push ${ECR_URL}/${DOCKER_IMAGE}:${NEW_TAG}
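As context (not part of the commit), a minimal Python sketch of the same check-then-create idea using boto3; the repository name and region below are illustrative placeholders:

import boto3

# Sketch only: ensure an ECR repository exists before pushing, mirroring the
# `describe-repositories || create-repository` fallback in the shell script.
# The repository name and region are placeholders, not values from the script.
ecr = boto3.client("ecr", region_name="us-east-1")
repo_name = "yolov5-training"

try:
    ecr.describe_repositories(repositoryNames=[repo_name])
except ecr.exceptions.RepositoryNotFoundException:
    ecr.create_repository(repositoryName=repo_name)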
34 changes: 30 additions & 4 deletions docker/yolov5-training/configure_dataset.py
@@ -1,6 +1,7 @@
import yaml
import argparse
import os
import glob

# Define the argument parser
parser = argparse.ArgumentParser(description='Configure dataset')
@@ -15,15 +16,40 @@
# Define the file path
FILE_PATH = args.dataset_config_path

# Define the new paths
NEW_PATH = f"/opt/ml/input/data/{DATASET_NAME}"
NEW_TRAIN = f"{NEW_PATH}/images/train"
NEW_VAL = f"{NEW_PATH}/images/train"
# Get the directory from the dataset_config_path
dir_path = os.path.dirname(args.dataset_config_path)

# Check if DATASET_NAME.yaml exists
if not os.path.isfile(FILE_PATH):
    # If not, find any .yaml file in the current directory
    yaml_files = glob.glob(os.path.join(dir_path, '*.yaml'))
    if yaml_files:
        # Rename the first .yaml file to DATASET_NAME.yaml
        os.rename(yaml_files[0], FILE_PATH)
    else:
        raise FileNotFoundError("No .yaml file found to rename")

# Open and load the YAML file
with open(FILE_PATH, 'r') as file:
    data = yaml.safe_load(file)

# Check if DATASET_NAME is in the train and val paths
if DATASET_NAME in data['train'] and DATASET_NAME in data['val']:
    # Extract subdirectories after the dataset name in the original paths
    train_subdirs = data['train'].split(DATASET_NAME, 1)[1]
    val_subdirs = data['val'].split(DATASET_NAME, 1)[1]
elif data['train'].startswith('..') and data['val'].startswith('..'):
    # Remove the '..' from the original paths
    train_subdirs = data['train'][2:]
    val_subdirs = data['val'][2:]
else:
    raise ValueError("Invalid format for train or val paths")

# Define the new paths
NEW_PATH = f"/opt/ml/input/data/{DATASET_NAME}"
NEW_TRAIN = f"{NEW_PATH}{train_subdirs}".replace('\\', '/')
NEW_VAL = f"{NEW_PATH}{val_subdirs}".replace('\\', '/')

# Modify the values
data['path'] = NEW_PATH
data['train'] = NEW_TRAIN
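For illustration only (not part of the commit), a standalone sketch of how the new path rewriting behaves; the dataset name and sample paths below are made up:

# Standalone sketch of the path-rewriting rules added above; "my-dataset"
# and the sample paths are illustrative only.
DATASET_NAME = "my-dataset"
NEW_PATH = f"/opt/ml/input/data/{DATASET_NAME}"

def rewrite(path):
    if DATASET_NAME in path:
        # Keep whatever follows the dataset name, e.g. "/images/train"
        subdirs = path.split(DATASET_NAME, 1)[1]
    elif path.startswith('..'):
        # Drop the leading '..' from relative paths like "../train/images"
        subdirs = path[2:]
    else:
        raise ValueError("Invalid format for train or val paths")
    return f"{NEW_PATH}{subdirs}".replace('\\', '/')

print(rewrite("my-dataset/images/train"))  # /opt/ml/input/data/my-dataset/images/train
print(rewrite("../valid/images"))          # /opt/ml/input/data/my-dataset/valid/images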
